From baed0f603f60517443279e085c092feb3ed254c3 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Thu, 5 Sep 2024 05:26:07 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 76019 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 76414 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..7ee12543 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-08-28T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.15992v1","updated":"2024-08-28T17:58:39Z","published":"2024-08-28T17:58:39Z","title":"CoGen: Learning from Feedback with Coupled Comprehension and Generation","summary":" Systems with both language comprehension and generation capabilities can\nbenefit from the tight connection between the two. This work studies coupling\ncomprehension and generation with focus on continually learning from\ninteraction with users. We propose techniques to tightly integrate the two\ncapabilities for both learning and inference. We situate our studies in\ntwo-player reference games, and deploy various models for thousands of\ninteractions with human users, while learning from interaction feedback\nsignals. We show dramatic improvements in performance over time, with\ncomprehension-generation coupling leading to performance improvements up to 26%\nin absolute terms and up to 17% higher accuracies compared to a non-coupled\nsystem. Our analysis also shows coupling has substantial qualitative impact on\nthe system's language, making it significantly more human-like.\n","authors":["Mustafa Omer Gul","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2408.15992v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15971v1","updated":"2024-08-28T17:43:55Z","published":"2024-08-28T17:43:55Z","title":"BattleAgentBench: A Benchmark for Evaluating Cooperation and Competition\n Capabilities of Language Models in Multi-Agent Systems","summary":" Large Language Models (LLMs) are becoming increasingly powerful and capable\nof handling complex tasks, e.g., building single agents and multi-agent\nsystems. Compared to single agents, multi-agent systems have higher\nrequirements for the collaboration capabilities of language models. Many\nbenchmarks are proposed to evaluate their collaborative abilities. However,\nthese benchmarks lack fine-grained evaluations of LLM collaborative\ncapabilities. Additionally, multi-agent collaborative and competitive scenarios\nare ignored in existing works. To address these two problems, we propose a\nbenchmark, called BattleAgentBench, which defines seven sub-stages of three\nvarying difficulty levels and conducts a fine-grained evaluation of language\nmodels in terms of single-agent scenario navigation capabilities, paired-agent\ntask execution abilities, and multi-agent collaboration and competition\ncapabilities. We conducted extensive evaluations on leading four closed-source\nand seven open-source models. Experimental results indicate that API-based\nmodels perform excellently on simple tasks but open-source small models\nstruggle with simple tasks. 
Regarding difficult tasks that require\ncollaborative and competitive abilities, although API-based models have\ndemonstrated some collaborative capabilities, there is still enormous room for\nimprovement.\n","authors":["Wei Wang","Dan Zhang","Tao Feng","Boyan Wang","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2408.15971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15966v1","updated":"2024-08-28T17:38:44Z","published":"2024-08-28T17:38:44Z","title":"More Text, Less Point: Towards 3D Data-Efficient Point-Language\n Understanding","summary":" Enabling Large Language Models (LLMs) to comprehend the 3D physical world\nremains a significant challenge. Due to the lack of large-scale 3D-text pair\ndatasets, the success of LLMs has yet to be replicated in 3D understanding. In\nthis paper, we rethink this issue and propose a new task: 3D Data-Efficient\nPoint-Language Understanding. The goal is to enable LLMs to achieve robust 3D\nobject understanding with minimal 3D point cloud and text data pairs. To\naddress this task, we introduce GreenPLM, which leverages more text data to\ncompensate for the lack of 3D data. First, inspired by using CLIP to align\nimages and text, we utilize a pre-trained point cloud-text encoder to map the\n3D point cloud space to the text space. This mapping leaves us to seamlessly\nconnect the text space with LLMs. Once the point-text-LLM connection is\nestablished, we further enhance text-LLM alignment by expanding the\nintermediate text space, thereby reducing the reliance on 3D point cloud data.\nSpecifically, we generate 6M free-text descriptions of 3D objects, and design a\nthree-stage training strategy to help LLMs better explore the intrinsic\nconnections between different modalities. To achieve efficient modality\nalignment, we design a zero-parameter cross-attention module for token pooling.\nExtensive experimental results show that GreenPLM requires only 12% of the 3D\ntraining data used by existing state-of-the-art models to achieve superior 3D\nunderstanding. Remarkably, GreenPLM also achieves competitive performance using\ntext-only data. The code and weights are available at:\nhttps://github.com/TangYuan96/GreenPLM.\n","authors":["Yuan Tang","Xu Han","Xianzhi Li","Qiao Yu","Jinfeng Xu","Yixue Hao","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10260v2","updated":"2024-08-28T17:26:03Z","published":"2024-06-11T01:16:10Z","title":"Flextron: Many-in-One Flexible Large Language Model","summary":" Training modern LLMs is extremely resource intensive, and customizing them\nfor various deployment scenarios characterized by limited compute and memory\nresources through repeated training is impractical. In this paper, we introduce\nFlextron, a network architecture and post-training model optimization framework\nsupporting flexible model deployment. The Flextron architecture utilizes a\nnested elastic structure to rapidly adapt to specific user-defined latency and\naccuracy targets during inference with no additional fine-tuning required. It\nis also input-adaptive, and can automatically route tokens through its\nsub-networks for improved performance and efficiency. We present a\nsample-efficient training method and associated routing algorithms for\nsystematically transforming an existing trained LLM into a Flextron model. 
We\nevaluate Flextron on the GPT-3 and LLama-2 family of LLMs, and demonstrate\nsuperior performance over multiple end-to-end trained variants and other\nstate-of-the-art elastic networks, all with a single pretraining run that\nconsumes a mere 7.63% tokens compared to original pretraining.\n","authors":["Ruisi Cai","Saurav Muralidharan","Greg Heinrich","Hongxu Yin","Zhangyang Wang","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2406.10260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15915v1","updated":"2024-08-28T16:28:07Z","published":"2024-08-28T16:28:07Z","title":"Leveraging Open Knowledge for Advancing Task Expertise in Large Language\n Models","summary":" The cultivation of expertise for large language models (LLMs) to solve tasks\nof specific areas often requires special-purpose tuning with calibrated\nbehaviors on the expected stable outputs. To avoid huge cost brought by manual\npreparation of instruction datasets and training resources up to hundreds of\nhours, the exploitation of open knowledge including a wealth of low rank\nadaptation (LoRA) models and instruction datasets serves as a good starting\npoint. However, existing methods on model and data selection focus on the\nperformance of general-purpose capabilities while neglecting the knowledge gap\nexposed in domain-specific deployment. In the present study, we propose to\nbridge such gap by introducing few human-annotated samples (i.e., K-shot) for\nadvancing task expertise of LLMs with open knowledge. Specifically, we develop\nan efficient and scalable pipeline to cost-efficiently produce task experts\nwhere K-shot data intervene in selecting the most promising expert candidates\nand the task-relevant instructions. A mixture-of-expert (MoE) system is built\nto make the best use of individual-yet-complementary knowledge between multiple\nexperts. We unveil the two keys to the success of a MoE system, 1) the abidance\nby K-shot, and 2) the insistence on diversity. For the former, we ensure that\nmodels that truly possess problem-solving abilities on K-shot are selected\nrather than those blind guessers. Besides, during data selection, instructions\nthat share task-relevant contexts with K-shot are prioritized. For the latter,\nwe highlight the diversity of constituting experts and that of the fine-tuning\ninstructions throughout the model and data selection process. Extensive\nexperimental results confirm the superiority of our approach over existing\nmethods on utilization of open knowledge across various tasks. Codes and models\nwill be released later.\n","authors":["Yuncheng Yang","Yulei Qin","Tong Wu","Zihan Xu","Gang Li","Pengcheng Guo","Hang Shao","Yucheng Shi","Ke Li","Xing Sun","Jie Yang","Yun Gu"],"pdf_url":"https://arxiv.org/pdf/2408.15915v1.pdf","comment":"28 pages, 12 tables, 10 figures"},{"id":"http://arxiv.org/abs/2311.11844v3","updated":"2024-08-28T16:26:16Z","published":"2023-11-20T15:34:45Z","title":"Towards Human-Level Text Coding with LLMs: The Case of Fatherhood Roles\n in Public Policy Documents","summary":" Recent advances in large language models (LLMs) like GPT-3.5 and GPT-4\npromise automation with better results and less programming, opening up new\nopportunities for text analysis in political science. In this study, we\nevaluate LLMs on three original coding tasks involving typical complexities\nencountered in political science settings: a non-English language, legal and\npolitical jargon, and complex labels based on abstract constructs. 
Along the\npaper, we propose a practical workflow to optimize the choice of the model and\nthe prompt. We find that the best prompting strategy consists of providing the\nLLMs with a detailed codebook, as the one provided to human coders. In this\nsetting, an LLM can be as good as or possibly better than a human annotator\nwhile being much faster, considerably cheaper, and much easier to scale to\nlarge amounts of text. We also provide a comparison of GPT and popular\nopen-source LLMs, discussing the trade-offs in the model's choice. Our software\nallows LLMs to be easily used as annotators and is publicly available:\nhttps://github.com/lorelupo/pappa.\n","authors":["Lorenzo Lupo","Oscar Magnusson","Dirk Hovy","Elin Naurin","Lena Wängnerud"],"pdf_url":"https://arxiv.org/pdf/2311.11844v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15903v1","updated":"2024-08-28T16:15:45Z","published":"2024-08-28T16:15:45Z","title":"LLM-Based Multi-Hop Question Answering with Knowledge Graph Integration\n in Evolving Environments","summary":" The rapid obsolescence of information in Large Language Models (LLMs) has\ndriven the development of various techniques to incorporate new facts. However,\nexisting methods for knowledge editing still face difficulties with multi-hop\nquestions that require accurate fact identification and sequential logical\nreasoning, particularly among numerous fact updates. To tackle these\nchallenges, this paper introduces Graph Memory-based Editing for Large Language\nModels (GMeLLo), a straitforward and effective method that merges the explicit\nknowledge representation of Knowledge Graphs (KGs) with the linguistic\nflexibility of LLMs. Beyond merely leveraging LLMs for question answering,\nGMeLLo employs these models to convert free-form language into structured\nqueries and fact triples, facilitating seamless interaction with KGs for rapid\nupdates and precise multi-hop reasoning. Our results show that GMeLLo\nsignificantly surpasses current state-of-the-art knowledge editing methods in\nthe multi-hop question answering benchmark, MQuAKE, especially in scenarios\nwith extensive knowledge edits.\n","authors":["Ruirui Chen","Weifeng Jiang","Chengwei Qin","Ishaan Singh Rawal","Cheston Tan","Dongkyu Choi","Bo Xiong","Bo Ai"],"pdf_url":"https://arxiv.org/pdf/2408.15903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15901v1","updated":"2024-08-28T16:12:55Z","published":"2024-08-28T16:12:55Z","title":"Nexus: Specialization meets Adaptability for Efficiently Training\n Mixture of Experts","summary":" Efficiency, specialization, and adaptability to new data distributions are\nqualities that are hard to combine in current Large Language Models. The\nMixture of Experts (MoE) architecture has been the focus of significant\nresearch because its inherent conditional computation enables such desirable\nproperties. In this work, we focus on \"upcycling\" dense expert models into an\nMoE, aiming to improve specialization while also adding the ability to adapt to\nnew tasks easily. We introduce Nexus, an enhanced MoE architecture with\nadaptive routing where the model learns to project expert embeddings from\ndomain representations. This approach allows Nexus to flexibly add new experts\nafter the initial upcycling through separately trained dense models, without\nrequiring large-scale MoE training for unseen data domains. 
Our experiments\nshow that Nexus achieves a relative gain of up to 2.1% over the baseline for\ninitial upcycling, and a 18.8% relative gain for extending the MoE with a new\nexpert by using limited finetuning data. This flexibility of Nexus is crucial\nto enable an open-source ecosystem where every user continuously assembles\ntheir own MoE-mix according to their needs.\n","authors":["Nikolas Gritsch","Qizhen Zhang","Acyr Locatelli","Sara Hooker","Ahmet Üstün"],"pdf_url":"https://arxiv.org/pdf/2408.15901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15896v1","updated":"2024-08-28T16:06:12Z","published":"2024-08-28T16:06:12Z","title":"A New Method for Cross-Lingual-based Semantic Role Labeling","summary":" Semantic role labeling is a crucial task in natural language processing,\nenabling better comprehension of natural language. However, the lack of\nannotated data in multiple languages has posed a challenge for researchers. To\naddress this, a deep learning algorithm based on model transfer has been\nproposed. The algorithm utilizes a dataset consisting of the English portion of\nCoNLL2009 and a corpus of semantic roles in Persian. To optimize the efficiency\nof training, only ten percent of the educational data from each language is\nused. The results of the proposed model demonstrate significant improvements\ncompared to Niksirt et al.'s model. In monolingual mode, the proposed model\nachieved a 2.05 percent improvement on F1-score, while in cross-lingual mode,\nthe improvement was even more substantial, reaching 6.23 percent. Worth noting\nis that the compared model only trained two of the four stages of semantic role\nlabeling and employed golden data for the remaining two stages. This suggests\nthat the actual superiority of the proposed model surpasses the reported\nnumbers by a significant margin. The development of cross-lingual methods for\nsemantic role labeling holds promise, particularly in addressing the scarcity\nof annotated data for various languages. These advancements pave the way for\nfurther research in understanding and processing natural language across\ndifferent linguistic contexts.\n","authors":["Mohammad Ebrahimi","Behrouz Minaei Bidgoli","Nasim Khozouei"],"pdf_url":"https://arxiv.org/pdf/2408.15896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15895v1","updated":"2024-08-28T16:05:20Z","published":"2024-08-28T16:05:20Z","title":"Bias in LLMs as Annotators: The Effect of Party Cues on Labelling\n Decision by Large Language Models","summary":" Human coders are biased. We test similar biases in Large Language Models\n(LLMs) as annotators. By replicating an experiment run by Ennser-Jedenastik and\nMeyer (2018), we find evidence that LLMs use political information, and\nspecifically party cues, to judge political statements. Not only do LLMs use\nrelevant information to contextualize whether a statement is positive,\nnegative, or neutral based on the party cue, they also reflect the biases of\nthe human-generated data upon which they have been trained. We also find that\nunlike humans, who are only biased when faced with statements from extreme\nparties, LLMs exhibit significant bias even when prompted with statements from\ncenter-left and center-right parties. 
The implications of our findings are\ndiscussed in the conclusion.\n","authors":["Sebastian Vallejo Vera","Hunter Driggers"],"pdf_url":"https://arxiv.org/pdf/2408.15895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15879v1","updated":"2024-08-28T15:50:41Z","published":"2024-08-28T15:50:41Z","title":"Persuasion Games using Large Language Models","summary":" Large Language Models (LLMs) have emerged as formidable instruments capable\nof comprehending and producing human-like text. This paper explores the\npotential of LLMs, to shape human perspectives and subsequently influence their\ndecisions on particular tasks. This capability finds applications in diverse\ndomains such as Investment, Credit cards and Insurance, wherein they assist\nusers in selecting appropriate insurance policies, investment plans, Credit\ncards, Retail, as well as in Behavioral Change Support Systems (BCSS).\n We present a sophisticated multi-agent framework wherein a consortium of\nagents operate in collaborative manner. The primary agent engages directly with\nusers through persuasive dialogue, while the auxiliary agents perform tasks\nsuch as information retrieval, response analysis, development of persuasion\nstrategies, and validation of facts. Empirical evidence from our experiments\ndemonstrates that this collaborative methodology significantly enhances the\npersuasive efficacy of the LLM. We analyze user resistance to persuasive\nefforts continuously and counteract it by employing a combination of rule-based\nand LLM-based resistance-persuasion mapping techniques.\n We employ simulated personas and generate conversations in insurance,\nbanking, and retail domains to evaluate the proficiency of large language\nmodels (LLMs) in recognizing, adjusting to, and influencing various personality\ntypes. Concurrently, we examine the resistance mechanisms employed by LLM\nsimulated personas. Persuasion is quantified via measurable surveys before and\nafter interaction, LLM-generated scores on conversation, and user decisions\n(purchase or non-purchase).\n","authors":["Ganesh Prasath Ramani","Shirish Karande","Santhosh V","Yash Bhatia"],"pdf_url":"https://arxiv.org/pdf/2408.15879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02731v3","updated":"2024-08-28T15:40:22Z","published":"2023-09-06T05:33:57Z","title":"HC3 Plus: A Semantic-Invariant Human ChatGPT Comparison Corpus","summary":" ChatGPT has garnered significant interest due to its impressive performance;\nhowever, there is growing concern about its potential risks, particularly in\nthe detection of AI-generated content (AIGC), which is often challenging for\nuntrained individuals to identify. Current datasets used for detecting\nChatGPT-generated text primarily focus on question-answering tasks, often\noverlooking tasks with semantic-invariant properties, such as summarization,\ntranslation, and paraphrasing. In this paper, we demonstrate that detecting\nmodel-generated text in semantic-invariant tasks is more challenging. 
To\naddress this gap, we introduce a more extensive and comprehensive dataset that\nincorporates a wider range of tasks than previous work, including those with\nsemantic-invariant properties.\n","authors":["Zhenpeng Su","Xing Wu","Wei Zhou","Guangyuan Ma","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2309.02731v3.pdf","comment":"This paper has been accepted by CIKM2023 workshop"},{"id":"http://arxiv.org/abs/2405.00706v3","updated":"2024-08-28T15:29:10Z","published":"2024-04-23T14:43:35Z","title":"From Complexity to Clarity: How AI Enhances Perceptions of Scientists\n and the Public's Understanding of Science","summary":" This paper evaluated the effectiveness of using generative AI to simplify\nscience communication and enhance the public's understanding of science. By\ncomparing lay summaries of journal articles from PNAS, yoked to those generated\nby AI, this work first assessed linguistic simplicity differences across such\nsummaries and public perceptions in follow-up experiments. Specifically, Study\n1a analyzed simplicity features of PNAS abstracts (scientific summaries) and\nsignificance statements (lay summaries), observing that lay summaries were\nindeed linguistically simpler, but effect size differences were small. Study 1b\nused a large language model, GPT-4, to create significance statements based on\npaper abstracts and this more than doubled the average effect size without\nfine-tuning. Study 2 experimentally demonstrated that simply-written GPT\nsummaries facilitated more favorable perceptions of scientists (they were\nperceived as more credible and trustworthy, but less intelligent) than more\ncomplexly-written human PNAS summaries. Crucially, Study 3 experimentally\ndemonstrated that participants comprehended scientific writing better after\nreading simple GPT summaries compared to complex PNAS summaries. In their own\nwords, participants also summarized scientific papers in a more detailed and\nconcrete manner after reading GPT summaries compared to PNAS summaries of the\nsame article. AI has the potential to engage scientific communities and the\npublic via a simple language heuristic, advocating for its integration into\nscientific dissemination for a more informed society.\n","authors":["David M. Markowitz"],"pdf_url":"https://arxiv.org/pdf/2405.00706v3.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2404.07839v2","updated":"2024-08-28T15:05:42Z","published":"2024-04-11T15:27:22Z","title":"RecurrentGemma: Moving Past Transformers for Efficient Open Language\n Models","summary":" We introduce RecurrentGemma, a family of open language models which uses\nGoogle's novel Griffin architecture. Griffin combines linear recurrences with\nlocal attention to achieve excellent performance on language. It has a\nfixed-sized state, which reduces memory use and enables efficient inference on\nlong sequences. 
We provide two sizes of models, containing 2B and 9B\nparameters, and provide pre-trained and instruction tuned variants for both.\nOur models achieve comparable performance to similarly-sized Gemma baselines\ndespite being trained on fewer tokens.\n","authors":["Aleksandar Botev","Soham De","Samuel L Smith","Anushan Fernando","George-Cristian Muraru","Ruba Haroun","Leonard Berrada","Razvan Pascanu","Pier Giuseppe Sessa","Robert Dadashi","Léonard Hussenot","Johan Ferret","Sertan Girgin","Olivier Bachem","Alek Andreev","Kathleen Kenealy","Thomas Mesnard","Cassidy Hardin","Surya Bhupatiraju","Shreya Pathak","Laurent Sifre","Morgane Rivière","Mihir Sanjay Kale","Juliette Love","Pouya Tafti","Armand Joulin","Noah Fiedel","Evan Senter","Yutian Chen","Srivatsan Srinivasan","Guillaume Desjardins","David Budden","Arnaud Doucet","Sharad Vikram","Adam Paszke","Trevor Gale","Sebastian Borgeaud","Charlie Chen","Andy Brock","Antonia Paterson","Jenny Brennan","Meg Risdal","Raj Gundluru","Nesh Devanathan","Paul Mooney","Nilay Chauhan","Phil Culliton","Luiz Gustavo Martins","Elisa Bandy","David Huntsperger","Glenn Cameron","Arthur Zucker","Tris Warkentin","Ludovic Peran","Minh Giang","Zoubin Ghahramani","Clément Farabet","Koray Kavukcuoglu","Demis Hassabis","Raia Hadsell","Yee Whye Teh","Nando de Frietas"],"pdf_url":"https://arxiv.org/pdf/2404.07839v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01245v2","updated":"2024-08-28T15:01:04Z","published":"2024-04-01T17:03:41Z","title":"A Statistical Framework of Watermarks for Large Language Models: Pivot,\n Detection Efficiency and Optimal Rules","summary":" Since ChatGPT was introduced in November 2022, embedding (nearly)\nunnoticeable statistical signals into text generated by large language models\n(LLMs), also known as watermarking, has been used as a principled approach to\nprovable detection of LLM-generated text from its human-written counterpart. In\nthis paper, we introduce a general and flexible framework for reasoning about\nthe statistical efficiency of watermarks and designing powerful detection\nrules. Inspired by the hypothesis testing formulation of watermark detection,\nour framework starts by selecting a pivotal statistic of the text and a secret\nkey -- provided by the LLM to the verifier -- to enable controlling the false\npositive rate (the error of mistakenly detecting human-written text as\nLLM-generated). Next, this framework allows one to evaluate the power of\nwatermark detection rules by obtaining a closed-form expression of the\nasymptotic false negative rate (the error of incorrectly classifying\nLLM-generated text as human-written). Our framework further reduces the problem\nof determining the optimal detection rule to solving a minimax optimization\nprogram. We apply this framework to two representative watermarks -- one of\nwhich has been internally implemented at OpenAI -- and obtain several findings\nthat can be instrumental in guiding the practice of implementing watermarks. In\nparticular, we derive optimal detection rules for these watermarks under our\nframework. These theoretically derived detection rules are demonstrated to be\ncompetitive and sometimes enjoy a higher power than existing detection\napproaches through numerical experiments.\n","authors":["Xiang Li","Feng Ruan","Huiyuan Wang","Qi Long","Weijie J. 
Su"],"pdf_url":"https://arxiv.org/pdf/2404.01245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00612v2","updated":"2024-08-28T14:59:31Z","published":"2024-08-01T14:52:04Z","title":"Downstream bias mitigation is all you need","summary":" The advent of transformer-based architectures and large language models\n(LLMs) have significantly advanced the performance of natural language\nprocessing (NLP) models. Since these LLMs are trained on huge corpuses of data\nfrom the web and other sources, there has been a major concern about harmful\nprejudices that may potentially be transferred from the data. In many\napplications, these pre-trained LLMs are fine-tuned on task specific datasets,\nwhich can further contribute to biases. This paper studies the extent of biases\nabsorbed by LLMs during pre-training as well as task-specific behaviour after\nfine-tuning. We found that controlled interventions on pre-trained LLMs, prior\nto fine-tuning, have minimal effect on lowering biases in classifiers. However,\nthe biases present in domain-specific datasets play a much bigger role, and\nhence mitigating them at this stage has a bigger impact. While pre-training\ndoes matter, but after the model has been pre-trained, even slight changes to\nco-occurrence rates in the fine-tuning dataset has a significant effect on the\nbias of the model.\n","authors":["Arkadeep Baksi","Rahul Singh","Tarun Joshi"],"pdf_url":"https://arxiv.org/pdf/2408.00612v2.pdf","comment":"arXiv admin note: This work has been withdrawn by arXiv\n administrators due to inappropriate text reuse from external sources"},{"id":"http://arxiv.org/abs/2402.16696v3","updated":"2024-08-28T14:54:11Z","published":"2024-02-26T16:11:03Z","title":"Look Before You Leap: Towards Decision-Aware and Generalizable\n Tool-Usage for Large Language Models","summary":" Tool-augmented large language models (LLMs) are attracting widespread\nattention when accessing up-to-date knowledge and alleviating hallucination\nissues. Nowadays, advanced closed-source LLMs (e.g., ChatGPT) have demonstrated\nsurprising tool-usage capabilities through prompting and in-context learning\ntechniques. To empower the capabilities of open-source LLMs (e.g., LLaMA) in\nmanipulating tools, current efforts focus on either template-driven or\ntoken-triggered tool-usage. However, the former hampers LLMs' flexibility to\naddress diverse user's queries due to constrained tool interactions, while the\nlatter limits the generalizability when engaging with new tools, since\ntool-usage learning is based on task- and tool-specific datasets. To alleviate\nthese concerns, in this paper, we propose a decision-aware and generalizable\ntool-usage framework (DEER). Specifically, we first construct the tool-usage\nsamples with multiple decision branches via an automatic generation pipeline,\nthereby inspiring the decision-making awareness of LLMs under diverse\nscenarios. Meanwhile, we propose a novel tool sampling strategy to enhance the\ngeneralizability of LLMs over unseen tools. 
Extensive experiments demonstrate\nthat our proposed DEER is effective and significantly outperforms baselines\nacross various datasets.\n","authors":["Anchun Gui","Jian Li","Yong Dai","Nan Du","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2402.16696v3.pdf","comment":"20 pages, 18 figures"},{"id":"http://arxiv.org/abs/2408.15836v1","updated":"2024-08-28T14:48:37Z","published":"2024-08-28T14:48:37Z","title":"Knowledge Navigator: LLM-guided Browsing Framework for Exploratory\n Search in Scientific Literature","summary":" The exponential growth of scientific literature necessitates advanced tools\nfor effective knowledge exploration. We present Knowledge Navigator, a system\ndesigned to enhance exploratory search abilities by organizing and structuring\nthe retrieved documents from broad topical queries into a navigable, two-level\nhierarchy of named and descriptive scientific topics and subtopics. This\nstructured organization provides an overall view of the research themes in a\ndomain, while also enabling iterative search and deeper knowledge discovery\nwithin specific subtopics by allowing users to refine their focus and retrieve\nadditional relevant documents. Knowledge Navigator combines LLM capabilities\nwith cluster-based methods to enable an effective browsing method. We\ndemonstrate our approach's effectiveness through automatic and manual\nevaluations on two novel benchmarks, CLUSTREC-COVID and SCITOC. Our code,\nprompts, and benchmarks are made publicly available.\n","authors":["Uri Katz","Mosh Levy","Yoav Goldberg"],"pdf_url":"https://arxiv.org/pdf/2408.15836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13560v2","updated":"2024-08-28T14:45:57Z","published":"2024-03-20T12:52:38Z","title":"eRST: A Signaled Graph Theory of Discourse Relations and Organization","summary":" In this article we present Enhanced Rhetorical Structure Theory (eRST), a new\ntheoretical framework for computational discourse analysis, based on an\nexpansion of Rhetorical Structure Theory (RST). The framework encompasses\ndiscourse relation graphs with tree-breaking, non-projective and concurrent\nrelations, as well as implicit and explicit signals which give explainable\nrationales to our analyses. We survey shortcomings of RST and other existing\nframeworks, such as Segmented Discourse Representation Theory (SDRT), the Penn\nDiscourse Treebank (PDTB) and Discourse Dependencies, and address these using\nconstructs in the proposed theory. We provide annotation, search and\nvisualization tools for data, and present and evaluate a freely available\ncorpus of English annotated according to our framework, encompassing 12 spoken\nand written genres with over 200K tokens. Finally, we discuss automatic\nparsing, evaluation metrics and applications for data in our framework.\n","authors":["Amir Zeldes","Tatsuya Aoyama","Yang Janet Liu","Siyao Peng","Debopam Das","Luke Gessler"],"pdf_url":"https://arxiv.org/pdf/2403.13560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15827v1","updated":"2024-08-28T14:40:15Z","published":"2024-08-28T14:40:15Z","title":"Automatic Differential Diagnosis using Transformer-Based Multi-Label\n Sequence Classification","summary":" As the field of artificial intelligence progresses, assistive technologies\nare becoming more widely used across all industries. The healthcare industry is\nno different, with numerous studies being done to develop assistive tools for\nhealthcare professionals. 
Automatic diagnostic systems are one such beneficial\ntool that can assist with a variety of tasks, including collecting patient\ninformation, analyzing test results, and diagnosing patients. However, the idea\nof developing systems that can provide a differential diagnosis has been\nlargely overlooked in most of these research studies. In this study, we propose\na transformer-based approach for providing differential diagnoses based on a\npatient's age, sex, medical history, and symptoms. We use the DDXPlus dataset,\nwhich provides differential diagnosis information for patients based on 49\ndisease types. Firstly, we propose a method to process the tabular patient data\nfrom the dataset and engineer them into patient reports to make them suitable\nfor our research. In addition, we introduce two data modification modules to\ndiversify the training data and consequently improve the robustness of the\nmodels. We approach the task as a multi-label classification problem and\nconduct extensive experiments using four transformer models. All the models\ndisplayed promising results by achieving over 97% F1 score on the held-out test\nset. Moreover, we design additional behavioral tests to get a broader\nunderstanding of the models. In particular, for one of our test cases, we\nprepared a custom test set of 100 samples with the assistance of a doctor. The\nresults on the custom set showed that our proposed data modification modules\nimproved the model's generalization capabilities. We hope our findings will\nprovide future researchers with valuable insights and inspire them to develop\nreliable systems for automatic differential diagnosis.\n","authors":["Abu Adnan Sadi","Mohammad Ashrafuzzaman Khan","Lubaba Binte Saber"],"pdf_url":"https://arxiv.org/pdf/2408.15827v1.pdf","comment":"25 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.14511v2","updated":"2024-08-28T14:13:41Z","published":"2024-08-25T04:07:18Z","title":"Unveiling the Statistical Foundations of Chain-of-Thought Prompting\n Methods","summary":" Chain-of-Thought (CoT) prompting and its variants have gained popularity as\neffective methods for solving multi-step reasoning problems using pretrained\nlarge language models (LLMs). In this work, we analyze CoT prompting from a\nstatistical estimation perspective, providing a comprehensive characterization\nof its sample complexity. To this end, we introduce a multi-step latent\nvariable model that encapsulates the reasoning process, where the latent\nvariable encodes the task information. Under this framework, we demonstrate\nthat when the pretraining dataset is sufficiently large, the estimator formed\nby CoT prompting is equivalent to a Bayesian estimator. This estimator\neffectively solves the multi-step reasoning problem by aggregating a posterior\ndistribution inferred from the demonstration examples in the prompt. Moreover,\nwe prove that the statistical error of the CoT estimator can be decomposed into\ntwo main components: (i) a prompting error, which arises from inferring the\ntrue task using CoT prompts, and (ii) the statistical error of the pretrained\nLLM. We establish that, under appropriate assumptions, the prompting error\ndecays exponentially to zero as the number of demonstrations increases.\nAdditionally, we explicitly characterize the approximation and generalization\nerrors of the pretrained LLM. 
Notably, we construct a transformer model that\napproximates the target distribution of the multi-step reasoning problem with\nan error that decreases exponentially in the number of transformer blocks. Our\nanalysis extends to other variants of CoT, including Self-Consistent CoT,\nTree-of-Thought, and Selection-Inference, offering a broad perspective on the\nefficacy of these methods. We also provide numerical experiments to validate\nthe theoretical findings.\n","authors":["Xinyang Hu","Fengzhuo Zhang","Siyu Chen","Zhuoran Yang"],"pdf_url":"https://arxiv.org/pdf/2408.14511v2.pdf","comment":"150 pages, 18 figures, 3 tables"},{"id":"http://arxiv.org/abs/2402.14846v4","updated":"2024-08-28T14:04:05Z","published":"2024-02-19T14:53:01Z","title":"Stick to your Role! Stability of Personal Values Expressed in Large\n Language Models","summary":" The standard way to study Large Language Models (LLMs) with benchmarks or\npsychology questionnaires is to provide many different queries from similar\nminimal contexts (e.g. multiple choice questions). However, due to LLMs' highly\ncontext-dependent nature, conclusions from such minimal-context evaluations may\nbe little informative about the model's behavior in deployment (where it will\nbe exposed to many new contexts). We argue that context-dependence\n(specifically, value stability) should be studied as a specific property of\nLLMs and used as another dimension of LLM comparison (alongside others such as\ncognitive abilities, knowledge, or model size). We present a case-study on the\nstability of value expression over different contexts (simulated conversations\non different topics) as measured using a standard psychology questionnaire\n(PVQ) and on behavioral downstream tasks. Reusing methods from psychology, we\nstudy Rank-order stability on the population (interpersonal) level, and\nIpsative stability on the individual (intrapersonal) level. We consider two\nsettings (with and without instructing LLMs to simulate particular personas),\ntwo simulated populations, and three downstream tasks. We observe consistent\ntrends in the stability of models and model families - Mixtral, Mistral,\nGPT-3.5 and Qwen families are more stable than LLaMa-2 and Phi. The consistency\nof these trends implies that some models exhibit higher value stability than\nothers, and that stability can be estimated with the set of introduced\nmethodological tools. When instructed to simulate particular personas, LLMs\nexhibit low Rank-order stability, which further diminishes with conversation\nlength. This highlights the need for future research on LLMs that coherently\nsimulate different personas. 
This paper provides a foundational step in that\ndirection, and, to our knowledge, it is the first study of value stability in\nLLMs.\n","authors":["Grgur Kovač","Rémy Portelas","Masataka Sawayama","Peter Ford Dominey","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2402.14846v4.pdf","comment":"The project website and code are available at\n https://sites.google.com/view/llmvaluestability Published in PLOS ONE (\n https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0309114 ),\n and a shorter version at CogSci 24 (\n https://escholarship.org/uc/item/7w4823c6 )"},{"id":"http://arxiv.org/abs/2408.15801v1","updated":"2024-08-28T13:52:19Z","published":"2024-08-28T13:52:19Z","title":"Scaling Up Summarization: Leveraging Large Language Models for Long Text\n Extractive Summarization","summary":" In an era where digital text is proliferating at an unprecedented rate,\nefficient summarization tools are becoming indispensable. While Large Language\nModels (LLMs) have been successfully applied in various NLP tasks, their role\nin extractive text summarization remains underexplored. This paper introduces\nEYEGLAXS (Easy Yet Efficient larGe LAnguage model for eXtractive\nSummarization), a framework that leverages LLMs, specifically LLAMA2-7B and\nChatGLM2-6B, for extractive summarization of lengthy text documents. Instead of\nabstractive methods, which often suffer from issues like factual inaccuracies\nand hallucinations, EYEGLAXS focuses on extractive summarization to ensure\nfactual and grammatical integrity. Utilizing state-of-the-art techniques such\nas Flash Attention and Parameter-Efficient Fine-Tuning (PEFT), EYEGLAXS\naddresses the computational and resource challenges typically associated with\nLLMs. The system sets new performance benchmarks on well-known datasets like\nPubMed and ArXiv. Furthermore, we extend our research through additional\nanalyses that explore the adaptability of LLMs in handling different sequence\nlengths and their efficiency in training on smaller datasets. These\ncontributions not only set a new standard in the field but also open up\npromising avenues for future research in extractive text summarization.\n","authors":["Léo Hemamou","Mehdi Debiane"],"pdf_url":"https://arxiv.org/pdf/2408.15801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15793v1","updated":"2024-08-28T13:37:07Z","published":"2024-08-28T13:37:07Z","title":"Language Adaptation on a Tight Academic Compute Budget: Tokenizer\n Swapping Works and Pure bfloat16 Is Enough","summary":" We investigate continued pretraining of LLMs for language adaptation on a\ntight academic budget: a setting in which only a few GPUs can be used in\nparallel, for a heavily constrained duration. We focus on adapting Mistral-7B\nto German or Arabic and evaluate several techniques to improve efficiency and\neffectiveness in this setting. Our German models adapted on this tight compute\nbudget underperform compared to the base Mistral-7B, while our Arabic models\noutperform several baselines, showing that for sufficiently well-represented\nlanguages, continued pretraining for specialization is not always helpful. Our\nmain findings focus on training precision and tokenizer swapping. Our results\nshow that pure bfloat16 training is a viable alternative to mixed-precision\ntraining, while being much faster when only using a few GPUs. 
Swapping the\ntokenizer for a specialized one yields more efficient tokenization and is\ncompetitive with the original tokenizer, which already contains some German\ntokens, but did not significantly increase performance for German. Code and\nmodel weights are available at on GitHub.\n","authors":["Konstantin Dobler","Gerard de Melo"],"pdf_url":"https://arxiv.org/pdf/2408.15793v1.pdf","comment":"WANT@ICML 2024"},{"id":"http://arxiv.org/abs/2408.15787v1","updated":"2024-08-28T13:29:59Z","published":"2024-08-28T13:29:59Z","title":"Interactive Agents: Simulating Counselor-Client Psychological Counseling\n via Role-Playing LLM-to-LLM Interactions","summary":" Virtual counselors powered by large language models (LLMs) aim to create\ninteractive support systems that effectively assist clients struggling with\nmental health challenges. To replicate counselor-client conversations,\nresearchers have built an online mental health platform that allows\nprofessional counselors to provide clients with text-based counseling services\nfor about an hour per session. Notwithstanding its effectiveness, challenges\nexist as human annotation is time-consuming, cost-intensive, privacy-protected,\nand not scalable. To address this issue and investigate the applicability of\nLLMs in psychological counseling conversation simulation, we propose a\nframework that employs two LLMs via role-playing for simulating\ncounselor-client interactions. Our framework involves two LLMs, one acting as a\nclient equipped with a specific and real-life user profile and the other\nplaying the role of an experienced counselor, generating professional responses\nusing integrative therapy techniques. We implement both the counselor and the\nclient by zero-shot prompting the GPT-4 model. In order to assess the\neffectiveness of LLMs in simulating counselor-client interactions and\nunderstand the disparities between LLM- and human-generated conversations, we\nevaluate the synthetic data from various perspectives. We begin by assessing\nthe client's performance through automatic evaluations. Next, we analyze and\ncompare the disparities between dialogues generated by the LLM and those\ngenerated by professional counselors. Furthermore, we conduct extensive\nexperiments to thoroughly examine the performance of our LLM-based counselor\ntrained with synthetic interactive dialogues by benchmarking against\nstate-of-the-art models for mental health.\n","authors":["Huachuan Qiu","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2408.15787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14438v2","updated":"2024-08-28T13:19:36Z","published":"2024-08-26T17:25:16Z","title":"Evaluating Large Language Models on Spatial Tasks: A Multi-Task\n Benchmarking Study","summary":" The advent of large language models such as ChatGPT, Gemini, and others has\nunderscored the importance of evaluating their diverse capabilities, ranging\nfrom natural language understanding to code generation. However, their\nperformance on spatial tasks has not been comprehensively assessed. This study\naddresses this gap by introducing a novel multi-task spatial evaluation\ndataset, designed to systematically explore and compare the performance of\nseveral advanced models on spatial tasks. The dataset encompasses twelve\ndistinct task types, including spatial understanding and path planning, each\nwith verified, accurate answers. We evaluated multiple models, including\nOpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase\ntesting approach. 
Initially, we conducted zero-shot testing, followed by\ncategorizing the dataset by difficulty and performing prompt tuning tests.\nResults indicate that gpt-4o achieved the highest overall accuracy in the first\nphase, with an average of 71.3%. Although moonshot-v1-8k slightly\nunderperformed overall, it surpassed gpt-4o in place name recognition tasks.\nThe study also highlights the impact of prompt strategies on model performance\nin specific tasks. For example, the Chain-of-Thought (COT) strategy increased\ngpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot\nstrategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to\n76.3%.\n","authors":["Liuchang Xu","Shuo Zhao","Qingming Lin","Luyao Chen","Qianqian Luo","Sensen Wu","Xinyue Ye","Hailin Feng","Zhenhong Du"],"pdf_url":"https://arxiv.org/pdf/2408.14438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15778v1","updated":"2024-08-28T13:16:41Z","published":"2024-08-28T13:16:41Z","title":"LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language\n Models","summary":" Large Language Models (LLMs) have demonstrated notable capabilities across\nvarious tasks, showcasing complex problem-solving abilities. Understanding and\nexecuting complex rules, along with multi-step planning, are fundamental to\nlogical reasoning and critical for practical LLM agents and decision-making\nsystems. However, evaluating LLMs as effective rule-based executors and\nplanners remains underexplored. In this paper, we introduce LogicGame, a novel\nbenchmark designed to evaluate the comprehensive rule understanding, execution,\nand planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame\nprovides diverse games that contain a series of rules with an initial state,\nrequiring models to comprehend and apply predefined regulations to solve\nproblems. We create simulated scenarios in which models execute or plan\noperations to achieve specific outcomes. These game scenarios are specifically\ndesigned to distinguish logical reasoning from mere knowledge by relying\nexclusively on predefined rules. This separation allows for a pure assessment\nof rule-based reasoning capabilities. The evaluation considers not only final\noutcomes but also intermediate steps, providing a comprehensive assessment of\nmodel performance. Moreover, these intermediate steps are deterministic and can\nbe automatically verified. LogicGame defines game scenarios with varying\ndifficulty levels, from simple rule applications to complex reasoning chains,\nin order to offer a precise evaluation of model performance on rule\nunderstanding and multi-step execution. Utilizing LogicGame, we test various\nLLMs and identify notable shortcomings in their rule-based logical reasoning\nabilities.\n","authors":["Jiayi Gui","Yiming Liu","Jiale Cheng","Xiaotao Gu","Xiao Liu","Hongning Wang","Yuxiao Dong","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2408.15778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15769v1","updated":"2024-08-28T13:05:55Z","published":"2024-08-28T13:05:55Z","title":"A Survey on Evaluation of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) mimic human perception and reasoning\nsystem by integrating powerful Large Language Models (LLMs) with various\nmodality encoders (e.g., vision, audio), positioning LLMs as the \"brain\" and\nvarious modality encoders as sensory organs. 
This framework endows MLLMs with\nhuman-like capabilities, and suggests a potential pathway towards achieving\nartificial general intelligence (AGI). With the emergence of all-round MLLMs\nlike GPT-4V and Gemini, a multitude of evaluation methods have been developed\nto assess their capabilities across different dimensions. This paper presents a\nsystematic and comprehensive review of MLLM evaluation methods, covering the\nfollowing key aspects: (1) the background of MLLMs and their evaluation; (2)\n\"what to evaluate\" that reviews and categorizes existing MLLM evaluation tasks\nbased on the capabilities assessed, including general multimodal recognition,\nperception, reasoning and trustworthiness, and domain-specific applications\nsuch as socioeconomic, natural sciences and engineering, medical usage, AI\nagent, remote sensing, video and audio processing, 3D point cloud analysis, and\nothers; (3) \"where to evaluate\" that summarizes MLLM evaluation benchmarks into\ngeneral and specific benchmarks; (4) \"how to evaluate\" that reviews and\nillustrates MLLM evaluation steps and metrics; Our overarching goal is to\nprovide valuable insights for researchers in the field of MLLM evaluation,\nthereby facilitating the development of more capable and reliable MLLMs. We\nemphasize that evaluation should be regarded as a critical discipline,\nessential for advancing the field of MLLMs.\n","authors":["Jiaxing Huang","Jingyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15766v1","updated":"2024-08-28T12:59:12Z","published":"2024-08-28T12:59:12Z","title":"Harmonized Speculative Sampling","summary":" Speculative sampling has proven to be an effective solution to accelerate\ndecoding from large language models, where the acceptance rate significantly\ndetermines the performance. Most previous works on improving the acceptance\nrate focus on aligned training and efficient decoding, implicitly paying less\nattention to the linkage of training and decoding. In this work, we first\ninvestigate the linkage of training and decoding for speculative sampling and\nthen propose a solution named HArmonized Speculative Sampling (HASS). HASS\nimproves the acceptance rate without extra inference overhead by harmonizing\ntraining and decoding on their objectives and contexts. Experiments on three\nLLaMA models demonstrate that HASS achieves 2.81x-3.65x wall-clock time speedup\nratio averaging across three datasets, which is 8%-15% faster than EAGLE-2.\n","authors":["Lefan Zhang","Xiaodan Wang","Yanhua Huang","Ruiwen Xu"],"pdf_url":"https://arxiv.org/pdf/2408.15766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15747v1","updated":"2024-08-28T12:25:45Z","published":"2024-08-28T12:25:45Z","title":"Form and meaning co-determine the realization of tone in Taiwan Mandarin\n spontaneous speech: the case of Tone 3 sandhi","summary":" In Standard Chinese, Tone 3 (the dipping tone) becomes Tone 2 (rising tone)\nwhen followed by another Tone 3. Previous studies have noted that this sandhi\nprocess may be incomplete, in the sense that the assimilated Tone 3 is still\ndistinct from a true Tone 2. While Mandarin Tone 3 sandhi is widely studied\nusing carefully controlled laboratory speech (Xu, 1997) and more formal\nregisters of Beijing Mandarin (Yuan and Chen, 2014), less is known about its\nrealization in spontaneous speech, and about the effect of contextual factors\non tonal realization. 
The present study investigates the pitch contours of\ntwo-character words with T2-T3 and T3-T3 tone patterns in spontaneous Taiwan\nMandarin conversations. Our analysis makes use of the Generative Additive Mixed\nModel (GAMM, Wood, 2017) to examine fundamental frequency (f0) contours as a\nfunction of normalized time. We consider various factors known to influence\npitch contours, including gender, speaking rate, speaker, neighboring tones,\nword position, bigram probability, and also novel predictors, word and word\nsense (Chuang et al., 2024). Our analyses revealed that in spontaneous Taiwan\nMandarin, T3-T3 words become indistinguishable from T2-T3 words, indicating\ncomplete sandhi, once the strong effect of word (or word sense) is taken into\naccount. For our data, the shape of f0 contours is not co-determined by word\nfrequency. In contrast, the effect of word meaning on f0 contours is robust, as\nstrong as the effect of adjacent tones, and is present for both T2-T3 and T3-T3\nwords.\n","authors":["Yuxin Lu","Yu-Ying Chuang","R. Harald Baayen"],"pdf_url":"https://arxiv.org/pdf/2408.15747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14398v2","updated":"2024-08-28T12:03:54Z","published":"2024-08-26T16:29:13Z","title":"Language-specific Calibration for Pruning Multilingual Language Models","summary":" Recent advances in large language model (LLM) pruning have shown\nstate-of-the-art compression results in post-training and retraining-free\nsettings while maintaining high predictive performance. However, such research\nmainly considers calibrating pruning using English text, despite the\nmultilingual nature of modern LLMs and their frequent uses in non-English\nlanguages. In this paper, we set out to explore effective strategies for\ncalibrating the pruning of multilingual language models. We present the first\ncomprehensive empirical study, comparing different calibration languages for\npruning multilingual models across diverse tasks, models, and state-of-the-art\npruning techniques. Our results present practical suggestions, for example,\ncalibrating in the target language can efficiently yield lower perplexity, but\ndoes not necessarily benefit downstream tasks. Our further analysis experiments\nunveil that calibration in the target language mainly contributes to preserving\nlanguage-specific features related to fluency and coherence, but might not\ncontribute to capturing language-agnostic features such as language\nunderstanding and reasoning. Last, we provide practical recommendations for\nfuture practitioners.\n","authors":["Simon Kurz","Jian-Jia Chen","Lucie Flek","Zhixue Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.14398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15729v1","updated":"2024-08-28T11:44:52Z","published":"2024-08-28T11:44:52Z","title":"LM-PUB-QUIZ: A Comprehensive Framework for Zero-Shot Evaluation of\n Relational Knowledge in Language Models","summary":" Knowledge probing evaluates the extent to which a language model (LM) has\nacquired relational knowledge during its pre-training phase. It provides a\ncost-effective means of comparing LMs of different sizes and training setups\nand is useful for monitoring knowledge gained or lost during continual learning\n(CL). 
In prior work, we presented an improved knowledge probe called BEAR\n(Wiland et al., 2024), which enables the comparison of LMs trained with\ndifferent pre-training objectives (causal and masked LMs) and addresses issues\nof skewed distributions in previous probes to deliver a more unbiased reading\nof LM knowledge. With this paper, we present LM-PUB- QUIZ, a Python framework\nand leaderboard built around the BEAR probing mechanism that enables\nresearchers and practitioners to apply it in their work. It provides options\nfor standalone evaluation and direct integration into the widely-used training\npipeline of the Hugging Face TRANSFORMERS library. Further, it provides a\nfine-grained analysis of different knowledge types to assist users in better\nunderstanding the knowledge in each evaluated LM. We publicly release\nLM-PUB-QUIZ as an open-source project.\n","authors":["Max Ploner","Jacek Wiland","Sebastian Pohl","Alan Akbik"],"pdf_url":"https://arxiv.org/pdf/2408.15729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15720v1","updated":"2024-08-28T11:36:29Z","published":"2024-08-28T11:36:29Z","title":"An Evaluation of Sindhi Word Embedding in Semantic Analogies and\n Downstream Tasks","summary":" In this paper, we propose a new word embedding based corpus consisting of\nmore than 61 million words crawled from multiple web resources. We design a\npreprocessing pipeline for the filtration of unwanted text from crawled data.\nAfterwards, the cleaned vocabulary is fed to state-of-the-art\ncontinuous-bag-of-words, skip-gram, and GloVe word embedding algorithms. For\nthe evaluation of pretrained embeddings, we use popular intrinsic and extrinsic\nevaluation approaches. The evaluation results reveal that\ncontinuous-bag-of-words and skip-gram perform better than GloVe and existing\nSindhi fastText word embedding on both intrinsic and extrinsic evaluation\napproaches\n","authors":["Wazir Ali","Saifullah Tumrani","Jay Kumar","Tariq Rahim Soomro"],"pdf_url":"https://arxiv.org/pdf/2408.15720v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:1911.12579"},{"id":"http://arxiv.org/abs/2408.15710v1","updated":"2024-08-28T11:18:06Z","published":"2024-08-28T11:18:06Z","title":"Conan-embedding: General Text Embedding with More and Better Negative\n Samples","summary":" With the growing popularity of RAG, the capabilities of embedding models are\ngaining increasing attention. Embedding models are primarily trained through\ncontrastive loss learning, with negative examples being a key component.\nPrevious work has proposed various hard negative mining strategies, but these\nstrategies are typically employed as preprocessing steps. In this paper, we\npropose the conan-embedding model, which maximizes the utilization of more and\nhigher-quality negative examples. Specifically, since the model's ability to\nhandle preprocessed negative examples evolves during training, we propose\ndynamic hard negative mining method to expose the model to more challenging\nnegative examples throughout the training process. Secondly, contrastive\nlearning requires as many negative examples as possible but is limited by GPU\nmemory constraints. Therefore, we use a Cross-GPU balancing Loss to provide\nmore negative examples for embedding training and balance the batch size across\nmultiple tasks. Moreover, we also discovered that the prompt-response pairs\nfrom LLMs can be used for embedding training. 
Our approach effectively enhances\nthe capabilities of embedding models, currently ranking first on the Chinese\nleaderboard of Massive text embedding benchmark\n","authors":["Shiyu Li","Yang Tang","Shizhe Chen","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11239v2","updated":"2024-08-28T11:10:59Z","published":"2024-06-17T06:07:32Z","title":"Evading AI-Generated Content Detectors using Homoglyphs","summary":" The advent of large language models (LLMs) has enabled the generation of text\nthat increasingly exhibits human-like characteristics. As the detection of such\ncontent is of significant importance, numerous studies have been conducted with\nthe aim of developing reliable AI-generated text detectors. These detectors\nhave demonstrated promising results on test data, but recent research has\nrevealed that they can be circumvented by employing different techniques. In\nthis paper, we present homoglyph-based attacks ($a \\rightarrow {\\alpha}$) as a\nmeans of circumventing existing detectors. A comprehensive evaluation was\nconducted to assess the effectiveness of these attacks on seven detectors,\nincluding ArguGPT, Binoculars, DetectGPT, Fast-DetectGPT, Ghostbuster, OpenAI's\ndetector, and watermarking techniques, on five different datasets. Our findings\ndemonstrate that homoglyph-based attacks can effectively circumvent\nstate-of-the-art detectors, leading them to classify all texts as either\nAI-generated or human-written (decreasing the average Matthews Correlation\nCoefficient from 0.64 to -0.01). We then examine the effectiveness of these\nattacks by analyzing how homoglyphs impact different families of detectors.\nFinally, we discuss the implications of these findings and potential defenses\nagainst such attacks.\n","authors":["Aldan Creo","Shushanta Pudasaini"],"pdf_url":"https://arxiv.org/pdf/2406.11239v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11537v3","updated":"2024-08-28T10:39:11Z","published":"2024-02-18T10:36:05Z","title":"Deciphering the Impact of Pretraining Data on Large Language Models\n through Machine Unlearning","summary":" Through pretraining on a corpus with various sources, Large Language Models\n(LLMs) have gained impressive performance. However, the impact of each\ncomponent of the pretraining corpus remains opaque. As a result, the\norganization of the pretraining corpus is still empirical and may deviate from\nthe optimal. To address this issue, we systematically analyze the impact of 48\ndatasets from 5 major categories of pretraining data of LLMs and measure their\nimpacts on LLMs using benchmarks about nine major categories of model\ncapabilities. Our analyses provide empirical results about the contribution of\nmultiple corpora on the performances of LLMs, along with their joint impact\npatterns, including complementary, orthogonal, and correlational relationships.\nWe also identify a set of ``high-impact data'' such as Books that is\nsignificantly related to a set of model capabilities. 
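To make the homoglyph-based attack summarized above (arXiv:2406.11239) concrete, here is a minimal sketch; the substitution table and replacement rate are illustrative assumptions rather than the paper's exact mapping.

import random

# Illustrative map from Latin letters to visually near-identical Unicode characters.
HOMOGLYPHS = {"a": "\u03b1", "e": "\u0435", "o": "\u043e"}  # Greek alpha, Cyrillic e and o

def homoglyph_attack(text, rate=0.5):
    # The output looks unchanged to a human reader, but its character/token sequence
    # differs, which is what degrades detector scores in the attack described above.
    return "".join(
        HOMOGLYPHS[ch] if ch in HOMOGLYPHS and random.random() < rate else ch
        for ch in text
    )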
These findings provide\ninsights into the organization of data to support more efficient pretraining of\nLLMs.\n","authors":["Yang Zhao","Li Du","Xiao Ding","Kai Xiong","Zhouhao Sun","Jun Shi","Ting Liu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2402.11537v3.pdf","comment":"Accepted by ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2408.15689v1","updated":"2024-08-28T10:25:53Z","published":"2024-08-28T10:25:53Z","title":"TempoFormer: A Transformer for Temporally-aware Representations in\n Change Detection","summary":" Dynamic representation learning plays a pivotal role in understanding the\nevolution of linguistic content over time. On this front both context and time\ndynamics as well as their interplay are of prime importance. Current approaches\nmodel context via pre-trained representations, which are typically temporally\nagnostic. Previous work on modeling context and temporal dynamics has used\nrecurrent methods, which are slow and prone to overfitting. Here we introduce\nTempoFormer, the first task-agnostic transformer-based and temporally-aware\nmodel for dynamic representation learning. Our approach is jointly trained on\ninter and intra context dynamics and introduces a novel temporal variation of\nrotary positional embeddings. The architecture is flexible and can be used as\nthe temporal representation foundation of other models or applied to different\ntransformer-based architectures. We show new SOTA performance on three\ndifferent real-time change detection tasks.\n","authors":["Talia Tseriotou","Adam Tsakalidis","Maria Liakata"],"pdf_url":"https://arxiv.org/pdf/2408.15689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15666v1","updated":"2024-08-28T09:35:15Z","published":"2024-08-28T09:35:15Z","title":"StyleRemix: Interpretable Authorship Obfuscation via Distillation and\n Perturbation of Style Elements","summary":" Authorship obfuscation, rewriting a text to intentionally obscure the\nidentity of the author, is an important but challenging task. Current methods\nusing large language models (LLMs) lack interpretability and controllability,\noften ignoring author-specific stylistic features, resulting in less robust\nperformance overall.\n To address this, we develop StyleRemix, an adaptive and interpretable\nobfuscation method that perturbs specific, fine-grained style elements of the\noriginal input text. StyleRemix uses pre-trained Low Rank Adaptation (LoRA)\nmodules to rewrite an input specifically along various stylistic axes (e.g.,\nformality and length) while maintaining low computational cost. StyleRemix\noutperforms state-of-the-art baselines and much larger LLMs in a variety of\ndomains as assessed by both automatic and human evaluation.\n Additionally, we release AuthorMix, a large set of 30K high-quality,\nlong-form texts from a diverse set of 14 authors and 4 domains, and DiSC, a\nparallel corpus of 1,500 texts spanning seven style axes in 16 unique\ndirections.\n","authors":["Jillian Fisher","Skyler Hallinan","Ximing Lu","Mitchell Gordon","Zaid Harchaoui","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2408.15666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15664v1","updated":"2024-08-28T09:31:09Z","published":"2024-08-28T09:31:09Z","title":"Auxiliary-Loss-Free Load Balancing Strategy for Mixture-of-Experts","summary":" For Mixture-of-Experts (MoE) models, an unbalanced expert load will lead to\nrouting collapse or increased computational overhead. 
Existing methods commonly\nemploy an auxiliary loss to encourage load balance, but a large auxiliary loss\nwill introduce non-negligible interference gradients into training and thus\nimpair the model performance. In order to control load balance while not\nproducing undesired gradients during training, we propose Loss-Free Balancing,\nfeatured by an auxiliary-loss-free load balancing strategy. To be specific,\nbefore the top-K routing decision, Loss-Free Balancing will first apply an\nexpert-wise bias to the routing scores of each expert. By dynamically updating\nthe bias of each expert according to its recent load, Loss-Free Balancing can\nconsistently maintain a balanced distribution of expert load. In addition,\nsince Loss-Free Balancing does not produce any interference gradients, it also\nelevates the upper bound of model performance gained from MoE training. We\nvalidate the performance of Loss-Free Balancing on MoE models with up to 3B\nparameters trained on up to 200B tokens. Experimental results show that\nLoss-Free Balancing achieves both better performance and better load balance\ncompared with traditional auxiliary-loss-controlled load balancing strategies.\n","authors":["Lean Wang","Huazuo Gao","Chenggang Zhao","Xu Sun","Damai Dai"],"pdf_url":"https://arxiv.org/pdf/2408.15664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15650v1","updated":"2024-08-28T09:07:30Z","published":"2024-08-28T09:07:30Z","title":"Harnessing the Intrinsic Knowledge of Pretrained Language Models for\n Challenging Text Classification Settings","summary":" Text classification is crucial for applications such as sentiment analysis\nand toxic text filtering, but it still faces challenges due to the complexity\nand ambiguity of natural language. Recent advancements in deep learning,\nparticularly transformer architectures and large-scale pretraining, have\nachieved inspiring success in NLP fields. Building on these advancements, this\nthesis explores three challenging settings in text classification by leveraging\nthe intrinsic knowledge of pretrained language models (PLMs). Firstly, to\naddress the challenge of selecting misleading yet incorrect distractors for\ncloze questions, we develop models that utilize features based on\ncontextualized word representations from PLMs, achieving performance that\nrivals or surpasses human accuracy. Secondly, to enhance model generalization\nto unseen labels, we create small finetuning datasets with domain-independent\ntask label descriptions, improving model performance and robustness. Lastly, we\ntackle the sensitivity of large language models to in-context learning prompts\nby selecting effective demonstrations, focusing on misclassified examples and\nresolving model ambiguity regarding test example labels.\n","authors":["Lingyu Gao"],"pdf_url":"https://arxiv.org/pdf/2408.15650v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2205.11245v4","updated":"2024-08-28T08:51:57Z","published":"2022-05-18T04:38:15Z","title":"PASH at TREC 2021 Deep Learning Track: Generative Enhanced Model for\n Multi-stage Ranking","summary":" This paper describes the PASH participation in TREC 2021 Deep Learning Track.\nIn the recall stage, we adopt a scheme combining sparse and dense retrieval\nmethod. In the multi-stage ranking phase, point-wise and pair-wise ranking\nstrategies are used one after another based on model continual pre-trained on\ngeneral knowledge and document-level data. 
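A minimal sketch of the auxiliary-loss-free balancing idea from the abstract above (arXiv:2408.15664), assuming a simplified top-K router; the sign-based update and step size are illustrative, not necessarily the paper's exact rule.

import numpy as np

def select_experts(gate_scores, expert_bias, k):
    # The bias is added only for expert *selection*; the gating weights applied to the
    # chosen experts still come from the unbiased scores, so no gradient flows through
    # the bias and no auxiliary loss term is needed.
    biased = gate_scores + expert_bias                  # [tokens, experts]
    return np.argpartition(-biased, k, axis=-1)[:, :k]  # top-k expert ids per token

def update_bias(expert_bias, expert_load, step=1e-3):
    # After each batch, nudge under-loaded experts up and over-loaded experts down
    # based on their recent load, keeping the distribution of tokens balanced.
    return expert_bias + step * np.sign(expert_load.mean() - expert_load)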
Compared to TREC 2020 Deep Learning\nTrack, we have additionally introduced the generative model T5 to further\nenhance the performance.\n","authors":["Yixuan Qiao","Hao Chen","Jun Wang","Tuozhen Liu","Xianbin Ye","Xin Tang","Rui Fang","Peng Gao","Wenfeng Xie","Guotong Xie"],"pdf_url":"https://arxiv.org/pdf/2205.11245v4.pdf","comment":"TREC 2021"},{"id":"http://arxiv.org/abs/2405.20770v3","updated":"2024-08-28T08:46:17Z","published":"2024-05-24T07:23:56Z","title":"Large Language Model Sentinel: LLM Agent for Adversarial Purification","summary":" Over the past two years, the use of large language models (LLMs) has advanced\nrapidly. While these LLMs offer considerable convenience, they also raise\nsecurity concerns, as LLMs are vulnerable to adversarial attacks by some\nwell-designed textual perturbations. In this paper, we introduce a novel\ndefense technique named Large LAnguage MOdel Sentinel (LLAMOS), which is\ndesigned to enhance the adversarial robustness of LLMs by purifying the\nadversarial textual examples before feeding them into the target LLM. Our\nmethod comprises two main components: a) Agent instruction, which can simulate\na new agent for adversarial defense, altering minimal characters to maintain\nthe original meaning of the sentence while defending against attacks; b)\nDefense guidance, which provides strategies for modifying clean or adversarial\nexamples to ensure effective defense and accurate outputs from the target LLMs.\nRemarkably, the defense agent demonstrates robust defensive capabilities even\nwithout learning from adversarial examples. Additionally, we conduct an\nintriguing adversarial experiment where we develop two agents, one for defense\nand one for attack, and engage them in mutual confrontation. During the\nadversarial interactions, neither agent completely beat the other. Extensive\nexperiments on both open-source and closed-source LLMs demonstrate that our\nmethod effectively defends against adversarial attacks, thereby enhancing\nadversarial robustness.\n","authors":["Guang Lin","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20770v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15625v1","updated":"2024-08-28T08:25:22Z","published":"2024-08-28T08:25:22Z","title":"CBF-LLM: Safe Control for LLM Alignment","summary":" This paper proposes a control-based framework for aligning large language\nmodels (LLMs) by leveraging a control barrier function (CBF) to ensure\nuser-desirable text generation. The presented framework applies the safety\nfilter, designed based on the CBF, to the output generation of the baseline\nLLM, i.e., the sequence of the token, with the aim of intervening in the\ngenerated text. The overall text-generation system is implemented with Llama 3\nand a RoBERTa model, and the source code is available at\nhttps://github.com/Mya-Mya/CBF-LLM. The experiment demonstrates its control\nability and effectiveness in reducing the number of interventions needed for\nuser-specified alignment tasks.\n","authors":["Yuya Miyaoka","Masaki Inoue"],"pdf_url":"https://arxiv.org/pdf/2408.15625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15616v1","updated":"2024-08-28T08:14:51Z","published":"2024-08-28T08:14:51Z","title":"Beyond Levenshtein: Leveraging Multiple Algorithms for Robust Word Error\n Rate Computations And Granular Error Classifications","summary":" The Word Error Rate (WER) is the common measure of accuracy for Automatic\nSpeech Recognition (ASR). 
Transcripts are usually pre-processed by substituting\nspecific characters to account for non-semantic differences. As a result of\nthis normalisation, information on the accuracy of punctuation or\ncapitalisation is lost. We present a non-destructive, token-based approach\nusing an extended Levenshtein distance algorithm to compute a robust WER and\nadditional orthographic metrics. Transcription errors are also classified more\ngranularly by existing string similarity and phonetic algorithms. An evaluation\non several datasets demonstrates the practical equivalence of our approach\ncompared to common WER computations. We also provide an exemplary analysis of\nderived use cases, such as a punctuation error rate, and a web application for\ninteractive use and visualisation of our implementation. The code is available\nopen-source.\n","authors":["Korbinian Kuhn","Verena Kersken","Gottfried Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2408.15616v1.pdf","comment":"Accepted in INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2406.18312v4","updated":"2024-08-28T08:07:49Z","published":"2024-06-26T12:51:37Z","title":"AI-native Memory: A Pathway from LLMs Towards AGI","summary":" Large language models (LLMs) have demonstrated the world with the sparks of\nartificial general intelligence (AGI). One opinion, especially from some\nstartups working on LLMs, argues that an LLM with nearly unlimited context\nlength can realize AGI. However, they might be too optimistic about the\nlong-context capability of (existing) LLMs -- (1) Recent literature has shown\nthat their effective context length is significantly smaller than their claimed\ncontext length; and (2) Our reasoning-in-a-haystack experiments further\ndemonstrate that simultaneously finding the relevant information from a long\ncontext and conducting (simple) reasoning is nearly impossible. In this paper,\nwe envision a pathway from LLMs to AGI through the integration of\n\\emph{memory}. We believe that AGI should be a system where LLMs serve as core\nprocessors. In addition to raw data, the memory in this system would store a\nlarge number of important conclusions derived from reasoning processes.\nCompared with retrieval-augmented generation (RAG) that merely processes raw\ndata, this approach not only connects semantically related information closer,\nbut also simplifies complex inferences at the time of querying. As an\nintermediate stage, the memory will likely be in the form of natural language\ndescriptions, which can be directly consumed by users too. Ultimately, every\nagent/person should have its own large personal model, a deep neural network\nmodel (thus \\emph{AI-native}) that parameterizes and compresses all types of\nmemory, even the ones that cannot be described by natural languages. 
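To ground the WER discussion in the "Beyond Levenshtein" summary above, here is a minimal sketch of the standard token-level Levenshtein-based WER; the paper's extended algorithm additionally preserves punctuation and capitalisation information, which this plain version does not.

def word_error_rate(reference, hypothesis):
    # Classic dynamic-programming edit distance over word tokens, where substitutions,
    # insertions, and deletions each cost 1, normalised by the reference length.
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
    return d[len(ref)][len(hyp)] / max(len(ref), 1)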
Finally, we\ndiscuss the significant potential of AI-native memory as the transformative\ninfrastructure for (proactive) engagement, personalization, distribution, and\nsocial in the AGI era, as well as the incurred privacy and security challenges\nwith preliminary solutions.\n","authors":["Jingbo Shang","Zai Zheng","Jiale Wei","Xiang Ying","Felix Tao","Mindverse Team"],"pdf_url":"https://arxiv.org/pdf/2406.18312v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09333v2","updated":"2024-08-28T07:34:37Z","published":"2024-08-18T02:27:25Z","title":"SkyScript-100M: 1,000,000,000 Pairs of Scripts and Shooting Scripts for\n Short Drama","summary":" Generating high-quality shooting scripts containing information such as scene\nand shot language is essential for short drama script generation. We collect\n6,660 popular short drama episodes from the Internet, each with an average of\n100 short episodes, and the total number of short episodes is about 80,000,\nwith a total duration of about 2,000 hours and totaling 10 terabytes (TB). We\nperform keyframe extraction and annotation on each episode to obtain about\n10,000,000 shooting scripts. We perform 100 script restorations on the\nextracted shooting scripts based on our self-developed large short drama\ngeneration model SkyReels. This leads to a dataset containing 1,000,000,000\npairs of scripts and shooting scripts for short dramas, called SkyScript-100M.\nWe compare SkyScript-100M with the existing dataset in detail and demonstrate\nsome deeper insights that can be achieved based on SkyScript-100M. Based on\nSkyScript-100M, researchers can achieve several deeper and more far-reaching\nscript optimization goals, which may drive a paradigm shift in the entire field\nof text-to-video and significantly advance the field of short drama video\ngeneration. The data and code are available at\nhttps://github.com/vaew/SkyScript-100M.\n","authors":["Jing Tang","Quanlu Jia","Yuqiang Xie","Zeyu Gong","Xiang Wen","Jiayi Zhang","Yalong Guo","Guibin Chen","Jiangping Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09333v2.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.13893v2","updated":"2024-08-28T07:16:37Z","published":"2024-08-25T17:07:39Z","title":"SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with\n Flow-based Scalar Latent Transformer Diffusion Models","summary":" Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as\nan effective method for improving the diversity and naturalness of synthesized\nspeech. At the high level, previous large-scale TTS models can be categorized\ninto either Auto-regressive (AR) based (\\textit{e.g.}, VALL-E) or\nNon-auto-regressive (NAR) based models (\\textit{e.g.}, NaturalSpeech 2/3).\nAlthough these works demonstrate good performance, they still have potential\nweaknesses. For instance, AR-based models are plagued by unstable generation\nquality and slow generation speed; meanwhile, some NAR-based models need\nphoneme-level duration alignment information, thereby increasing the complexity\nof data pre-processing, model design, and loss design. In this work, we build\nupon our previous publication by implementing a simple and efficient\nnon-autoregressive (NAR) TTS framework, termed SimpleSpeech 2. 
SimpleSpeech 2\neffectively combines the strengths of both autoregressive (AR) and\nnon-autoregressive (NAR) methods, offering the following key advantages: (1)\nsimplified data preparation; (2) straightforward model and loss design; and (3)\nstable, high-quality generation performance with fast inference speed. Compared\nto our previous publication, we present ({\\romannumeral1}) a detailed analysis\nof the influence of speech tokenizer and noisy label for TTS performance;\n({\\romannumeral2}) four distinct types of sentence duration predictors;\n({\\romannumeral3}) a novel flow-based scalar latent transformer diffusion\nmodel. With these improvements, we show a significant improvement in generation\nperformance and generation speed compared to our previous work and other\nstate-of-the-art (SOTA) large-scale TTS models. Furthermore, we show that\nSimpleSpeech 2 can be seamlessly extended to multilingual TTS by training it on\nmultilingual speech datasets. Demos are available on:\n{https://dongchaoyang.top/SimpleSpeech2\\_demo/}.\n","authors":["Dongchao Yang","Rongjie Huang","Yuanyuan Wang","Haohan Guo","Dading Chong","Songxiang Liu","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2408.13893v2.pdf","comment":"Submit to TASLP"},{"id":"http://arxiv.org/abs/2408.15565v1","updated":"2024-08-28T06:33:03Z","published":"2024-08-28T06:33:03Z","title":"SIaM: Self-Improving Code-Assisted Mathematical Reasoning of Large\n Language Models","summary":" There is a growing trend of teaching large language models (LLMs) to solve\nmathematical problems through coding. Existing studies primarily focus on\nprompting powerful, closed-source models to generate seed training data\nfollowed by in-domain data augmentation, equipping LLMs with considerable\ncapabilities for code-aided mathematical reasoning. However, continually\ntraining these models on augmented data derived from a few datasets such as\nGSM8K may impair their generalization abilities and restrict their\neffectiveness to a narrow range of question types. Conversely, the potential of\nimproving such LLMs by leveraging large-scale, expert-written, diverse math\nquestion-answer pairs remains unexplored. To utilize these resources and tackle\nunique challenges such as code response assessment, we propose a novel paradigm\nthat uses a code-based critic model to guide steps including question-code data\nconstruction, quality control, and complementary evaluation. We also explore\ndifferent alignment algorithms with self-generated instruction/preference data\nto foster continuous improvement. 
Experiments across both in-domain (up to\n+5.7%) and out-of-domain (+4.4%) benchmarks in English and Chinese demonstrate\nthe effectiveness of the proposed paradigm.\n","authors":["Dian Yu","Baolin Peng","Ye Tian","Linfeng Song","Haitao Mi","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2408.15565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15562v1","updated":"2024-08-28T06:28:01Z","published":"2024-08-28T06:28:01Z","title":"Boosting Lossless Speculative Decoding via Feature Sampling and Partial\n Alignment Distillation","summary":" Lossless speculative decoding accelerates target large language model (LLM)\ninference by employing a lightweight draft model for generating tree-structured\ncandidates, which are subsequently verified in parallel by the target LLM.\nCurrently, effective approaches leverage feature-level rather than token-level\nautoregression within the draft model to facilitate more straightforward\npredictions and enhanced knowledge distillation. In this paper, we reassess\nthese approaches and propose FSPAD (Feature Sampling and Partial Alignment\nDistillation for Lossless Speculative Decoding), which introduces two\nstraightforward and effective components within the existing framework to boost\nlossless speculative decoding. Firstly, FSPAD utilizes token embeddings to\nsample features of the target LLM in high-dimensional space before feeding them\ninto the draft model, due to the inherent uncertainty of the features\npreventing the draft model from obtaining the specific token output by the\ntarget LLM. Secondly, FSPAD introduces partial alignment distillation to weaken\nthe draft model's connection between features and logits, aiming to reduce the\nconflict between feature alignment and logit confidence during training. Our\nexperiments include both greedy and non-greedy decoding on the largest and\nsmallest models from the Vicuna and LLaMA3-Instruct series, as well as tasks in\nmulti-turn conversation, translation, summarization, question answering,\nmathematical reasoning, and retrieval-augmented generation. The results show\nthat FSPAD outperforms the state-of-the-art method across all the\naforementioned tasks and target LLMs.\n","authors":["Lujun Gui","Bin Xiao","Lei Su","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15562v1.pdf","comment":"The work was not submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2408.15549v1","updated":"2024-08-28T05:53:46Z","published":"2024-08-28T05:53:46Z","title":"WildFeedback: Aligning LLMs With In-situ User Interactions And Feedback","summary":" As large language models (LLMs) continue to advance, aligning these models\nwith human preferences has emerged as a critical challenge. Traditional\nalignment methods, relying on human or LLM annotated datasets, are limited by\ntheir resource-intensive nature, inherent subjectivity, and the risk of\nfeedback loops that amplify model biases. To overcome these limitations, we\nintroduce WildFeedback, a novel framework that leverages real-time, in-situ\nuser interactions to create preference datasets that more accurately reflect\nauthentic human values. WildFeedback operates through a three-step process:\nfeedback signal identification, preference data construction, and user-guided\nevaluation. We applied this framework to a large corpus of user-LLM\nconversations, resulting in a rich preference dataset that reflects genuine\nuser preferences. 
This dataset captures the nuances of user preferences by\nidentifying and classifying feedback signals within natural conversations,\nthereby enabling the construction of more representative and context-sensitive\nalignment data. Our extensive experiments demonstrate that LLMs fine-tuned on\nWildFeedback exhibit significantly improved alignment with user preferences, as\nevidenced by both traditional benchmarks and our proposed user-guided\nevaluation. By incorporating real-time feedback from actual users, WildFeedback\naddresses the scalability, subjectivity, and bias challenges that plague\nexisting approaches, marking a significant step toward developing LLMs that are\nmore responsive to the diverse and evolving needs of their users. In summary,\nWildFeedback offers a robust, scalable solution for aligning LLMs with true\nhuman values, setting a new standard for the development and evaluation of\nuser-centric language models.\n","authors":["Taiwei Shi","Zhuoer Wang","Longqi Yang","Ying-Chun Lin","Zexue He","Mengting Wan","Pei Zhou","Sujay Jauhar","Xiaofeng Xu","Xia Song","Jennifer Neville"],"pdf_url":"https://arxiv.org/pdf/2408.15549v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2408.15545v1","updated":"2024-08-28T05:41:52Z","published":"2024-08-28T05:41:52Z","title":"SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding","summary":" Scientific literature understanding is crucial for extracting targeted\ninformation and garnering insights, thereby significantly advancing scientific\ndiscovery. Despite the remarkable success of Large Language Models (LLMs), they\nface challenges in scientific literature understanding, primarily due to (1) a\nlack of scientific knowledge and (2) unfamiliarity with specialized scientific\ntasks.\n To develop an LLM specialized in scientific literature understanding, we\npropose a hybrid strategy that integrates continual pre-training (CPT) and\nsupervised fine-tuning (SFT), to simultaneously infuse scientific domain\nknowledge and enhance instruction-following capabilities for domain-specific\ntasks. In this process, we identify two key challenges: (1) constructing\nhigh-quality CPT corpora, and (2) generating diverse SFT instructions. We\naddress these challenges through a meticulous pipeline, including PDF text\nextraction, parsing content error correction, quality filtering, and synthetic\ninstruction creation. Applying this strategy, we present a suite of LLMs:\nSciLitLLM, specialized in scientific literature understanding. These models\ndemonstrate promising performance on scientific literature understanding\nbenchmarks.\n Our contributions are threefold: (1) We present an effective framework that\nintegrates CPT and SFT to adapt LLMs to scientific literature understanding,\nwhich can also be easily adapted to other domains. (2) We propose an LLM-based\nsynthesis method to generate diverse and high-quality scientific instructions,\nresulting in a new instruction set -- SciLitIns -- for supervised fine-tuning\nin less-represented scientific domains. 
(3) SciLitLLM achieves promising\nperformance improvements on scientific literature understanding benchmarks.\n","authors":["Sihang Li","Jian Huang","Jiaxi Zhuang","Yaorui Shi","Xiaochen Cai","Mingjun Xu","Xiang Wang","Linfeng Zhang","Guolin Ke","Hengxing Cai"],"pdf_url":"https://arxiv.org/pdf/2408.15545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15543v1","updated":"2024-08-28T05:36:25Z","published":"2024-08-28T05:36:25Z","title":"An Investigation of Warning Erroneous Chat Translations in Cross-lingual\n Communication","summary":" The complexities of chats pose significant challenges for machine translation\nmodels. Recognizing the need for a precise evaluation metric to address the\nissues of chat translation, this study introduces Multidimensional Quality\nMetrics for Chat Translation (MQM-Chat). Through the experiments of five models\nusing MQM-Chat, we observed that all models generated certain fundamental\nerrors, while each of them has different shortcomings, such as omission, overly\ncorrecting ambiguous source content, and buzzword issues, resulting in the loss\nof stylized information. Our findings underscore the effectiveness of MQM-Chat\nin evaluating chat translation, emphasizing the importance of stylized content\nand dialogue consistency for future studies.\n","authors":["Yunmeng Li","Jun Suzuki","Makoto Morishita","Kaori Abe","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2408.15543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08872v2","updated":"2024-08-28T05:03:34Z","published":"2024-08-16T17:57:01Z","title":"xGen-MM (BLIP-3): A Family of Open Large Multimodal Models","summary":" This report introduces xGen-MM (also known as BLIP-3), a framework for\ndeveloping Large Multimodal Models (LMMs). The framework comprises meticulously\ncurated datasets, a training recipe, model architectures, and a resulting suite\nof LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen\ninitiative on foundation AI models. Our models undergo rigorous evaluation\nacross a range of tasks, including both single and multi-image benchmarks. Our\npre-trained base model exhibits strong in-context learning capabilities and the\ninstruction-tuned model demonstrates competitive performance among open-source\nLMMs with similar model sizes. In addition, we introduce a safety-tuned model\nwith DPO, aiming to mitigate harmful behaviors such as hallucinations and\nimprove safety. We open-source our models, curated large-scale datasets, and\nour fine-tuning codebase to facilitate further advancements in LMM research.\nAssociated resources will be available on our project page above.\n","authors":["Le Xue","Manli Shu","Anas Awadalla","Jun Wang","An Yan","Senthil Purushwalkam","Honglu Zhou","Viraj Prabhu","Yutong Dai","Michael S Ryoo","Shrikant Kendre","Jieyu Zhang","Can Qin","Shu Zhang","Chia-Chih Chen","Ning Yu","Juntao Tan","Tulika Manoj Awalgaonkar","Shelby Heinecke","Huan Wang","Yejin Choi","Ludwig Schmidt","Zeyuan Chen","Silvio Savarese","Juan Carlos Niebles","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15533v1","updated":"2024-08-28T04:44:43Z","published":"2024-08-28T04:44:43Z","title":"LRP4RAG: Detecting Hallucinations in Retrieval-Augmented Generation via\n Layer-wise Relevance Propagation","summary":" Retrieval-Augmented Generation (RAG) has become a primary technique for\nmitigating hallucinations in large language models (LLMs). 
However, incomplete\nknowledge extraction and insufficient understanding can still mislead LLMs to\nproduce irrelevant or even contradictory responses, which means hallucinations\npersist in RAG. In this paper, we propose LRP4RAG, a method based on the\nLayer-wise Relevance Propagation (LRP) algorithm for detecting hallucinations\nin RAG. Specifically, we first utilize LRP to compute the relevance between the\ninput and output of the RAG generator. We then apply further extraction and\nresampling to the relevance matrix. The processed relevance data are input into\nmultiple classifiers to determine whether the output contains hallucinations.\nTo the best of our knowledge, this is the first time that LRP has been used for\ndetecting RAG hallucinations, and extensive experiments demonstrate that\nLRP4RAG outperforms existing baselines.\n","authors":["Haichuan Hu","Yuhan Sun","Qunjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15518v1","updated":"2024-08-28T04:06:14Z","published":"2024-08-28T04:06:14Z","title":"Dolphin: Long Context as a New Modality for Energy-Efficient On-Device\n Language Models","summary":" This paper presents Dolphin, a novel decoder-decoder architecture for\nenergy-efficient processing of long contexts in language models. Our approach\naddresses the significant energy consumption and latency challenges inherent in\non-device models. Dolphin employs a compact 0.5B parameter decoder to distill\nextensive contextual information into a memory embedding, substantially\nreducing the input length for the primary 7B parameter decoder model. Inspired\nby vision-language models, we repurpose the image embedding projector to encode\nlong textual contexts, effectively treating extended context as a distinct\nmodality. This innovative method enables processing of substantially longer\ncontexts without the typical computational overhead associated with extended\ninput sequences. Empirical evaluations demonstrate a 10-fold improvement in\nenergy efficiency and a 5-fold reduction in latency compared to conventional\nfull-length context processing methods without losing quality of the response.\nOur work contributes to the development of more sustainable and scalable\nlanguage models for on-device applications, addressing the critical need for\nenergy-efficient and responsive AI technologies in resource-constrained\nenvironments while maintaining the accuracy to understand long contexts. This\nresearch has implications for the broader field of natural language processing,\nparticularly in the domain of efficient model design for resource-limited\nsettings. By enabling more sophisticated AI capabilities on edge devices,\nDolphin paves the way for advanced language processing in a wide range of\napplications where computational resources are at a premium. The Dolphin model\nis publicly available at https://huggingface.co/NexaAIDev/Dolphin.\n","authors":["Wei Chen","Zhiyuan Li","Shuo Xin","Yihao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15040v2","updated":"2024-08-28T03:56:37Z","published":"2024-08-27T13:10:05Z","title":"A Survey of Large Language Models for European Languages","summary":" Large Language Models (LLMs) have gained significant attention due to their\nhigh performance on a wide range of natural language tasks since the release of\nChatGPT. 
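For the LRP4RAG pipeline summarized above (arXiv:2408.15533), the final classification stage could look roughly like the following sketch; it assumes the input-output relevance matrix has already been produced by an LRP implementation, and the pooling scheme and classifier choice are illustrative assumptions rather than the paper's design.

import numpy as np
from sklearn.linear_model import LogisticRegression

def resample_relevance(relevance, bins=32):
    # Pool a variable-size [input_tokens x output_tokens] relevance matrix into a
    # fixed-length vector so that an off-the-shelf classifier can consume it.
    flat = np.sort(relevance.ravel())
    chunks = np.array_split(flat, min(bins, flat.size))
    vec = np.array([chunk.mean() for chunk in chunks])
    return np.pad(vec, (0, bins - vec.size))

# Hypothetical usage, given labelled RAG outputs and their relevance matrices:
# X = np.stack([resample_relevance(r) for r in relevance_matrices])
# detector = LogisticRegression().fit(X, hallucination_labels)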
The LLMs learn to understand and generate language by training\nbillions of model parameters on vast volumes of text data. Despite being a\nrelatively new field, LLM research is rapidly advancing in various directions.\nIn this paper, we present an overview of LLM families, including LLaMA, PaLM,\nGPT, and MoE, and the methods developed to create and enhance LLMs for official\nEuropean Union (EU) languages. We provide a comprehensive summary of common\nmonolingual and multilingual datasets used for pretraining large language\nmodels.\n","authors":["Wazir Ali","Sampo Pyysalo"],"pdf_url":"https://arxiv.org/pdf/2408.15040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15512v1","updated":"2024-08-28T03:48:05Z","published":"2024-08-28T03:48:05Z","title":"Towards Fully Autonomous Research Powered by LLMs: Case Study on\n Simulations","summary":" The advent of Large Language Models (LLMs) has created new opportunities for\nthe automation of scientific research, spanning both experimental processes and\ncomputational simulations. This study explores the feasibility of constructing\nan autonomous simulation agent (ASA) powered by LLM, through sophisticated API\nintegration, to automate the entire research process, from experimental design,\nremote upload and simulation execution, data analysis, to report compilation.\nUsing a simulation problem of polymer chain conformations as a case study, we\nassessed the performance of ASAs powered by different LLMs including\nGPT-4-Turbo. Our findings revealed that ASA-GPT-4o achieved near-flawless\nexecution on designated research missions, underscoring the potential of LLMs\nto manage complete scientific investigations autonomously. The outlined\nautomation can be iteratively performed up to twenty cycles without human\nintervention, illustrating the potential of LLMs for large-scale autonomous\nresearch endeavors. Additionally, we discussed the intrinsic traits of ASAs in\nmanaging extensive tasks, focusing on self-validation mechanisms and the\nbalance between local attention and global oversight.\n","authors":["Zhihan Liu","Yubo Chai","Jianfeng Li"],"pdf_url":"https://arxiv.org/pdf/2408.15512v1.pdf","comment":"For additional code and data, please visit our GitHub repository:\n https://github.com/zokaraa/autonomous_simulation_agent"},{"id":"http://arxiv.org/abs/2408.07611v2","updated":"2024-08-28T03:47:28Z","published":"2024-08-14T15:19:16Z","title":"WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation\n Integrating Web Search and Knowledge Graphs","summary":" Large Language Models (LLMs) have greatly contributed to the development of\nadaptive intelligent agents and are positioned as an important way to achieve\nArtificial General Intelligence (AGI). However, LLMs are prone to produce\nfactually incorrect information and often produce \"phantom\" content that\nundermines their reliability, which poses a serious challenge for their\ndeployment in real-world scenarios. Enhancing LLMs by combining external\ndatabases and information retrieval mechanisms is an effective path. To address\nthe above challenges, we propose a new approach called WeKnow-RAG, which\nintegrates Web search and Knowledge Graphs into a \"Retrieval-Augmented\nGeneration (RAG)\" system. First, the accuracy and reliability of LLM responses\nare improved by combining the structured representation of Knowledge Graphs\nwith the flexibility of dense vector retrieval. 
WeKnow-RAG then utilizes\ndomain-specific knowledge graphs to satisfy a variety of queries and domains,\nthereby improving performance on factual information and complex reasoning\ntasks by employing multi-stage web page retrieval techniques using both sparse\nand dense retrieval methods. Our approach effectively balances the efficiency\nand accuracy of information retrieval, thus improving the overall retrieval\nprocess. Finally, we also integrate a self-assessment mechanism for the LLM to\nevaluate the trustworthiness of the answers it generates. Our approach proves\nits outstanding effectiveness in a wide range of offline experiments and online\nsubmissions.\n","authors":["Weijian Xie","Xuefeng Liang","Yuhui Liu","Kaihua Ni","Hong Cheng","Zetian Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07611v2.pdf","comment":"8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta\n KDD Cup 2024 CRAG Challenge"},{"id":"http://arxiv.org/abs/2408.15510v1","updated":"2024-08-28T03:45:49Z","published":"2024-08-28T03:45:49Z","title":"Measuring the Reliability of Causal Probing Methods: Tradeoffs,\n Limitations, and the Plight of Nullifying Interventions","summary":" Causal probing is an approach to interpreting foundation models, such as\nlarge language models, by training probes to recognize latent properties of\ninterest from embeddings, intervening on probes to modify this representation,\nand analyzing the resulting changes in the model's behavior. While some recent\nworks have cast doubt on the theoretical basis of several leading causal\nprobing intervention methods, it has been unclear how to systematically and\nempirically evaluate their effectiveness in practice. To address this problem,\nwe propose a general empirical analysis framework to evaluate the reliability\nof causal probing interventions, formally defining and quantifying two key\ncausal probing desiderata: completeness (fully transforming the representation\nof the target property) and selectivity (minimally impacting other properties).\nOur formalism allows us to make the first direct comparisons between different\nfamilies of causal probing methods (e.g., linear vs. nonlinear or\ncounterfactual vs. nullifying interventions). We conduct extensive experiments\nacross several leading methods, finding that (1) there is an inherent tradeoff\nbetween these criteria, and no method is able to consistently satisfy both at\nonce; and (2) across the board, nullifying interventions are always far less\ncomplete than counterfactual interventions, indicating that nullifying methods\nmay not be an effective approach to causal probing.\n","authors":["Marc Canby","Adam Davies","Chirag Rastogi","Julia Hockenmaier"],"pdf_url":"https://arxiv.org/pdf/2408.15510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15496v1","updated":"2024-08-28T02:47:27Z","published":"2024-08-28T02:47:27Z","title":"ReMamba: Equip Mamba with Effective Long-Sequence Modeling","summary":" While the Mamba architecture demonstrates superior inference efficiency and\ncompetitive performance on short-context natural language processing (NLP)\ntasks, empirical evidence suggests its capacity to comprehend long contexts is\nlimited compared to transformer-based models. In this study, we investigate the\nlong-context efficiency issues of the Mamba models and propose ReMamba, which\nenhances Mamba's ability to comprehend long contexts. 
ReMamba incorporates\nselective compression and adaptation techniques within a two-stage re-forward\nprocess, incurring minimal additional inference overhead. Experimental\nresults on the LongBench and L-Eval benchmarks demonstrate ReMamba's efficacy,\nimproving over the baselines by 3.2 and 1.6 points, respectively, and attaining\nperformance almost on par with same-size transformer models.\n","authors":["Danlong Yuan","Jiahao Liu","Bei Li","Huishuai Zhang","Jingang Wang","Xunliang Cai","Dongyan Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.15496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15491v1","updated":"2024-08-28T02:31:15Z","published":"2024-08-28T02:31:15Z","title":"Enhancing and Accelerating Large Language Models via Instruction-Aware\n Contextual Compression","summary":" Large Language Models (LLMs) have garnered widespread attention due to their\nremarkable performance across various tasks. However, to mitigate the issue of\nhallucinations, LLMs often incorporate a retrieval-augmented pipeline to provide\nthem with rich external knowledge and context. Nevertheless, challenges stem\nfrom inaccurate and coarse-grained context retrieved from the retriever.\nSupplying irrelevant context to the LLMs can result in poorer responses,\nincreased inference latency, and higher costs. This paper introduces a method\ncalled Instruction-Aware Contextual Compression, which filters out less\ninformative content, thereby accelerating and enhancing the use of LLMs. The\nexperimental results demonstrate that Instruction-Aware Contextual Compression\nnotably reduces memory consumption and minimizes generation latency while\nmaintaining performance levels comparable to those achieved with the use of the\nfull context. Specifically, we achieved a 50% reduction in context-related\ncosts, resulting in a 5% reduction in inference memory usage and a 2.2-fold\nincrease in inference speed, with only a minor drop of 0.047 in Rouge-1. These\nfindings suggest that our method strikes an effective balance between\nefficiency and performance.\n","authors":["Haowen Hou","Fei Ma","Binwen Bai","Xinxin Zhu","Fei Yu"],"pdf_url":"https://arxiv.org/pdf/2408.15491v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2408.15488v1","updated":"2024-08-28T02:27:07Z","published":"2024-08-28T02:27:07Z","title":"Legilimens: Practical and Unified Content Moderation for Large Language\n Model Services","summary":" Given the societal impact of unsafe content generated by large language\nmodels (LLMs), ensuring that LLM services comply with safety standards is a\ncrucial concern for LLM service providers. Common content moderation methods\nare limited by an effectiveness-and-efficiency dilemma, where simple models are\nfragile while sophisticated models consume excessive computational resources.\nIn this paper, we reveal for the first time that effective and efficient\ncontent moderation can be achieved by extracting conceptual features from\nchat-oriented LLMs, despite their initial fine-tuning for conversation rather\nthan content moderation. We propose a practical and unified content moderation\nframework for LLM services, named Legilimens, which features both effectiveness\nand efficiency. Our red-team model-based data augmentation enhances the\nrobustness of Legilimens against state-of-the-art jailbreaking. Additionally,\nwe develop a framework to theoretically analyze the cost-effectiveness of\nLegilimens compared to other methods. 
We have conducted extensive experiments\non five host LLMs, seventeen datasets, and nine jailbreaking methods to verify\nthe effectiveness, efficiency, and robustness of Legilimens against normal and\nadaptive adversaries. A comparison of Legilimens with both commercial and\nacademic baselines demonstrates the superior performance of Legilimens.\nFurthermore, we confirm that Legilimens can be applied to few-shot scenarios\nand extended to multi-label classification tasks.\n","authors":["Jialin Wu","Jiangyi Deng","Shengyuan Pang","Yanjiao Chen","Jiayang Xu","Xinfeng Li","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2408.15488v1.pdf","comment":"Accepted by ACM Conference on Computer and Communications Security\n (CCS) 2024"},{"id":"http://arxiv.org/abs/2407.05750v3","updated":"2024-08-28T02:04:24Z","published":"2024-07-08T09:03:12Z","title":"Large Language Models Understand Layout","summary":" Large language models (LLMs) demonstrate extraordinary abilities in a wide\nrange of natural language processing (NLP) tasks. In this paper, we show that,\nbeyond text understanding capability, LLMs are capable of processing text\nlayouts that are denoted by spatial markers. They are able to answer questions\nthat require explicit spatial perceiving and reasoning, while a drastic\nperformance drop is observed when the spatial markers from the original data\nare excluded. We perform a series of experiments with the GPT-3.5, Baichuan2,\nLlama2 and ChatGLM3 models on various types of layout-sensitive datasets for\nfurther analysis. The experimental results reveal that the layout understanding\nability of LLMs is mainly introduced by the coding data for pretraining, which\nis further enhanced at the instruction-tuning stage. In addition, layout\nunderstanding can be enhanced by integrating low-cost, auto-generated data\napproached by a novel text game. Finally, we show that layout understanding\nability is beneficial for building efficient visual question-answering (VQA)\nsystems.\n","authors":["Weiming Li","Manni Duan","Dong An","Yan Shao"],"pdf_url":"https://arxiv.org/pdf/2407.05750v3.pdf","comment":"This paper has been accepted by ECAI-2024"},{"id":"http://arxiv.org/abs/2408.14895v2","updated":"2024-08-28T01:56:33Z","published":"2024-08-27T09:18:57Z","title":"VHAKG: A Multi-modal Knowledge Graph Based on Synchronized Multi-view\n Videos of Daily Activities","summary":" Multi-modal knowledge graphs (MMKGs), which ground various non-symbolic data\n(e.g., images and videos) into symbols, have attracted attention as resources\nenabling knowledge processing and machine learning across modalities. However,\nthe construction of MMKGs for videos consisting of multiple events, such as\ndaily activities, is still in the early stages. In this paper, we construct an\nMMKG based on synchronized multi-view simulated videos of daily activities.\nBesides representing the content of daily life videos as event-centric\nknowledge, our MMKG also includes frame-by-frame fine-grained changes, such as\nbounding boxes within video frames. In addition, we provide support tools for\nquerying our MMKG. 
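As a rough, probe-style illustration of reusing a chat LLM's internal activations as "conceptual features" for moderation (Legilimens, above): the placeholder model, layer choice, and last-token pooling below are assumptions made for the sketch, not the paper's design.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def conceptual_feature(model, tokenizer, text, layer=-1):
    # One forward pass through the host LLM; the hidden state of the final token at a
    # chosen layer serves as a cheap feature vector for a lightweight safety classifier.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        out = model(**inputs, output_hidden_states=True)
    return out.hidden_states[layer][0, -1]

# tok = AutoTokenizer.from_pretrained("gpt2")            # placeholder model for the sketch
# lm = AutoModelForCausalLM.from_pretrained("gpt2")
# feature = conceptual_feature(lm, tok, "example chat message")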
As an application example, we demonstrate that our MMKG\nfacilitates benchmarking vision-language models by providing the necessary\nvision-language datasets for a tailored task.\n","authors":["Shusaku Egami","Takahiro Ugai","Swe Nwe Nwe Htun","Ken Fukuda"],"pdf_url":"https://arxiv.org/pdf/2408.14895v2.pdf","comment":"5 pages, 4 figures, accepted by CIKM2024 Resource Track"},{"id":"http://arxiv.org/abs/2408.16163v1","updated":"2024-08-28T22:51:29Z","published":"2024-08-28T22:51:29Z","title":"FRACTURED-SORRY-Bench: Framework for Revealing Attacks in Conversational\n Turns Undermining Refusal Efficacy and Defenses over SORRY-Bench","summary":" This paper introduces FRACTURED-SORRY-Bench, a framework for evaluating the\nsafety of Large Language Models (LLMs) against multi-turn conversational\nattacks. Building upon the SORRY-Bench dataset, we propose a simple yet\neffective method for generating adversarial prompts by breaking down harmful\nqueries into seemingly innocuous sub-questions. Our approach achieves a maximum\nincrease of +46.22\\% in Attack Success Rates (ASRs) across GPT-4, GPT-4o,\nGPT-4o-mini, and GPT-3.5-Turbo models compared to baseline methods. We\ndemonstrate that this technique poses a challenge to current LLM safety\nmeasures and highlights the need for more robust defenses against subtle,\nmulti-turn attacks.\n","authors":["Aman Priyanshu","Supriti Vijay"],"pdf_url":"https://arxiv.org/pdf/2408.16163v1.pdf","comment":"4 pages, 2 tables"},{"id":"http://arxiv.org/abs/2408.16131v1","updated":"2024-08-28T20:36:35Z","published":"2024-08-28T20:36:35Z","title":"Evaluating Computational Representations of Character: An Austen\n Character Similarity Benchmark","summary":" Several systems have been developed to extract information about characters\nto aid computational analysis of English literature. We propose character\nsimilarity grouping as a holistic evaluation task for these pipelines. We\npresent AustenAlike, a benchmark suite of character similarities in Jane\nAusten's novels. Our benchmark draws on three notions of character similarity:\na structurally defined notion of similarity; a socially defined notion of\nsimilarity; and an expert defined set extracted from literary criticism.\n We use AustenAlike to evaluate character features extracted using two\npipelines, BookNLP and FanfictionNLP. We build character representations from\nfour kinds of features and compare them to the three AustenAlike benchmarks and\nto GPT-4 similarity rankings. We find that though computational representations\ncapture some broad similarities based on shared social and narrative roles, the\nexpert pairings in our third benchmark are challenging for all systems,\nhighlighting the subtler aspects of similarity noted by human readers.\n","authors":["Funing Yang","Carolyn Jane Anderson"],"pdf_url":"https://arxiv.org/pdf/2408.16131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16098v1","updated":"2024-08-28T19:03:41Z","published":"2024-08-28T19:03:41Z","title":"Structured Event Reasoning with Large Language Models","summary":" Reasoning about real-life events is a unifying challenge in AI and NLP that\nhas profound utility in a variety of domains, while fallacy in high-stake\napplications could be catastrophic. Able to work with diverse text in these\ndomains, large language models (LLMs) have proven capable of answering\nquestions and solving problems. 
However, I show that end-to-end LLMs still\nsystematically fail to reason about complex events, and they lack\ninterpretability due to their black-box nature. To address these issues, I\npropose three general approaches to use LLMs in conjunction with a structured\nrepresentation of events. The first is a language-based representation\ninvolving relations of sub-events that can be learned by LLMs via fine-tuning.\nThe second is a semi-symbolic representation involving states of entities that\ncan be predicted and leveraged by LLMs via few-shot prompting. The third is a\nfully symbolic representation that can be predicted by LLMs trained with\nstructured data and be executed by symbolic solvers. On a suite of event\nreasoning tasks spanning common-sense inference and planning, I show that each\napproach greatly outperforms end-to-end LLMs with more interpretability. These\nresults suggest manners of synergy between LLMs and structured representations\nfor event reasoning and beyond.\n","authors":["Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.16098v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2408.16089v1","updated":"2024-08-28T18:43:07Z","published":"2024-08-28T18:43:07Z","title":"Is Personality Prediction Possible Based on Reddit Comments?","summary":" In this assignment, we examine whether there is a correlation between the\npersonality type of a person and the texts they wrote. In order to do this, we\naggregated datasets of Reddit comments labeled with the Myers-Briggs Type\nIndicator (MBTI) of the author and built different supervised classifiers based\non BERT to try to predict the personality of an author given a text. Despite\nexperiencing issues with the unfiltered character of the dataset, we can\nobserve potential in the classification.\n","authors":["Robert Deimann","Till Preidt","Shaptarshi Roy","Jan Stanicki"],"pdf_url":"https://arxiv.org/pdf/2408.16089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16081v1","updated":"2024-08-28T18:25:35Z","published":"2024-08-28T18:25:35Z","title":"Logic-Enhanced Language Model Agents for Trustworthy Social Simulations","summary":" We introduce the Logic-Enhanced Language Model Agents (LELMA) framework, a\nnovel approach to enhance the trustworthiness of social simulations that\nutilize large language models (LLMs). While LLMs have gained attention as\nagents for simulating human behaviour, their applicability in this role is\nlimited by issues such as inherent hallucinations and logical inconsistencies.\nLELMA addresses these challenges by integrating LLMs with symbolic AI, enabling\nlogical verification of the reasoning generated by LLMs. This verification\nprocess provides corrective feedback, refining the reasoning output. The\nframework consists of three main components: an LLM-Reasoner for producing\nstrategic reasoning, an LLM-Translator for mapping natural language reasoning\nto logic queries, and a Solver for evaluating these queries. This study focuses\non decision-making in game-theoretic scenarios as a model of human interaction.\nExperiments involving the Hawk-Dove game, Prisoner's Dilemma, and Stag Hunt\nhighlight the limitations of state-of-the-art LLMs, GPT-4 Omni and Gemini 1.0\nPro, in producing correct reasoning in these contexts. 
LELMA demonstrates high\naccuracy in error detection and improves the reasoning correctness of LLMs via\nself-refinement, particularly in GPT-4 Omni.\n","authors":["Agnieszka Mensfelt","Kostas Stathis","Vince Trencsenyi"],"pdf_url":"https://arxiv.org/pdf/2408.16081v1.pdf","comment":"Source code: https://github.com/dicelab-rhul/LELMA"},{"id":"http://arxiv.org/abs/2408.16073v1","updated":"2024-08-28T18:14:39Z","published":"2024-08-28T18:14:39Z","title":"Using Large Language Models to Create AI Personas for Replication and\n Prediction of Media Effects: An Empirical Test of 133 Published Experimental\n Research Findings","summary":" This report analyzes the potential for large language models (LLMs) to\nexpedite accurate replication of published message effects studies. We tested\nLLM-powered participants (personas) by replicating 133 experimental findings\nfrom 14 papers containing 45 recent studies in the Journal of Marketing\n(January 2023-May 2024). We used a new software tool, Viewpoints AI\n(https://viewpoints.ai/), that takes study designs, stimuli, and measures as\ninput, automatically generates prompts for LLMs to act as a specified sample of\nunique personas, and collects their responses to produce a final output in the\nform of a complete dataset and statistical analysis. The underlying LLM used\nwas Anthropic's Claude Sonnet 3.5. We generated 19,447 AI personas to replicate\nthese studies with the exact same sample attributes, study designs, stimuli,\nand measures reported in the original human research. Our LLM replications\nsuccessfully reproduced 76% of the original main effects (84 out of 111),\ndemonstrating strong potential for AI-assisted replication of studies in which\npeople respond to media stimuli. When including interaction effects, the\noverall replication rate was 68% (90 out of 133). The use of LLMs to replicate\nand accelerate marketing research on media effects is discussed with respect to\nthe replication crisis in social science, potential solutions to\ngeneralizability problems in sampling subjects and experimental conditions, and\nthe ability to rapidly test consumer responses to various media stimuli. We\nalso address the limitations of this approach, particularly in replicating\ncomplex interaction effects in media response studies, and suggest areas for\nfuture research and improvement in AI-assisted experimental replication of\nmedia effects.\n","authors":["Leo Yeykelis","Kaavya Pichai","James J. Cummings","Byron Reeves"],"pdf_url":"https://arxiv.org/pdf/2408.16073v1.pdf","comment":"24 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.14028v2","updated":"2024-08-28T18:06:50Z","published":"2024-08-26T05:38:27Z","title":"SurGen: Text-Guided Diffusion Model for Surgical Video Generation","summary":" Diffusion-based video generation models have made significant strides,\nproducing outputs with improved visual fidelity, temporal coherence, and user\ncontrol. These advancements hold great promise for improving surgical education\nby enabling more realistic, diverse, and interactive simulation environments.\nIn this study, we introduce SurGen, a text-guided diffusion model tailored for\nsurgical video synthesis, producing the highest resolution and longest duration\nvideos among existing surgical video generation models. We validate the visual\nand temporal quality of the outputs using standard image and video generation\nmetrics. 
Additionally, we assess their alignment to the corresponding text\nprompts through a deep learning classifier trained on surgical data. Our\nresults demonstrate the potential of diffusion models to serve as valuable\neducational tools for surgical trainees.\n","authors":["Joseph Cho","Samuel Schmidgall","Cyril Zakka","Mrudang Mathur","Rohan Shad","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2408.14028v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.15998v1","updated":"2024-08-28T17:59:31Z","published":"2024-08-28T17:59:31Z","title":"Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of\n Encoders","summary":" The ability to accurately interpret complex visual information is a crucial\ntopic of multimodal large language models (MLLMs). Recent work indicates that\nenhanced visual perception significantly reduces hallucinations and improves\nperformance on resolution-sensitive tasks, such as optical character\nrecognition and document analysis. A number of recent MLLMs achieve this goal\nusing a mixture of vision encoders. Despite their success, there is a lack of\nsystematic comparisons and detailed ablation studies addressing critical\naspects, such as expert selection and the integration of multiple vision\nexperts. This study provides an extensive exploration of the design space for\nMLLMs using a mixture of vision encoders and resolutions. Our findings reveal\nseveral underlying principles common to various existing strategies, leading to\na streamlined yet effective design approach. We discover that simply\nconcatenating visual tokens from a set of complementary vision encoders is as\neffective as more complex mixing architectures or strategies. We additionally\nintroduce Pre-Alignment to bridge the gap between vision-focused encoders and\nlanguage tokens, enhancing model coherence. The resulting family of MLLMs,\nEagle, surpasses other leading open-source models on major MLLM benchmarks.\nModels and code: https://github.com/NVlabs/Eagle\n","authors":["Min Shi","Fuxiao Liu","Shihao Wang","Shijia Liao","Subhashree Radhakrishnan","De-An Huang","Hongxu Yin","Karan Sapra","Yaser Yacoob","Humphrey Shi","Bryan Catanzaro","Andrew Tao","Jan Kautz","Zhiding Yu","Guilin Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15998v1.pdf","comment":"Github: https://github.com/NVlabs/Eagle, HuggingFace:\n https://huggingface.co/NVEagle"},{"id":"http://arxiv.org/abs/2408.15996v1","updated":"2024-08-28T17:59:05Z","published":"2024-08-28T17:59:05Z","title":"Spatio-Temporal Context Prompting for Zero-Shot Action Detection","summary":" Spatio-temporal action detection encompasses the tasks of localizing and\nclassifying individual actions within a video. Recent works aim to enhance this\nprocess by incorporating interaction modeling, which captures the relationship\nbetween people and their surrounding context. However, these approaches have\nprimarily focused on fully-supervised learning, and the current limitation lies\nin the lack of generalization capability to recognize unseen action categories.\nIn this paper, we aim to adapt the pretrained image-language models to detect\nunseen actions. To this end, we propose a method which can effectively leverage\nthe rich knowledge of visual-language models to perform Person-Context\nInteraction. Meanwhile, our Context Prompting module will utilize contextual\ninformation to prompt labels, thereby enhancing the generation of more\nrepresentative text features. 
Moreover, to address the challenge of recognizing\ndistinct actions by multiple people at the same timestamp, we design the\nInterest Token Spotting mechanism which employs pretrained visual knowledge to\nfind each person's interest context tokens, and then these tokens will be used\nfor prompting to generate text features tailored to each individual. To\nevaluate the ability to detect unseen actions, we propose a comprehensive\nbenchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our\nmethod achieves superior results compared to previous approaches and can be\nfurther extended to multi-action videos, bringing it closer to real-world\napplications. The code and data can be found in\nhttps://webber2933.github.io/ST-CLIP-project-page.\n","authors":["Wei-Jhe Huang","Min-Hung Chen","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.15996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15995v1","updated":"2024-08-28T17:59:02Z","published":"2024-08-28T17:59:02Z","title":"TEDRA: Text-based Editing of Dynamic and Photoreal Actors","summary":" Over the past years, significant progress has been made in creating\nphotorealistic and drivable 3D avatars solely from videos of real humans.\nHowever, a core remaining challenge is the fine-grained and user-friendly\nediting of clothing styles by means of textual descriptions. To this end, we\npresent TEDRA, the first method allowing text-based edits of an avatar, which\nmaintains the avatar's high fidelity, space-time coherency, as well as\ndynamics, and enables skeletal pose and view control. We begin by training a\nmodel to create a controllable and high-fidelity digital replica of the real\nactor. Next, we personalize a pretrained generative diffusion model by\nfine-tuning it on various frames of the real character captured from different\ncamera angles, ensuring the digital representation faithfully captures the\ndynamics and movements of the real person. This two-stage process lays the\nfoundation for our approach to dynamic human avatar editing. Utilizing this\npersonalized diffusion model, we modify the dynamic avatar based on a provided\ntext prompt using our Personalized Normal Aligned Score Distillation Sampling\n(PNA-SDS) within a model-based guidance framework. Additionally, we propose a\ntime step annealing strategy to ensure high-quality edits. Our results\ndemonstrate a clear improvement over prior work in functionality and visual\nquality.\n","authors":["Basavaraj Sunagad","Heming Zhu","Mohit Mendiratta","Adam Kortylewski","Christian Theobalt","Marc Habermann"],"pdf_url":"https://arxiv.org/pdf/2408.15995v1.pdf","comment":"For project page, see this https://vcai.mpi-inf.mpg.de/projects/Tedra"},{"id":"http://arxiv.org/abs/2408.15994v1","updated":"2024-08-28T17:58:54Z","published":"2024-08-28T17:58:54Z","title":"Perceive-IR: Learning to Perceive Degradation Better for All-in-One\n Image Restoration","summary":" The limitations of task-specific and general image restoration methods for\nspecific degradation have prompted the development of all-in-one image\nrestoration techniques. However, the diversity of patterns among multiple\ndegradation, along with the significant uncertainties in mapping between\ndegraded images of different severities and their corresponding undistorted\nversions, pose significant challenges to the all-in-one restoration tasks. 
To\naddress these challenges, we propose Perceive-IR, an all-in-one image restorer\ndesigned to achieve fine-grained quality control that enables restored images\nto more closely resemble their undistorted counterparts, regardless of the type\nor severity of degradation. Specifically, Perceive-IR contains two stages: (1)\nprompt learning stage and (2) restoration stage. In the prompt learning stage,\nwe leverage prompt learning to acquire a fine-grained quality perceiver capable\nof distinguishing three-tier quality levels by constraining the prompt-image\nsimilarity in the CLIP perception space. Subsequently, this quality perceiver\nand difficulty-adaptive perceptual loss are integrated as a quality-aware\nlearning strategy to realize fine-grained quality control in the restoration stage.\nFor the restoration stage, a semantic guidance module (SGM) and compact feature\nextraction (CFE) are proposed to further promote the restoration process by\nutilizing the robust semantic information from the pre-trained large scale\nvision models and distinguishing degradation-specific features. Extensive\nexperiments demonstrate that our Perceive-IR outperforms state-of-the-art\nmethods in all-in-one image restoration tasks and exhibits superior\ngeneralization ability when dealing with unseen tasks.\n","authors":["Xu Zhang","Jiaqi Ma","Guoli Wang","Qian Zhang","Huan Zhang","Lefei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15994v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.15993v1","updated":"2024-08-28T17:58:53Z","published":"2024-08-28T17:58:53Z","title":"ClimDetect: A Benchmark Dataset for Climate Change Detection and\n Attribution","summary":" Detecting and attributing temperature increases due to climate change is\ncrucial for understanding global warming and guiding adaptation strategies. The\ncomplexity of distinguishing human-induced climate signals from natural\nvariability has challenged traditional detection and attribution (D&A)\napproaches, which seek to identify specific \"fingerprints\" in climate response\nvariables. Deep learning offers potential for discerning these complex patterns\nin expansive spatial datasets. However, lack of standard protocols has hindered\nconsistent comparisons across studies. We introduce ClimDetect, a standardized\ndataset of over 816k daily climate snapshots, designed to enhance model\naccuracy in identifying climate change signals. ClimDetect integrates various\ninput and target variables used in past research, ensuring comparability and\nconsistency. We also explore the application of vision transformers (ViT) to\nclimate data, a novel and modernizing approach in this context. Our open-access\ndata and code serve as a benchmark for advancing climate science through\nimproved model evaluations. ClimDetect is publicly accessible via Huggingface\ndataset repository at: https://huggingface.co/datasets/ClimDetect/ClimDetect.\n","authors":["Sungduk Yu","Brian L. White","Anahita Bhiwandiwalla","Musashi Hinck","Matthew Lyle Olson","Tung Nguyen","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2408.15993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15992v1","updated":"2024-08-28T17:58:39Z","published":"2024-08-28T17:58:39Z","title":"CoGen: Learning from Feedback with Coupled Comprehension and Generation","summary":" Systems with both language comprehension and generation capabilities can\nbenefit from the tight connection between the two. 
This work studies coupling\ncomprehension and generation with focus on continually learning from\ninteraction with users. We propose techniques to tightly integrate the two\ncapabilities for both learning and inference. We situate our studies in\ntwo-player reference games, and deploy various models for thousands of\ninteractions with human users, while learning from interaction feedback\nsignals. We show dramatic improvements in performance over time, with\ncomprehension-generation coupling leading to performance improvements up to 26%\nin absolute terms and up to 17% higher accuracies compared to a non-coupled\nsystem. Our analysis also shows coupling has substantial qualitative impact on\nthe system's language, making it significantly more human-like.\n","authors":["Mustafa Omer Gul","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2408.15992v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15991v1","updated":"2024-08-28T17:58:17Z","published":"2024-08-28T17:58:17Z","title":"Distribution Backtracking Builds A Faster Convergence Trajectory for\n One-step Diffusion Distillation","summary":" Accelerating the sampling speed of diffusion models remains a significant\nchallenge. Recent score distillation methods distill a heavy teacher model into\na one-step student generator, which is optimized by calculating the difference\nbetween the two score functions on the samples generated by the student model.\nHowever, there is a score mismatch issue in the early stage of the distillation\nprocess, because existing methods mainly focus on using the endpoint of\npre-trained diffusion models as teacher models, overlooking the importance of\nthe convergence trajectory between the student generator and the teacher model.\nTo address this issue, we extend the score distillation process by introducing\nthe entire convergence trajectory of teacher models and propose Distribution\nBacktracking Distillation (DisBack) for distilling student generators. DisBack\nis composed of two stages: Degradation Recording and Distribution Backtracking.\nDegradation Recording is designed to obtain the convergence trajectory of\nteacher models, which records the degradation path from the trained teacher\nmodel to the untrained initial student generator. The degradation path\nimplicitly represents the intermediate distributions of teacher models. Then\nDistribution Backtracking trains a student generator to backtrack the\nintermediate distributions for approximating the convergence trajectory of\nteacher models. Extensive experiments show that DisBack achieves faster and\nbetter convergence than the existing distillation method and accomplishes\ncomparable generation performance. Notably, DisBack is easy to implement and\ncan be generalized to existing distillation methods to boost performance. Our\ncode is publicly available on https://github.com/SYZhang0805/DisBack.\n","authors":["Shengyuan Zhang","Ling Yang","Zejian Li","An Zhao","Chenye Meng","Changyuan Yang","Guang Yang","Zhiyuan Yang","Lingyun Sun"],"pdf_url":"https://arxiv.org/pdf/2408.15991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15966v1","updated":"2024-08-28T17:38:44Z","published":"2024-08-28T17:38:44Z","title":"More Text, Less Point: Towards 3D Data-Efficient Point-Language\n Understanding","summary":" Enabling Large Language Models (LLMs) to comprehend the 3D physical world\nremains a significant challenge. 
Due to the lack of large-scale 3D-text pair\ndatasets, the success of LLMs has yet to be replicated in 3D understanding. In\nthis paper, we rethink this issue and propose a new task: 3D Data-Efficient\nPoint-Language Understanding. The goal is to enable LLMs to achieve robust 3D\nobject understanding with minimal 3D point cloud and text data pairs. To\naddress this task, we introduce GreenPLM, which leverages more text data to\ncompensate for the lack of 3D data. First, inspired by using CLIP to align\nimages and text, we utilize a pre-trained point cloud-text encoder to map the\n3D point cloud space to the text space. This mapping leaves us to seamlessly\nconnect the text space with LLMs. Once the point-text-LLM connection is\nestablished, we further enhance text-LLM alignment by expanding the\nintermediate text space, thereby reducing the reliance on 3D point cloud data.\nSpecifically, we generate 6M free-text descriptions of 3D objects, and design a\nthree-stage training strategy to help LLMs better explore the intrinsic\nconnections between different modalities. To achieve efficient modality\nalignment, we design a zero-parameter cross-attention module for token pooling.\nExtensive experimental results show that GreenPLM requires only 12% of the 3D\ntraining data used by existing state-of-the-art models to achieve superior 3D\nunderstanding. Remarkably, GreenPLM also achieves competitive performance using\ntext-only data. The code and weights are available at:\nhttps://github.com/TangYuan96/GreenPLM.\n","authors":["Yuan Tang","Xu Han","Xianzhi Li","Qiao Yu","Jinfeng Xu","Yixue Hao","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15958v1","updated":"2024-08-28T17:20:56Z","published":"2024-08-28T17:20:56Z","title":"Efficient Slice Anomaly Detection Network for 3D Brain MRI Volume","summary":" Current anomaly detection methods excel with benchmark industrial data but\nstruggle with natural images and medical data due to varying definitions of\n'normal' and 'abnormal.' This makes accurate identification of deviations in\nthese fields particularly challenging. Especially for 3D brain MRI data, all\nthe state-of-the-art models are reconstruction-based with 3D convolutional\nneural networks which are memory-intensive, time-consuming and producing noisy\noutputs that require further post-processing. We propose a framework called\nSimple Slice-based Network (SimpleSliceNet), which utilizes a model pre-trained\non ImageNet and fine-tuned on a separate MRI dataset as a 2D slice feature\nextractor to reduce computational cost. We aggregate the extracted features to\nperform anomaly detection tasks on 3D brain MRI volumes. Our model integrates a\nconditional normalizing flow to calculate log likelihood of features and\nemploys the Semi-Push-Pull Mechanism to enhance anomaly detection accuracy. The\nresults indicate improved performance, showcasing our model's remarkable\nadaptability and effectiveness when addressing the challenges exists in brain\nMRI data. In addition, for the large-scale 3D brain volumes, our model\nSimpleSliceNet outperforms the state-of-the-art 2D and 3D models in terms of\naccuracy, memory usage and time consumption. 
Code is available at:\nhttps://anonymous.4open.science/r/SimpleSliceNet-8EA3.\n","authors":["Zeduo Zhang","Yalda Mohsenzadeh"],"pdf_url":"https://arxiv.org/pdf/2408.15958v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.13818v2","updated":"2024-08-28T17:19:34Z","published":"2024-08-25T12:22:50Z","title":"HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images\n Using Deep Learning","summary":" The current standard for detecting human epidermal growth factor receptor 2\n(HER2) status in breast cancer patients relies on HER2 amplification,\nidentified through fluorescence in situ hybridization (FISH) or\nimmunohistochemistry (IHC). However, hematoxylin and eosin (H\\&E) tumor stains\nare more widely available, and accurately predicting HER2 status using H\\&E\ncould reduce costs and expedite treatment selection. Deep Learning algorithms\nfor H&E have shown effectiveness in predicting various cancer features and\nclinical outcomes, including moderate success in HER2 status prediction. In\nthis work, we employed a customized weak supervision classification technique\ncombined with MoCo-v2 contrastive learning to predict HER2 status. We trained\nour pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The\nCancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale\nSchool of Medicine are publicly available. Our pipeline achieved an Area Under\nthe Curve (AUC) of 0.85 across four different test folds. Additionally, we\ntested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2\nscore of 2+ and included corresponding HER2 status and FISH test results. These\ncases are considered equivocal for IHC, requiring an expensive FISH test on\ntheir IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81\non these challenging H&E slides. Reducing the need for FISH test can have\nsignificant implications in cancer treatment equity for underserved\npopulations.\n","authors":["Ardhendu Sekhar","Vrinda Goel","Garima Jain","Abhijeet Patil","Ravi Kant Gupta","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2408.13818v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15956v1","updated":"2024-08-28T17:17:20Z","published":"2024-08-28T17:17:20Z","title":"Generating Binary Species Range Maps","summary":" Accurately predicting the geographic ranges of species is crucial for\nassisting conservation efforts. Traditionally, range maps were manually created\nby experts. However, species distribution models (SDMs) and, more recently,\ndeep learning-based variants offer a potential automated alternative. Deep\nlearning-based SDMs generate a continuous probability representing the\npredicted presence of a species at a given location, which must be binarized by\nsetting per-species thresholds to obtain binary range maps. However, selecting\nappropriate per-species thresholds to binarize these predictions is non-trivial\nas different species can require distinct thresholds. In this work, we evaluate\ndifferent approaches for automatically identifying the best thresholds for\nbinarizing range maps using presence-only data. This includes approaches that\nrequire the generation of additional pseudo-absence data, along with ones that\nonly require presence data. We also propose an extension of an existing\npresence-only technique that is more robust to outliers. 
We perform a detailed\nevaluation of different thresholding techniques on the tasks of binary range\nestimation and large-scale fine-grained visual classification, and we\ndemonstrate improved performance over existing pseudo-absence free approaches\nusing our method.\n","authors":["Filip Dorm","Christian Lange","Scott Loarie","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2408.15956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15955v1","updated":"2024-08-28T17:14:51Z","published":"2024-08-28T17:14:51Z","title":"Fall Detection for Smart Living using YOLOv5","summary":" This work introduces a fall detection system using the YOLOv5mu model, which\nachieved a mean average precision (mAP) of 0.995, demonstrating exceptional\naccuracy in identifying fall events within smart home environments. Enhanced by\nadvanced data augmentation techniques, the model demonstrates significant\nrobustness and adaptability across various conditions. The integration of\nYOLOv5mu offers precise, real-time fall detection, which is crucial for\nimproving safety and emergency response for residents. Future research will\nfocus on refining the system by incorporating contextual data and exploring\nmulti-sensor approaches to enhance its performance and practical applicability\nin diverse environments.\n","authors":["Gracile Astlin Pereira"],"pdf_url":"https://arxiv.org/pdf/2408.15955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15954v1","updated":"2024-08-28T17:14:21Z","published":"2024-08-28T17:14:21Z","title":"InstanSeg: an embedding-based instance segmentation algorithm optimized\n for accurate, efficient and portable cell segmentation","summary":" Cell and nucleus segmentation are fundamental tasks for quantitative bioimage\nanalysis. Despite progress in recent years, biologists and other domain experts\nstill require novel algorithms to handle increasingly large and complex\nreal-world datasets. These algorithms must not only achieve state-of-the-art\naccuracy, but also be optimized for efficiency, portability and\nuser-friendliness. Here, we introduce InstanSeg: a novel embedding-based\ninstance segmentation pipeline designed to identify cells and nuclei in\nmicroscopy images. Using six public cell segmentation datasets, we demonstrate\nthat InstanSeg can significantly improve accuracy when compared to the most\nwidely used alternative methods, while reducing the processing time by at least\n60%. Furthermore, InstanSeg is designed to be fully serializable as TorchScript\nand supports GPU acceleration on a range of hardware. We provide an open-source\nimplementation of InstanSeg in Python, in addition to a user-friendly,\ninteractive QuPath extension for inference written in Java. Our code and\npre-trained models are available at https://github.com/instanseg/instanseg .\n","authors":["Thibaut Goldsborough","Ben Philps","Alan O'Callaghan","Fiona Inglis","Leo Leplat","Andrew Filby","Hakan Bilen","Peter Bankhead"],"pdf_url":"https://arxiv.org/pdf/2408.15954v1.pdf","comment":"12 pages,6 figures"},{"id":"http://arxiv.org/abs/2408.15947v1","updated":"2024-08-28T17:05:38Z","published":"2024-08-28T17:05:38Z","title":"Auxiliary Input in Training: Incorporating Catheter Features into Deep\n Learning Models for ECG-Free Dynamic Coronary Roadmapping","summary":" Dynamic coronary roadmapping is a technology that overlays the vessel maps\n(the \"roadmap\") extracted from an offline image sequence of X-ray angiography\nonto a live stream of X-ray fluoroscopy in real-time. 
It aims to offer\nnavigational guidance for interventional surgeries without the need for\nrepeated contrast agent injections, thereby reducing the risks associated with\nradiation exposure and kidney failure. The precision of the roadmaps is\ncontingent upon the accurate alignment of angiographic and fluoroscopic images\nbased on their cardiac phases, as well as precise catheter tip tracking. The\nformer ensures the selection of a roadmap that closely matches the vessel shape\nin the current frame, while the latter uses catheter tips as reference points\nto adjust for translational motion between the roadmap and the present vessel\ntree. Training deep learning models for both tasks is challenging and\nunderexplored. However, incorporating catheter features into the models could\noffer substantial benefits, given humans heavily rely on catheters to complete\nthe tasks. To this end, we introduce a simple but effective method, auxiliary\ninput in training (AIT), and demonstrate that it enhances model performance\nacross both tasks, outperforming baseline methods in knowledge incorporation\nand transfer learning.\n","authors":["Yikang Liu","Lin Zhao","Eric Z. Chen","Xiao Chen","Terrence Chen","Shanhui Sun"],"pdf_url":"https://arxiv.org/pdf/2408.15947v1.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.15946v1","updated":"2024-08-28T17:04:56Z","published":"2024-08-28T17:04:56Z","title":"Sigma Flows for Image and Data Labeling and Learning Structured\n Prediction","summary":" This paper introduces the sigma flow model for the prediction of structured\nlabelings of data observed on Riemannian manifolds, including Euclidean image\ndomains as special case. The approach combines the Laplace-Beltrami framework\nfor image denoising and enhancement, introduced by Sochen, Kimmel and Malladi\nabout 25 years ago, and the assignment flow approach introduced and studied by\nthe authors.\n The sigma flow arises as Riemannian gradient flow of generalized harmonic\nenergies and thus is governed by a nonlinear geometric PDE which determines a\nharmonic map from a closed Riemannian domain manifold to a statistical\nmanifold, equipped with the Fisher-Rao metric from information geometry. A\nspecific ingredient of the sigma flow is the mutual dependency of the\nRiemannian metric of the domain manifold on the evolving state. This makes the\napproach amenable to machine learning in a specific way, by realizing this\ndependency through a mapping with compact time-variant parametrization that can\nbe learned from data. Proof of concept experiments demonstrate the expressivity\nof the sigma flow model and prediction performance.\n Structural similarities to transformer network architectures and networks\ngenerated by the geometric integration of sigma flows are pointed out, which\nhighlights the connection to deep learning and, conversely, may stimulate the\nuse of geometric design principles for structured prediction in other areas of\nscientific machine learning.\n","authors":["Jonas Cassel","Bastian Boll","Stefania Petra","Peter Albers","Christoph Schnörr"],"pdf_url":"https://arxiv.org/pdf/2408.15946v1.pdf","comment":"51 pages"},{"id":"http://arxiv.org/abs/2305.12437v4","updated":"2024-08-28T16:56:02Z","published":"2023-05-21T11:51:09Z","title":"SCP: Soft Conditional Prompt Learning for Aerial Video Action\n Recognition","summary":" We present a new learning approach, Soft Conditional Prompt Learning (SCP),\nwhich leverages the strengths of prompt learning for aerial video action\nrecognition. 
Our approach is designed to predict the action of each agent by\nhelping the models focus on the descriptions or instructions associated with\nactions in the input videos for aerial/robot visual perception. Our formulation\nsupports various prompts, including learnable prompts, auxiliary visual\ninformation, and large vision models to improve the recognition performance. We\npresent a soft conditional prompt method that learns to dynamically generate\nprompts from a pool of prompt experts under different video inputs. By sharing\nthe same objective with the task, our proposed SCP can optimize prompts that\nguide the model's predictions while explicitly learning input-invariant (prompt\nexperts pool) and input-specific (data-dependent) prompt knowledge. In\npractice, we observe a 3.17-10.2% accuracy improvement on the aerial video\ndatasets (Okutama, NECDrone), which consist of scenes with single-agent and\nmulti-agent actions. We further evaluate our approach on ground camera videos\nto verify the effectiveness and generalization and achieve a 1.0-3.6%\nimprovement on dataset SSV2. We integrate our method into the ROS2 as well.\n","authors":["Xijun Wang","Ruiqi Xian","Tianrui Guan","Fuxiao Liu","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2305.12437v4.pdf","comment":"IROS2024"},{"id":"http://arxiv.org/abs/2402.09786v4","updated":"2024-08-28T16:48:06Z","published":"2024-02-15T08:34:21Z","title":"Examining Pathological Bias in a Generative Adversarial Network\n Discriminator: A Case Study on a StyleGAN3 Model","summary":" Generative adversarial networks (GANs) generate photorealistic faces that are\noften indistinguishable by humans from real faces. While biases in machine\nlearning models are often assumed to be due to biases in training data, we find\npathological internal color and luminance biases in the discriminator of a\npre-trained StyleGAN3-r model that are not explicable by the training data. We\nalso find that the discriminator systematically stratifies scores by both\nimage- and face-level qualities and that this disproportionately affects images\nacross gender, race, and other categories. We examine axes common in research\non stereotyping in social psychology.\n","authors":["Alvin Grissom II","Ryan F. Lei","Matt Gusdorff","Jeova Farias Sales Rocha Neto","Bailey Lin","Ryan Trotter"],"pdf_url":"https://arxiv.org/pdf/2402.09786v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15924v1","updated":"2024-08-28T16:36:23Z","published":"2024-08-28T16:36:23Z","title":"Local Descriptors Weighted Adaptive Threshold Filtering For Few-Shot\n Learning","summary":" Few-shot image classification is a challenging task in the field of machine\nlearning, involving the identification of new categories using a limited number\nof labeled samples. In recent years, methods based on local descriptors have\nmade significant progress in this area. However, the key to improving\nclassification accuracy lies in effectively filtering background noise and\naccurately selecting critical local descriptors highly relevant to image\ncategory information.\n To address this challenge, we propose an innovative weighted adaptive\nthreshold filtering (WATF) strategy for local descriptors. This strategy can\ndynamically adjust based on the current task and image context, thereby\nselecting local descriptors most relevant to the image category. 
This enables\nthe model to better focus on category-related information while effectively\nmitigating interference from irrelevant background regions.\n To evaluate the effectiveness of our method, we adopted the N-way K-shot\nexperimental framework. Experimental results show that our method not only\nimproves the clustering effect of selected local descriptors but also\nsignificantly enhances the discriminative ability between image categories.\nNotably, our method maintains a simple and lightweight design philosophy\nwithout introducing additional learnable parameters. This feature ensures\nconsistency in filtering capability during both training and testing phases,\nfurther enhancing the reliability and practicality of the method.\n","authors":["Bingchen Yan"],"pdf_url":"https://arxiv.org/pdf/2408.15924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15922v1","updated":"2024-08-28T16:36:09Z","published":"2024-08-28T16:36:09Z","title":"DiffAge3D: Diffusion-based 3D-aware Face Aging","summary":" Face aging is the process of converting an individual's appearance to a\nyounger or older version of themselves. Existing face aging techniques have\nbeen limited to 2D settings, which often weaken their applications as there is\na growing demand for 3D face modeling. Moreover, existing aging methods\nstruggle to perform faithful aging, maintain identity, and retain the fine\ndetails of the input images. Given these limitations and the need for a\n3D-aware aging method, we propose DiffAge3D, the first 3D-aware aging framework\nthat not only performs faithful aging and identity preservation but also\noperates in a 3D setting. Our aging framework allows to model the aging and\ncamera pose separately by only taking a single image with a target age. Our\nframework includes a robust 3D-aware aging dataset generation pipeline by\nutilizing a pre-trained 3D GAN and the rich text embedding capabilities within\nCLIP model. Notably, we do not employ any inversion bottleneck in dataset\ngeneration. Instead, we randomly generate training samples from the latent\nspace of 3D GAN, allowing us to manipulate the rich latent space of GAN to\ngenerate ages even with large gaps. With the generated dataset, we train a\nviewpoint-aware diffusion-based aging model to control the camera pose and\nfacial age. Through quantitative and qualitative evaluations, we demonstrate\nthat DiffAge3D outperforms existing methods, particularly in\nmultiview-consistent aging and fine details preservation.\n","authors":["Junaid Wahid","Fangneng Zhan","Pramod Rao","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2408.15922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15915v1","updated":"2024-08-28T16:28:07Z","published":"2024-08-28T16:28:07Z","title":"Leveraging Open Knowledge for Advancing Task Expertise in Large Language\n Models","summary":" The cultivation of expertise for large language models (LLMs) to solve tasks\nof specific areas often requires special-purpose tuning with calibrated\nbehaviors on the expected stable outputs. To avoid huge cost brought by manual\npreparation of instruction datasets and training resources up to hundreds of\nhours, the exploitation of open knowledge including a wealth of low rank\nadaptation (LoRA) models and instruction datasets serves as a good starting\npoint. However, existing methods on model and data selection focus on the\nperformance of general-purpose capabilities while neglecting the knowledge gap\nexposed in domain-specific deployment. 
In the present study, we propose to\nbridge such gap by introducing few human-annotated samples (i.e., K-shot) for\nadvancing task expertise of LLMs with open knowledge. Specifically, we develop\nan efficient and scalable pipeline to cost-efficiently produce task experts\nwhere K-shot data intervene in selecting the most promising expert candidates\nand the task-relevant instructions. A mixture-of-expert (MoE) system is built\nto make the best use of individual-yet-complementary knowledge between multiple\nexperts. We unveil the two keys to the success of a MoE system, 1) the abidance\nby K-shot, and 2) the insistence on diversity. For the former, we ensure that\nmodels that truly possess problem-solving abilities on K-shot are selected\nrather than those blind guessers. Besides, during data selection, instructions\nthat share task-relevant contexts with K-shot are prioritized. For the latter,\nwe highlight the diversity of constituting experts and that of the fine-tuning\ninstructions throughout the model and data selection process. Extensive\nexperimental results confirm the superiority of our approach over existing\nmethods on utilization of open knowledge across various tasks. Codes and models\nwill be released later.\n","authors":["Yuncheng Yang","Yulei Qin","Tong Wu","Zihan Xu","Gang Li","Pengcheng Guo","Hang Shao","Yucheng Shi","Ke Li","Xing Sun","Jie Yang","Yun Gu"],"pdf_url":"https://arxiv.org/pdf/2408.15915v1.pdf","comment":"28 pages, 12 tables, 10 figures"},{"id":"http://arxiv.org/abs/2408.15914v1","updated":"2024-08-28T16:27:58Z","published":"2024-08-28T16:27:58Z","title":"CoRe: Context-Regularized Text Embedding Learning for Text-to-Image\n Personalization","summary":" Recent advances in text-to-image personalization have enabled high-quality\nand controllable image synthesis for user-provided concepts. However, existing\nmethods still struggle to balance identity preservation with text alignment.\nOur approach is based on the fact that generating prompt-aligned images\nrequires a precise semantic understanding of the prompt, which involves\naccurately processing the interactions between the new concept and its\nsurrounding context tokens within the CLIP text encoder. To address this, we\naim to embed the new concept properly into the input embedding space of the\ntext encoder, allowing for seamless integration with existing tokens. We\nintroduce Context Regularization (CoRe), which enhances the learning of the new\nconcept's text embedding by regularizing its context tokens in the prompt. This\nis based on the insight that appropriate output vectors of the text encoder for\nthe context tokens can only be achieved if the new concept's text embedding is\ncorrectly learned. CoRe can be applied to arbitrary prompts without requiring\nthe generation of corresponding images, thus improving the generalization of\nthe learned text embedding. Additionally, CoRe can serve as a test-time\noptimization technique to further enhance the generations for specific prompts.\nComprehensive experiments demonstrate that our method outperforms several\nbaseline methods in both identity preservation and text alignment. 
Code will be\nmade publicly available.\n","authors":["Feize Wu","Yun Pang","Junyi Zhang","Lianyu Pang","Jian Yin","Baoquan Zhao","Qing Li","Xudong Mao"],"pdf_url":"https://arxiv.org/pdf/2408.15914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01090v3","updated":"2024-08-28T16:23:19Z","published":"2023-11-02T08:55:11Z","title":"Infusion: internal diffusion for inpainting of dynamic textures and\n complex motion","summary":" Video inpainting is the task of filling a region in a video in a visually\nconvincing manner. It is very challenging due to the high dimensionality of the\ndata and the temporal consistency required for obtaining convincing results.\nRecently, diffusion models have shown impressive results in modeling complex\ndata distributions, including images and videos. Such models remain nonetheless\nvery expensive to train and to perform inference with, which strongly reduce\ntheir applicability to videos, and yields unreasonable computational loads. We\nshow that in the case of video inpainting, thanks to the highly auto-similar\nnature of videos, the training data of a diffusion model can be restricted to\nthe input video and still produce very satisfying results. This leads us to\nadopt an internal learning approach, which also allows us to greatly reduce the\nneural network size by about three orders of magnitude less than current\ndiffusion models used for image inpainting. We also introduce a new method for\nefficient training and inference of diffusion models in the context of internal\nlearning, by splitting the diffusion process into different learning intervals\ncorresponding to different noise levels of the diffusion process. To the best\nof our knowledge, this is the first video inpainting method based purely on\ndiffusion. Other methods require additional components such as optical flow\nestimation, which limits their performance in the case of dynamic textures and\ncomplex motions. We show qualitative and quantitative results, demonstrating\nthat our method reaches state of the art performance in the case of dynamic\ntextures and complex dynamic backgrounds.\n","authors":["Nicolas Cherel","Andrés Almansa","Yann Gousseau","Alasdair Newson"],"pdf_url":"https://arxiv.org/pdf/2311.01090v3.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.15899v1","updated":"2024-08-28T16:12:28Z","published":"2024-08-28T16:12:28Z","title":"Gen-Swarms: Adapting Deep Generative Models to Swarms of Drones","summary":" Gen-Swarms is an innovative method that leverages and combines the\ncapabilities of deep generative models with reactive navigation algorithms to\nautomate the creation of drone shows. Advancements in deep generative models,\nparticularly diffusion models, have demonstrated remarkable effectiveness in\ngenerating high-quality 2D images. Building on this success, various works have\nextended diffusion models to 3D point cloud generation. In contrast,\nalternative generative models such as flow matching have been proposed,\noffering a simple and intuitive transition from noise to meaningful outputs.\nHowever, the application of flow matching models to 3D point cloud generation\nremains largely unexplored. Gen-Swarms adapts these models to automatically\ngenerate drone shows. Existing 3D point cloud generative models create point\ntrajectories which are impractical for drone swarms. 
In contrast, our method\nnot only generates accurate 3D shapes but also guides the swarm motion,\nproducing smooth trajectories and accounting for potential collisions through a\nreactive navigation algorithm incorporated into the sampling process. For\nexample, when given a text category like Airplane, Gen-Swarms can rapidly and\ncontinuously generate numerous variations of 3D airplane shapes. Our\nexperiments demonstrate that this approach is particularly well-suited for\ndrone shows, providing feasible trajectories, creating representative final\nshapes, and significantly enhancing the overall performance of drone show\ngeneration.\n","authors":["Carlos Plou","Pablo Pueyo","Ruben Martinez-Cantin","Mac Schwager","Ana C. Murillo","Eduardo Montijano"],"pdf_url":"https://arxiv.org/pdf/2408.15899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15890v1","updated":"2024-08-28T16:03:18Z","published":"2024-08-28T16:03:18Z","title":"Disentangled Diffusion Autoencoder for Harmonization of Multi-site\n Neuroimaging Data","summary":" Combining neuroimaging datasets from multiple sites and scanners can help\nincrease statistical power and thus provide greater insight into subtle\nneuroanatomical effects. However, site-specific effects pose a challenge by\npotentially obscuring the biological signal and introducing unwanted variance.\nExisting harmonization techniques, which use statistical models to remove such\neffects, have been shown to incompletely remove site effects while also failing\nto preserve biological variability. More recently, generative models using GANs\nor autoencoder-based approaches, have been proposed for site adjustment.\nHowever, such methods are known for instability during training or blurry image\ngeneration. In recent years, diffusion models have become increasingly popular\nfor their ability to generate high-quality synthetic images. In this work, we\nintroduce the disentangled diffusion autoencoder (DDAE), a novel diffusion\nmodel designed for controlling specific aspects of an image. We apply the DDAE\nto the task of harmonizing MR images by generating high-quality site-adjusted\nimages that preserve biological variability. We use data from 7 different sites\nand demonstrate the DDAE's superiority in generating high-resolution,\nharmonized 2D MR images over previous approaches. As far as we are aware, this\nwork marks the first diffusion-based model for site adjustment of neuroimaging\ndata.\n","authors":["Ayodeji Ijishakin","Ana Lawry Aguila","Elizabeth Levitis","Ahmed Abdulaal","Andre Altmann","James Cole"],"pdf_url":"https://arxiv.org/pdf/2408.15890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15887v1","updated":"2024-08-28T15:59:40Z","published":"2024-08-28T15:59:40Z","title":"SpineMamba: Enhancing 3D Spinal Segmentation in Clinical Imaging through\n Residual Visual Mamba Layers and Shape Priors","summary":" Accurate segmentation of 3D clinical medical images is critical in the\ndiagnosis and treatment of spinal diseases. However, the inherent complexity of\nspinal anatomy and uncertainty inherent in current imaging technologies, poses\nsignificant challenges for semantic segmentation of spinal images. 
Although\nconvolutional neural networks (CNNs) and Transformer-based models have made\nsome progress in spinal segmentation, their limitations in handling long-range\ndependencies hinder further improvements in segmentation accuracy. To address\nthese challenges, we introduce a residual visual Mamba layer to effectively\ncapture and model the deep semantic features and long-range spatial\ndependencies of 3D spinal data. To further enhance the structural semantic\nunderstanding of the vertebrae, we also propose a novel spinal shape prior\nmodule that captures specific anatomical information of the spine from medical\nimages, significantly enhancing the model's ability to extract structural\nsemantic information of the vertebrae. Comparative and ablation experiments on\ntwo datasets demonstrate that SpineMamba outperforms existing state-of-the-art\nmodels. On the CT dataset, the average Dice similarity coefficient for\nsegmentation reaches as high as 94.40, while on the MR dataset, it reaches\n86.95. Notably, compared to the renowned nnU-Net, SpineMamba achieves superior\nsegmentation performance, exceeding it by up to 2 percentage points. This\nunderscores its accuracy, robustness, and excellent generalization\ncapabilities.\n","authors":["Zhiqing Zhang","Tianyong Liu","Guojia Fan","Bin Li","Qianjin Feng","Shoujun Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.15887v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.15881v1","updated":"2024-08-28T15:52:23Z","published":"2024-08-28T15:52:23Z","title":"LLaVA-MoD: Making LLaVA Tiny via MoE Knowledge Distillation","summary":" We introduce LLaVA-MoD, a novel framework designed to enable the efficient\ntraining of small-scale Multimodal Language Models (s-MLLM) by distilling\nknowledge from large-scale MLLM (l-MLLM). Our approach tackles two fundamental\nchallenges in MLLM distillation. First, we optimize the network structure of\ns-MLLM by integrating a sparse Mixture of Experts (MoE) architecture into the\nlanguage model, striking a balance between computational efficiency and model\nexpressiveness. Second, we propose a progressive knowledge transfer strategy to\nensure comprehensive knowledge migration. This strategy begins with mimic\ndistillation, where we minimize the Kullback-Leibler (KL) divergence between\noutput distributions to enable the student model to emulate the teacher\nnetwork's understanding. Following this, we introduce preference distillation\nvia Direct Preference Optimization (DPO), where the key lies in treating l-MLLM\nas the reference model. During this phase, the s-MLLM's ability to discriminate\nbetween superior and inferior examples is significantly enhanced beyond l-MLLM,\nleading to a better student that surpasses its teacher, particularly in\nhallucination benchmarks. Extensive experiments demonstrate that LLaVA-MoD\noutperforms existing models across various multimodal benchmarks while\nmaintaining a minimal number of activated parameters and low computational\ncosts. Remarkably, LLaVA-MoD, with only 2B activated parameters, surpasses\nQwen-VL-Chat-7B by an average of 8.8% across benchmarks, using merely 0.3% of\nthe training data and 23% trainable parameters. These results underscore\nLLaVA-MoD's ability to effectively distill comprehensive knowledge from its\nteacher model, paving the way for the development of more efficient MLLMs. 
The\ncode will be available on: https://github.com/shufangxun/LLaVA-MoD.\n","authors":["Fangxun Shu","Yue Liao","Le Zhuo","Chenning Xu","Guanghao Zhang","Haonan Shi","Long Chen","Tao Zhong","Wanggui He","Siming Fu","Haoyuan Li","Bolin Li","Zhelun Yu","Si Liu","Hongsheng Li","Hao Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.15881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15876v1","updated":"2024-08-28T15:47:32Z","published":"2024-08-28T15:47:32Z","title":"Unleashing the Temporal-Spatial Reasoning Capacity of GPT for\n Training-Free Audio and Language Referenced Video Object Segmentation","summary":" In this paper, we propose an Audio-Language-Referenced SAM 2 (AL-Ref-SAM 2)\npipeline to explore the training-free paradigm for audio and\nlanguage-referenced video object segmentation, namely AVS and RVOS tasks. The\nintuitive solution leverages GroundingDINO to identify the target object from a\nsingle frame and SAM 2 to segment the identified object throughout the video,\nwhich is less robust to spatiotemporal variations due to a lack of video\ncontext exploration. Thus, in our AL-Ref-SAM 2 pipeline, we propose a novel\nGPT-assisted Pivot Selection (GPT-PS) module to instruct GPT-4 to perform\ntwo-step temporal-spatial reasoning for sequentially selecting pivot frames and\npivot boxes, thereby providing SAM 2 with a high-quality initial object prompt.\nWithin GPT-PS, two task-specific Chain-of-Thought prompts are designed to\nunleash GPT's temporal-spatial reasoning capacity by guiding GPT to make\nselections based on a comprehensive understanding of video and reference\ninformation. Furthermore, we propose a Language-Binded Reference Unification\n(LBRU) module to convert audio signals into language-formatted references,\nthereby unifying the formats of AVS and RVOS tasks in the same pipeline.\nExtensive experiments on both tasks show that our training-free AL-Ref-SAM 2\npipeline achieves performances comparable to or even better than\nfully-supervised fine-tuning methods. The code is available at:\nhttps://github.com/appletea233/AL-Ref-SAM2.\n","authors":["Shaofei Huang","Rui Ling","Hongyu Li","Tianrui Hui","Zongheng Tang","Xiaoming Wei","Jizhong Han","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15868v1","updated":"2024-08-28T15:37:44Z","published":"2024-08-28T15:37:44Z","title":"GenDDS: Generating Diverse Driving Video Scenarios with Prompt-to-Video\n Generative Model","summary":" Autonomous driving training requires a diverse range of datasets encompassing\nvarious traffic conditions, weather scenarios, and road types. Traditional data\naugmentation methods often struggle to generate datasets that represent rare\noccurrences. To address this challenge, we propose GenDDS, a novel approach for\ngenerating driving scenarios generation by leveraging the capabilities of\nStable Diffusion XL (SDXL), an advanced latent diffusion model. Our methodology\ninvolves the use of descriptive prompts to guide the synthesis process, aimed\nat producing realistic and diverse driving scenarios. With the power of the\nlatest computer vision techniques, such as ControlNet and Hotshot-XL, we have\nbuilt a complete pipeline for video generation together with SDXL. We employ\nthe KITTI dataset, which includes real-world driving videos, to train the\nmodel. 
Through a series of experiments, we demonstrate that our model can\ngenerate high-quality driving videos that closely replicate the complexity and\nvariability of real-world driving scenarios. This research contributes to the\ndevelopment of sophisticated training data for autonomous driving systems and\nopens new avenues for creating virtual environments for simulation and\nvalidation purposes.\n","authors":["Yongjie Fu","Yunlong Li","Xuan Di"],"pdf_url":"https://arxiv.org/pdf/2408.15868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15865v1","updated":"2024-08-28T15:29:27Z","published":"2024-08-28T15:29:27Z","title":"microYOLO: Towards Single-Shot Object Detection on Microcontrollers","summary":" This work-in-progress paper presents results on the feasibility of\nsingle-shot object detection on microcontrollers using YOLO. Single-shot object\ndetectors like YOLO are widely used; however, due to their complexity, mainly on\nlarger GPU-based platforms. We present microYOLO, which can be used on Cortex-M\nbased microcontrollers, such as the OpenMV H7 R2, achieving about 3.5 FPS when\nclassifying 128x128 RGB images while using less than 800 KB Flash and less than\n350 KB RAM. Furthermore, we share experimental results for three different\nobject detection tasks, analyzing the accuracy of microYOLO on them.\n","authors":["Mark Deutel","Christopher Mutschler","Jürgen Teich"],"pdf_url":"https://arxiv.org/pdf/2408.15865v1.pdf","comment":"Published at the ECML PKDD Conference 2023, at the 4th Workshop on\n IoT, Edge, and Mobile for Embedded Machine Learning"},{"id":"http://arxiv.org/abs/2310.10835v3","updated":"2024-08-28T15:29:17Z","published":"2023-10-16T21:17:29Z","title":"Provable Probabilistic Imaging using Score-Based Generative Priors","summary":" Estimating high-quality images while also quantifying their uncertainty are\ntwo desired features in an image reconstruction algorithm for solving ill-posed\ninverse problems. In this paper, we propose plug-and-play Monte Carlo (PMC) as\na principled framework for characterizing the space of possible solutions to a\ngeneral inverse problem. PMC is able to incorporate expressive score-based\ngenerative priors for high-quality image reconstruction while also performing\nuncertainty quantification via posterior sampling. In particular, we develop\ntwo PMC algorithms that can be viewed as the sampling analogues of the\ntraditional plug-and-play priors (PnP) and regularization by denoising (RED)\nalgorithms. To improve the sampling efficiency, we introduce weighted annealing\ninto these PMC algorithms, further developing two additional annealed PMC\nalgorithms (APMC). We establish a theoretical analysis for characterizing the\nconvergence behavior of PMC algorithms. Our analysis provides non-asymptotic\nstationarity guarantees in terms of the Fisher information, fully compatible\nwith the joint presence of weighted annealing, potentially non-log-concave\nlikelihoods, and imperfect score networks. We demonstrate the performance of\nthe PMC algorithms on multiple representative inverse problems with both linear\nand nonlinear forward models. Experimental results show that PMC significantly\nimproves reconstruction quality and enables high-fidelity uncertainty\nquantification.\n","authors":["Yu Sun","Zihui Wu","Yifan Chen","Berthy T. Feng","Katherine L. 
Bouman"],"pdf_url":"https://arxiv.org/pdf/2310.10835v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15857v1","updated":"2024-08-28T15:18:46Z","published":"2024-08-28T15:18:46Z","title":"What is YOLOv8: An In-Depth Exploration of the Internal Features of the\n Next-Generation Object Detector","summary":" This study presents a detailed analysis of the YOLOv8 object detection model,\nfocusing on its architecture, training techniques, and performance improvements\nover previous iterations like YOLOv5. Key innovations, including the CSPNet\nbackbone for enhanced feature extraction, the FPN+PAN neck for superior\nmulti-scale object detection, and the transition to an anchor-free approach,\nare thoroughly examined. The paper reviews YOLOv8's performance across\nbenchmarks like Microsoft COCO and Roboflow 100, highlighting its high accuracy\nand real-time capabilities across diverse hardware platforms. Additionally, the\nstudy explores YOLOv8's developer-friendly enhancements, such as its unified\nPython package and CLI, which streamline model training and deployment.\nOverall, this research positions YOLOv8 as a state-of-the-art solution in the\nevolving object detection field.\n","authors":["Muhammad Yaseen"],"pdf_url":"https://arxiv.org/pdf/2408.15857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19254v2","updated":"2024-08-28T15:13:45Z","published":"2024-03-28T09:21:00Z","title":"Imperceptible Protection against Style Imitation from Diffusion Models","summary":" Recent progress in diffusion models has profoundly enhanced the fidelity of\nimage generation, but it has raised concerns about copyright infringements.\nWhile prior methods have introduced adversarial perturbations to prevent style\nimitation, most are accompanied by the degradation of artworks' visual quality.\nRecognizing the importance of maintaining this, we introduce a visually\nimproved protection method while preserving its protection capability. To this\nend, we devise a perceptual map to highlight areas sensitive to human eyes,\nguided by instance-aware refinement, which refines the protection intensity\naccordingly. We also introduce a difficulty-aware protection by predicting how\ndifficult the artwork is to protect and dynamically adjusting the intensity\nbased on this. Lastly, we integrate a perceptual constraints bank to further\nimprove the imperceptibility. Results show that our method substantially\nelevates the quality of the protected image without compromising on protection\nefficacy.\n","authors":["Namhyuk Ahn","Wonhyuk Ahn","KiYoon Yoo","Daesik Kim","Seung-Hun Nam"],"pdf_url":"https://arxiv.org/pdf/2403.19254v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15844v1","updated":"2024-08-28T15:04:52Z","published":"2024-08-28T15:04:52Z","title":"Shot Segmentation Based on Von Neumann Entropy for Key Frame Extraction","summary":" Video key frame extraction is important in various fields, such as video\nsummary, retrieval, and compression. Therefore, we suggest a video key frame\nextraction algorithm based on shot segmentation using Von Neumann entropy. The\nsegmentation of shots is achieved through the computation of Von Neumann\nentropy of the similarity matrix among frames within the video sequence. The\ninitial frame of each shot is selected as key frames, which combines the\ntemporal sequence information of frames. 
The experimental results show the\nextracted key frames can fully and accurately represent the original video\ncontent while minimizing the number of repeated frames.\n","authors":["Xueqing Zhang","Di Fu","Naihao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15844v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.15833v1","updated":"2024-08-28T14:47:34Z","published":"2024-08-28T14:47:34Z","title":"Network transferability of adversarial patches in real-time object\n detection","summary":" Adversarial patches in computer vision can be used to fool deep neural\nnetworks and manipulate their decision-making process. One of the most\nprominent examples of adversarial patches are evasion attacks for object\ndetectors. By covering parts of objects of interest, these patches suppress the\ndetections and thus make the target object 'invisible' to the object detector.\nSince these patches are usually optimized on a specific network with a specific\ntrain dataset, the transferability across multiple networks and datasets is not\ngiven. This paper addresses these issues and investigates the transferability\nacross numerous object detector architectures. Our extensive evaluation across\nvarious models on two distinct datasets indicates that patches optimized with\nlarger models provide better network transferability than patches that are\noptimized with smaller models.\n","authors":["Jens Bayer","Stefan Becker","David Münch","Michael Arens"],"pdf_url":"https://arxiv.org/pdf/2408.15833v1.pdf","comment":"7 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.15829v1","updated":"2024-08-28T14:44:42Z","published":"2024-08-28T14:44:42Z","title":"SITransformer: Shared Information-Guided Transformer for Extreme\n Multimodal Summarization","summary":" Extreme Multimodal Summarization with Multimodal Output (XMSMO) becomes an\nattractive summarization approach by integrating various types of information\nto create extremely concise yet informative summaries for individual\nmodalities. Existing methods overlook the issue that multimodal data often\ncontains more topic-irrelevant information, which can mislead the model into\nproducing inaccurate summaries especially for extremely short ones. In this\npaper, we propose SITransformer, a \\textbf{S}hared \\textbf{I}nformation-guided\n\\textbf{T}ransformer for extreme multimodal summarization. It has a shared\ninformation guided pipeline which involves a cross-modal shared information\nextractor and a cross-modal interaction module. The extractor formulates\nsemantically shared salient information from different modalities by devising a\nnovel filtering process consisting of a differentiable top-k selector and a\nshared-information guided gating unit. As a result, the common, salient, and\nrelevant contents across modalities are identified. Next, a transformer with\ncross-modal attentions is developed for intra- and inter-modality learning with\nthe shared information guidance to produce the extreme summary. Comprehensive\nexperiments demonstrate that SITransformer significantly enhances the\nsummarization quality for both video and text summaries for XMSMO.
Our code\nwill be publicly available at https://github.com/SichengLeoLiu/MMAsia24-XMSMO.\n","authors":["Sicheng Liu","Lintao Wang","Xiaogan Zhu","Xuequan Lu","Zhiyong Wang","Kun Hu"],"pdf_url":"https://arxiv.org/pdf/2408.15829v1.pdf","comment":"8 pages, 5 figures, submitted to ACM Multimedia Asia 2024"},{"id":"http://arxiv.org/abs/2408.15823v1","updated":"2024-08-28T14:34:45Z","published":"2024-08-28T14:34:45Z","title":"Benchmarking foundation models as feature extractors for\n weakly-supervised computational pathology","summary":" Advancements in artificial intelligence have driven the development of\nnumerous pathology foundation models capable of extracting clinically relevant\ninformation. However, there is currently limited literature independently\nevaluating these foundation models on truly external cohorts and\nclinically-relevant tasks to uncover adjustments for future improvements. In\nthis study, we benchmarked ten histopathology foundation models on 13 patient\ncohorts with 6,791 patients and 9,493 slides from lung, colorectal, gastric,\nand breast cancers. The models were evaluated on weakly-supervised tasks\nrelated to biomarkers, morphological properties, and prognostic outcomes. We\nshow that a vision-language foundation model, CONCH, yielded the highest\nperformance in 42% of tasks when compared to vision-only foundation models. The\nexperiments reveal that foundation models trained on distinct cohorts learn\ncomplementary features to predict the same label, and can be fused to\noutperform the current state of the art. Creating an ensemble of complementary\nfoundation models outperformed CONCH in 66% of tasks. Moreover, our findings\nsuggest that data diversity outweighs data volume for foundation models. Our\nwork highlights actionable adjustments to improve pathology foundation models.\n","authors":["Peter Neidlinger","Omar S. M. El Nahhas","Hannah Sophie Muti","Tim Lenz","Michael Hoffmeister","Hermann Brenner","Marko van Treeck","Rupert Langer","Bastian Dislich","Hans Michael Behrens","Christoph Röcken","Sebastian Foersch","Daniel Truhn","Antonio Marra","Oliver Lester Saldanha","Jakob Nikolas Kather"],"pdf_url":"https://arxiv.org/pdf/2408.15823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05348v4","updated":"2024-08-28T14:26:07Z","published":"2023-11-09T13:18:27Z","title":"u-LLaVA: Unifying Multi-Modal Tasks via Large Language Model","summary":" Recent advancements in multi-modal large language models (MLLMs) have led to\nsubstantial improvements in visual understanding, primarily driven by\nsophisticated modality alignment strategies. However, predominant approaches\nprioritize global or regional comprehension, with less focus on fine-grained,\npixel-level tasks. To address this gap, we introduce u-LLaVA, an innovative\nunifying multi-task framework that integrates pixel, regional, and global\nfeatures to refine the perceptual faculties of MLLMs. We commence by leveraging\nan efficient modality alignment approach, harnessing both image and video\ndatasets to bolster the model's foundational understanding across diverse\nvisual contexts. Subsequently, a joint instruction tuning method with\ntask-specific projectors and decoders for end-to-end downstream training is\npresented. Furthermore, this work contributes a novel mask-based multi-task\ndataset comprising 277K samples, crafted to challenge and assess the\nfine-grained perception capabilities of MLLMs. 
The overall framework is simple,\neffective, and achieves state-of-the-art performance across multiple\nbenchmarks. We also make our model, data, and code publicly accessible at\nhttps://github.com/OPPOMKLab/u-LLaVA.\n","authors":["Jinjin Xu","Liwu Xu","Yuzhe Yang","Xiang Li","Fanyi Wang","Yanchun Xie","Yi-Jie Huang","Yaqian Li"],"pdf_url":"https://arxiv.org/pdf/2311.05348v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15816v1","updated":"2024-08-28T14:25:35Z","published":"2024-08-28T14:25:35Z","title":"Mining Field Data for Tree Species Recognition at Scale","summary":" Individual tree species labels are particularly hard to acquire due to the\nexpert knowledge needed and the limitations of photointerpretation. Here, we\npresent a methodology to automatically mine species labels from public forest\ninventory data, using available pretrained tree detection models. We identify\ntree instances in aerial imagery and match them with field data with close to\nzero human involvement. We conduct a series of experiments on the resulting\ndataset, and show a beneficial effect when adding noisy or even unlabeled data\npoints, highlighting a strong potential for large-scale individual species\nmapping.\n","authors":["Dimitri Gominski","Daniel Ortiz-Gonzalo","Martin Brandt","Maurice Mugabowindekwe","Rasmus Fensholt"],"pdf_url":"https://arxiv.org/pdf/2408.15816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15813v1","updated":"2024-08-28T14:14:33Z","published":"2024-08-28T14:14:33Z","title":"DQFormer: Towards Unified LiDAR Panoptic Segmentation with Decoupled\n Queries","summary":" LiDAR panoptic segmentation, which jointly performs instance and semantic\nsegmentation for things and stuff classes, plays a fundamental role in LiDAR\nperception tasks. While most existing methods explicitly separate these two\nsegmentation tasks and utilize different branches (i.e., semantic and instance\nbranches), some recent methods have embraced the query-based paradigm to unify\nLiDAR panoptic segmentation. However, the distinct spatial distribution and\ninherent characteristics of objects(things) and their surroundings(stuff) in 3D\nscenes lead to challenges, including the mutual competition of things/stuff and\nthe ambiguity of classification/segmentation. In this paper, we propose\ndecoupling things/stuff queries according to their intrinsic properties for\nindividual decoding and disentangling classification/segmentation to mitigate\nambiguity. To this end, we propose a novel framework dubbed DQFormer to\nimplement semantic and instance segmentation in a unified workflow.\nSpecifically, we design a decoupled query generator to propose informative\nqueries with semantics by localizing things/stuff positions and fusing\nmulti-level BEV embeddings. Moreover, a query-oriented mask decoder is\nintroduced to decode corresponding segmentation masks by performing masked\ncross-attention between queries and mask embeddings. 
Finally, the decoded masks\nare combined with the semantics of the queries to produce panoptic results.\nExtensive experiments on nuScenes and SemanticKITTI datasets demonstrate the\nsuperiority of our DQFormer framework.\n","authors":["Yu Yang","Jianbiao Mei","Liang Liu","Siliang Du","Yilin Xiao","Jongwon Ra","Yong Liu","Xiao Xu","Huifeng Wu"],"pdf_url":"https://arxiv.org/pdf/2408.15813v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.15810v1","updated":"2024-08-28T14:10:57Z","published":"2024-08-28T14:10:57Z","title":"Multi-view Pose Fusion for Occlusion-Aware 3D Human Pose Estimation","summary":" Robust 3D human pose estimation is crucial to ensure safe and effective\nhuman-robot collaboration. Accurate human perception, however, is particularly\nchallenging in these scenarios due to strong occlusions and limited camera\nviewpoints. Current 3D human pose estimation approaches are rather vulnerable\nin such conditions. In this work we present a novel approach for robust 3D\nhuman pose estimation in the context of human-robot collaboration. Instead of\nrelying on noisy 2D feature triangulation, we perform multi-view fusion on 3D\nskeletons provided by absolute monocular methods. Accurate 3D pose estimation\nis then obtained via reprojection error optimization, introducing limb length\nsymmetry constraints. We evaluate our approach on the public dataset Human3.6M\nand on a novel version Human3.6M-Occluded, derived by adding synthetic occlusions\non the camera views with the purpose of testing pose estimation algorithms\nunder severe occlusions. We further validate our method on real human-robot\ncollaboration workcells, in which we strongly surpass current 3D human pose\nestimation methods. Our approach outperforms state-of-the-art multi-view human\npose estimation techniques and demonstrates superior capabilities in handling\nchallenging scenarios with strong occlusions, representing a reliable and\neffective solution for real human-robot collaboration setups.\n","authors":["Laura Bragagnolo","Matteo Terreran","Davide Allegro","Stefano Ghidoni"],"pdf_url":"https://arxiv.org/pdf/2408.15810v1.pdf","comment":"ECCV workshops 2024"},{"id":"http://arxiv.org/abs/2408.15809v1","updated":"2024-08-28T14:08:24Z","published":"2024-08-28T14:08:24Z","title":"Object Detection for Vehicle Dashcams using Transformers","summary":" The use of intelligent automation is growing significantly in the automotive\nindustry, as it assists drivers and fleet management companies, thus increasing\ntheir productivity. Dash cams are now being used for this purpose, which enables\nthe instant identification and understanding of multiple objects and\noccurrences in the surroundings. In this paper, we propose a novel approach for\nobject detection in dashcams using transformers. Our system is based on the\nstate-of-the-art DEtection TRansformer (DETR), which has demonstrated strong\nperformance in a variety of conditions, including different weather and\nillumination scenarios. The use of transformers allows for the consideration of\ncontextual information in decision-making, improving the accuracy of object\ndetection. To validate our approach, we have trained our DETR model on a\ndataset that represents real-world conditions. Our results show that the use of\nintelligent automation through transformers can significantly enhance the\ncapabilities of dashcam systems.
The model achieves an mAP of 0.95 on\ndetection.\n","authors":["Osama Mustafa","Khizer Ali","Anam Bibi","Imran Siddiqi","Momina Moetesum"],"pdf_url":"https://arxiv.org/pdf/2408.15809v1.pdf","comment":"7 Pages, and 6 Figures"},{"id":"http://arxiv.org/abs/2408.15802v1","updated":"2024-08-28T13:53:27Z","published":"2024-08-28T13:53:27Z","title":"Visual Prompt Engineering for Medical Vision Language Models in\n Radiology","summary":" Medical image classification in radiology faces significant challenges,\nparticularly in generalizing to unseen pathologies. In contrast, CLIP offers a\npromising solution by leveraging multimodal learning to improve zero-shot\nclassification performance. However, in the medical domain, lesions can be\nsmall and might not be well represented in the embedding space. Therefore, in\nthis paper, we explore the potential of visual prompt engineering to enhance\nthe capabilities of Vision Language Models (VLMs) in radiology. Leveraging\nBiomedCLIP, trained on extensive biomedical image-text pairs, we investigate\nthe impact of embedding visual markers directly within radiological images to\nguide the model's attention to critical regions. Our evaluation on the JSRT\ndataset, focusing on lung nodule malignancy classification, demonstrates that\nincorporating visual prompts $\\unicode{x2013}$ such as arrows, circles, and\ncontours $\\unicode{x2013}$ significantly improves classification metrics\nincluding AUROC, AUPRC, F1 score, and accuracy. Moreover, the study provides\nattention maps, showcasing enhanced model interpretability and focus on\nclinically relevant areas. These findings underscore the efficacy of visual\nprompt engineering as a straightforward yet powerful approach to advance VLM\nperformance in medical image analysis.\n","authors":["Stefan Denner","Markus Bujotzek","Dimitrios Bounias","David Zimmerer","Raphael Stock","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2408.15802v1.pdf","comment":"Accepted at ECCV 2024 Workshop on Emergent Visual Abilities and\n Limits of Foundation Models"},{"id":"http://arxiv.org/abs/2405.18064v2","updated":"2024-08-28T13:41:34Z","published":"2024-05-28T11:24:20Z","title":"Automated Real-World Sustainability Data Generation from Images of\n Buildings","summary":" When data on building features is unavailable, the task of determining how to\nimprove that building in terms of carbon emissions becomes infeasible. We show\nthat from only a set of images, a Large Language Model with appropriate prompt\nengineering and domain knowledge can successfully estimate a range of building\nfeatures relevant for sustainability calculations. We compare our novel\nimage-to-data method with a ground truth comprising real building data for 47\napartments and achieve accuracy better than a human performing the same task.\nWe also demonstrate that the method can generate tailored recommendations to\nthe owner on how best to improve their properties and discuss methods to scale\nthe approach.\n","authors":["Peter J Bentley","Soo Ling Lim","Rajat Mathur","Sid Narang"],"pdf_url":"https://arxiv.org/pdf/2405.18064v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2307.14382v2","updated":"2024-08-28T13:30:36Z","published":"2023-07-25T20:08:41Z","title":"When Multi-Task Learning Meets Partial Supervision: A Computer Vision\n Review","summary":" Multi-Task Learning (MTL) aims to learn multiple tasks simultaneously while\nexploiting their mutual relationships. 
By using shared resources to\nsimultaneously calculate multiple outputs, this learning paradigm has the\npotential to have lower memory requirements and inference times compared to the\ntraditional approach of using separate methods for each task. Previous work in\nMTL has mainly focused on fully-supervised methods, as task relationships can\nnot only be leveraged to lower the level of data-dependency of those methods\nbut they can also improve performance. However, MTL introduces a set of\nchallenges due to a complex optimisation scheme and a higher labeling\nrequirement. This review focuses on how MTL could be utilised under different\npartial supervision settings to address these challenges. First, this review\nanalyses how MTL traditionally uses different parameter sharing techniques to\ntransfer knowledge in between tasks. Second, it presents the different\nchallenges arising from such a multi-objective optimisation scheme. Third, it\nintroduces how task groupings can be achieved by analysing task relationships.\nFourth, it focuses on how partially supervised methods applied to MTL can\ntackle the aforementioned challenges. Lastly, this review presents the\navailable datasets, tools and benchmarking results of such methods.\n","authors":["Maxime Fontana","Michael Spratling","Miaojing Shi"],"pdf_url":"https://arxiv.org/pdf/2307.14382v2.pdf","comment":"Accepted by Proceedings of the IEEE"},{"id":"http://arxiv.org/abs/2408.15777v1","updated":"2024-08-28T13:15:25Z","published":"2024-08-28T13:15:25Z","title":"A Survey on Facial Expression Recognition of Static and Dynamic Emotions","summary":" Facial expression recognition (FER) aims to analyze emotional states from\nstatic images and dynamic sequences, which is pivotal in enhancing\nanthropomorphic communication among humans, robots, and digital avatars by\nleveraging AI technologies. As the FER field evolves from controlled laboratory\nenvironments to more complex in-the-wild scenarios, advanced methods have been\nrapidly developed and new challenges and approaches are encountered, which are\nnot well addressed in existing reviews of FER. This paper offers a\ncomprehensive survey of both image-based static FER (SFER) and video-based\ndynamic FER (DFER) methods, analyzing from model-oriented development to\nchallenge-focused categorization. We begin with a critical comparison of recent\nreviews, an introduction to common datasets and evaluation criteria, and an\nin-depth workflow on FER to establish a robust research foundation. We then\nsystematically review representative approaches addressing eight main\nchallenges in SFER (such as expression disturbance, uncertainties, compound\nemotions, and cross-domain inconsistency) as well as seven main challenges in\nDFER (such as key frame sampling, expression intensity variations, and\ncross-modal alignment). Additionally, we analyze recent advancements, benchmark\nperformances, major applications, and ethical considerations. Finally, we\npropose five promising future directions and development trends to guide\nongoing research.
The project page for this paper can be found at\nhttps://github.com/wangyanckxx/SurveyFER.\n","authors":["Yan Wang","Shaoqi Yan","Yang Liu","Wei Song","Jing Liu","Yang Chang","Xinji Mai","Xiping Hu","Wenqiang Zhang","Zhongxue Gan"],"pdf_url":"https://arxiv.org/pdf/2408.15777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15769v1","updated":"2024-08-28T13:05:55Z","published":"2024-08-28T13:05:55Z","title":"A Survey on Evaluation of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) mimic human perception and reasoning\nsystem by integrating powerful Large Language Models (LLMs) with various\nmodality encoders (e.g., vision, audio), positioning LLMs as the \"brain\" and\nvarious modality encoders as sensory organs. This framework endows MLLMs with\nhuman-like capabilities, and suggests a potential pathway towards achieving\nartificial general intelligence (AGI). With the emergence of all-round MLLMs\nlike GPT-4V and Gemini, a multitude of evaluation methods have been developed\nto assess their capabilities across different dimensions. This paper presents a\nsystematic and comprehensive review of MLLM evaluation methods, covering the\nfollowing key aspects: (1) the background of MLLMs and their evaluation; (2)\n\"what to evaluate\" that reviews and categorizes existing MLLM evaluation tasks\nbased on the capabilities assessed, including general multimodal recognition,\nperception, reasoning and trustworthiness, and domain-specific applications\nsuch as socioeconomic, natural sciences and engineering, medical usage, AI\nagent, remote sensing, video and audio processing, 3D point cloud analysis, and\nothers; (3) \"where to evaluate\" that summarizes MLLM evaluation benchmarks into\ngeneral and specific benchmarks; (4) \"how to evaluate\" that reviews and\nillustrates MLLM evaluation steps and metrics; Our overarching goal is to\nprovide valuable insights for researchers in the field of MLLM evaluation,\nthereby facilitating the development of more capable and reliable MLLMs. We\nemphasize that evaluation should be regarded as a critical discipline,\nessential for advancing the field of MLLMs.\n","authors":["Jiaxing Huang","Jingyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19730v5","updated":"2024-08-28T13:05:41Z","published":"2024-05-30T06:21:34Z","title":"Research on the Spatial Data Intelligent Foundation Model","summary":" This report focuses on spatial data intelligent large models, delving into\nthe principles, methods, and cutting-edge applications of these models. It\nprovides an in-depth discussion on the definition, development history, current\nstatus, and trends of spatial data intelligent large models, as well as the\nchallenges they face. The report systematically elucidates the key technologies\nof spatial data intelligent large models and their applications in urban\nenvironments, aerospace remote sensing, geography, transportation, and other\nscenarios. 
Additionally, it summarizes the latest application cases of spatial\ndata intelligent large models in themes such as urban development, multimodal\nsystems, remote sensing, smart transportation, and resource environments.\nFinally, the report concludes with an overview and outlook on the development\nprospects of spatial data intelligent large models.\n","authors":["Shaohua Wang","Xing Xie","Yong Li","Danhuai Guo","Zhi Cai","Yu Liu","Yang Yue","Xiao Pan","Feng Lu","Huayi Wu","Zhipeng Gui","Zhiming Ding","Bolong Zheng","Fuzheng Zhang","Jingyuan Wang","Zhengchao Chen","Hao Lu","Jiayi Li","Peng Yue","Wenhao Yu","Yao Yao","Leilei Sun","Yong Zhang","Longbiao Chen","Xiaoping Du","Xiang Li","Xueying Zhang","Kun Qin","Zhaoya Gong","Weihua Dong","Xiaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2405.19730v5.pdf","comment":"V1 and V2 are in Chinese language, other versions are in English"},{"id":"http://arxiv.org/abs/2408.15761v1","updated":"2024-08-28T12:56:00Z","published":"2024-08-28T12:56:00Z","title":"Addressing the challenges of loop detection in agricultural environments","summary":" While visual SLAM systems are well studied and achieve impressive results in\nindoor and urban settings, natural, outdoor and open-field environments are\nmuch less explored and still present relevant research challenges. Visual\nnavigation and local mapping have shown a relatively good performance in\nopen-field environments. However, globally consistent mapping and long-term\nlocalization still depend on the robustness of loop detection and closure, for\nwhich the literature is scarce. In this work we propose a novel method to pave\nthe way towards robust loop detection in open fields, particularly in\nagricultural settings, based on local feature search and stereo geometric\nrefinement, with a final stage of relative pose estimation. Our method\nconsistently achieves good loop detections, with a median error of 15cm. We aim\nto characterize open fields as a novel environment for loop detection,\nunderstanding the limitations and problems that arise when dealing with them.\n","authors":["Nicolás Soncini","Javier Civera","Taihú Pire"],"pdf_url":"https://arxiv.org/pdf/2408.15761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18006v2","updated":"2024-08-28T12:43:26Z","published":"2024-04-27T20:54:15Z","title":"FRAME: A Modular Framework for Autonomous Map Merging: Advancements in\n the Field","summary":" In this article, a novel approach for merging 3D point cloud maps in the\ncontext of egocentric multi-robot exploration is presented. Unlike traditional\nmethods, the proposed approach leverages state-of-the-art place recognition and\nlearned descriptors to efficiently detect overlap between maps, eliminating the\nneed for the time-consuming global feature extraction and feature matching\nprocess. The estimated overlapping regions are used to calculate a homogeneous\nrigid transform, which serves as an initial condition for the GICP point cloud\nregistration algorithm to refine the alignment between the maps. The advantages\nof this approach include faster processing time, improved accuracy, and\nincreased robustness in challenging environments. 
Furthermore, the\neffectiveness of the proposed framework is successfully demonstrated through\nmultiple field missions of robot exploration in a variety of different\nunderground environments.\n","authors":["Nikolaos Stathoulopoulos","Björn Lindqvist","Anton Koval","Ali-akbar Agha-mohammadi","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2404.18006v2.pdf","comment":"28 pages, 24 figures. Accepted to the IEEE Transactions on Field\n Robotics"},{"id":"http://arxiv.org/abs/2312.02255v3","updated":"2024-08-28T12:43:10Z","published":"2023-12-04T18:56:08Z","title":"Re-Nerfing: Improving Novel View Synthesis through Novel View Synthesis","summary":" Recent neural rendering and reconstruction techniques, such as NeRFs or\nGaussian Splatting, have shown remarkable novel view synthesis capabilities but\nrequire hundreds of images of the scene from diverse viewpoints to render\nhigh-quality novel views. With fewer images available, these methods start to\nfail since they can no longer correctly triangulate the underlying 3D geometry\nand converge to a non-optimal solution. These failures can manifest as floaters\nor blurry renderings in sparsely observed areas of the scene. In this paper, we\npropose Re-Nerfing, a simple and general add-on approach that leverages novel\nview synthesis itself to tackle this problem. Using an already trained NVS\nmethod, we render novel views between existing ones and augment the training\ndata to optimize a second model. This introduces additional multi-view\nconstraints and allows the second model to converge to a better solution. With\nRe-Nerfing we achieve significant improvements upon multiple pipelines based on\nNeRF and Gaussian-Splatting in sparse view settings of the mip-NeRF 360 and\nLLFF datasets. Notably, Re-Nerfing does not require prior knowledge or extra\nsupervision signals, making it a flexible and practical add-on.\n","authors":["Felix Tristram","Stefano Gasperini","Nassir Navab","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.02255v3.pdf","comment":"Code will be released upon acceptance"},{"id":"http://arxiv.org/abs/2408.15750v1","updated":"2024-08-28T12:33:26Z","published":"2024-08-28T12:33:26Z","title":"Str-L Pose: Integrating Point and Structured Line for Relative Pose\n Estimation in Dual-Graph","summary":" Relative pose estimation is crucial for various computer vision applications,\nincluding Robotic and Autonomous Driving. Current methods primarily depend on\nselecting and matching feature points prone to incorrect matches, leading to\npoor performance. Consequently, relying solely on point-matching relationships\nfor pose estimation is a huge challenge. To overcome these limitations, we\npropose a Geometric Correspondence Graph neural network that integrates point\nfeatures with extra structured line segments. This integration of matched\npoints and line segments further exploits the geometry constraints and enhances\nmodel performance across different environments. We employ the Dual-Graph\nmodule and Feature Weighted Fusion Module to aggregate geometric and visual\nfeatures effectively, facilitating complex scene understanding. We demonstrate\nour approach through extensive experiments on the DeMoN and KITTI Odometry\ndatasets. 
The results show that our method is competitive with state-of-the-art\ntechniques.\n","authors":["Zherong Zhang","Chunyu Lin","Shujuan Huang","Shangrong Yang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.15750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15741v1","updated":"2024-08-28T12:08:25Z","published":"2024-08-28T12:08:25Z","title":"Segmentation-guided Layer-wise Image Vectorization with Gradient Fills","summary":" The widespread use of vector graphics creates a significant demand for\nvectorization methods. While recent learning-based techniques have shown their\ncapability to create vector images of clear topology, filling these primitives\nwith gradients remains a challenge. In this paper, we propose a\nsegmentation-guided vectorization framework to convert raster images into\nconcise vector graphics with radial gradient fills. With the guidance of an\nembedded gradient-aware segmentation subroutine, our approach progressively\nappends gradient-filled B\\'ezier paths to the output, where primitive\nparameters are initiated with our newly designed initialization technique and\nare optimized to minimize our novel loss function. We build our method on a\ndifferentiable renderer with traditional segmentation algorithms to develop it\nas a model-free tool for raster-to-vector conversion. It is tested on various\ninputs to demonstrate its feasibility, independent of datasets, to synthesize\nvector graphics with improved visual quality and layer-wise topology compared\nto prior work.\n","authors":["Hengyu Zhou","Hui Zhang","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15740v1","updated":"2024-08-28T12:06:11Z","published":"2024-08-28T12:06:11Z","title":"MambaPlace:Text-to-Point-Cloud Cross-Modal Place Recognition with\n Attention Mamba Mechanisms","summary":" Vision Language Place Recognition (VLVPR) enhances robot localization\nperformance by incorporating natural language descriptions from images. By\nutilizing language information, VLVPR directs robot place matching, overcoming\nthe constraint of solely depending on vision. The essence of multimodal fusion\nlies in mining the complementary information between different modalities.\nHowever, general fusion methods rely on traditional neural architectures and\nare not well equipped to capture the dynamics of cross modal interactions,\nespecially in the presence of complex intra modal and inter modal correlations.\nTo this end, this paper proposes a novel coarse to fine and end to end\nconnected cross modal place recognition framework, called MambaPlace. In the\ncoarse localization stage, the text description and 3D point cloud are encoded\nby the pretrained T5 and instance encoder, respectively. They are then\nprocessed using Text Attention Mamba (TAM) and Point Clouds Mamba (PCM) for\ndata enhancement and alignment. In the subsequent fine localization stage, the\nfeatures of the text description and 3D point cloud are cross modally fused and\nfurther enhanced through cascaded Cross Attention Mamba (CCAM). Finally, we\npredict the positional offset from the fused text point cloud features,\nachieving the most accurate localization. 
Extensive experiments show that\nMambaPlace achieves improved localization accuracy on the KITTI360Pose dataset\ncompared to the state of the art methods.\n","authors":["Tianyi Shang","Zhenyu Li","Wenhao Pei","Pengjie Xu","ZhaoJun Deng","Fanchen Kong"],"pdf_url":"https://arxiv.org/pdf/2408.15740v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.14035v2","updated":"2024-08-28T12:03:50Z","published":"2024-08-26T06:01:54Z","title":"FAST-LIVO2: Fast, Direct LiDAR-Inertial-Visual Odometry","summary":" This paper proposes FAST-LIVO2: a fast, direct LiDAR-inertial-visual odometry\nframework to achieve accurate and robust state estimation in SLAM tasks and\nprovide great potential in real-time, onboard robotic applications. FAST-LIVO2\nfuses the IMU, LiDAR and image measurements efficiently through an ESIKF. To\naddress the dimension mismatch between the heterogeneous LiDAR and image\nmeasurements, we use a sequential update strategy in the Kalman filter. To\nenhance the efficiency, we use direct methods for both the visual and LiDAR\nfusion, where the LiDAR module registers raw points without extracting edge or\nplane features and the visual module minimizes direct photometric errors\nwithout extracting ORB or FAST corner features. The fusion of both visual and\nLiDAR measurements is based on a single unified voxel map where the LiDAR\nmodule constructs the geometric structure for registering new LiDAR scans and\nthe visual module attaches image patches to the LiDAR points. To enhance the\naccuracy of image alignment, we use plane priors from the LiDAR points in the\nvoxel map (and even refine the plane prior) and update the reference patch\ndynamically after new images are aligned. Furthermore, to enhance the\nrobustness of image alignment, FAST-LIVO2 employs an on-demanding raycast\noperation and estimates the image exposure time in real time. Lastly, we detail\nthree applications of FAST-LIVO2: UAV onboard navigation demonstrating the\nsystem's computation efficiency for real-time onboard navigation, airborne\nmapping showcasing the system's mapping accuracy, and 3D model rendering\n(mesh-based and NeRF-based) underscoring the suitability of our reconstructed\ndense map for subsequent rendering tasks. We open source our code, dataset and\napplication on GitHub to benefit the robotics community.\n","authors":["Chunran Zheng","Wei Xu","Zuhao Zou","Tong Hua","Chongjian Yuan","Dongjiao He","Bingyang Zhou","Zheng Liu","Jiarong Lin","Fangcheng Zhu","Yunfan Ren","Rong Wang","Fanle Meng","Fu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.14035v2.pdf","comment":"30 pages, 31 figures, due to the limitation that 'The abstract field\n cannot exceed 1,920 characters', the abstract presented here is shorter than\n the one in the PDF file"},{"id":"http://arxiv.org/abs/2408.15721v1","updated":"2024-08-28T11:36:43Z","published":"2024-08-28T11:36:43Z","title":"Defending Text-to-image Diffusion Models: Surprising Efficacy of Textual\n Perturbations Against Backdoor Attacks","summary":" Text-to-image diffusion models have been widely adopted in real-world\napplications due to their ability to generate realistic images from textual\ndescriptions. However, recent studies have shown that these methods are\nvulnerable to backdoor attacks. Despite the significant threat posed by\nbackdoor attacks on text-to-image diffusion models, countermeasures remain\nunder-explored. 
In this paper, we address this research gap by demonstrating\nthat state-of-the-art backdoor attacks against text-to-image diffusion models\ncan be effectively mitigated by a surprisingly simple defense strategy -\ntextual perturbation. Experiments show that textual perturbations are effective\nin defending against state-of-the-art backdoor attacks with minimal sacrifice\nto generation quality. We analyze the efficacy of textual perturbation from two\nangles: text embedding space and cross-attention maps. They further explain how\nbackdoor attacks have compromised text-to-image diffusion models, providing\ninsights for studying future attack and defense strategies. Our code is\navailable at https://github.com/oscarchew/t2i-backdoor-defense.\n","authors":["Oscar Chew","Po-Yi Lu","Jayden Lin","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2408.15721v1.pdf","comment":"ECCV 2024 Workshop The Dark Side of Generative AIs and Beyond"},{"id":"http://arxiv.org/abs/2408.15714v1","updated":"2024-08-28T11:21:23Z","published":"2024-08-28T11:21:23Z","title":"Pixels to Prose: Understanding the art of Image Captioning","summary":" In the era of evolving artificial intelligence, machines are increasingly\nemulating human-like capabilities, including visual perception and linguistic\nexpression. Image captioning stands at the intersection of these domains,\nenabling machines to interpret visual content and generate descriptive text.\nThis paper provides a thorough review of image captioning techniques, catering\nto individuals entering the field of machine learning who seek a comprehensive\nunderstanding of available options, from foundational methods to\nstate-of-the-art approaches. Beginning with an exploration of primitive\narchitectures, the review traces the evolution of image captioning models to\nthe latest cutting-edge solutions. By dissecting the components of these\narchitectures, readers gain insights into the underlying mechanisms and can\nselect suitable approaches tailored to specific problem requirements without\nduplicating efforts. The paper also delves into the application of image\ncaptioning in the medical domain, illuminating its significance in various\nreal-world scenarios.\n Furthermore, the review offers guidance on evaluating the performance of\nimage captioning systems, highlighting key metrics for assessment. By\nsynthesizing theoretical concepts with practical application, this paper equips\nreaders with the knowledge needed to navigate the complex landscape of image\ncaptioning and harness its potential for diverse applications in machine\nlearning and beyond.\n","authors":["Hrishikesh Singh","Aarti Sharma","Millie Pant"],"pdf_url":"https://arxiv.org/pdf/2408.15714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15708v1","updated":"2024-08-28T11:13:27Z","published":"2024-08-28T11:13:27Z","title":"Towards Realistic Example-based Modeling via 3D Gaussian Stitching","summary":" Using parts of existing models to rebuild new models, commonly termed as\nexample-based modeling, is a classical methodology in the realm of computer\ngraphics. Previous works mostly focus on shape composition, making them very\nhard to use for realistic composition of 3D objects captured from real-world\nscenes. This leads to combining multiple NeRFs into a single 3D scene to\nachieve seamless appearance blending. 
However, the current SeamlessNeRF method\nstruggles to achieve interactive editing and harmonious stitching for\nreal-world scenes due to its gradient-based strategy and grid-based\nrepresentation. To this end, we present an example-based modeling method that\ncombines multiple Gaussian fields in a point-based representation using\nsample-guided synthesis. Specifically, as for composition, we create a GUI to\nsegment and transform multiple fields in real time, easily obtaining a\nsemantically meaningful composition of models represented by 3D Gaussian\nSplatting (3DGS). For texture blending, due to the discrete and irregular\nnature of 3DGS, straightforwardly applying gradient propagation as in SeamlessNeRF\nis not supported. Thus, a novel sampling-based cloning method is proposed to\nharmonize the blending while preserving the original rich texture and content.\nOur workflow consists of three steps: 1) real-time segmentation and\ntransformation of a Gaussian model using a well-tailored GUI, 2) KNN analysis\nto identify boundary points in the intersecting area between the source and\ntarget models, and 3) two-phase optimization of the target model using\nsampling-based cloning and gradient constraints. Extensive experimental results\nvalidate that our approach significantly outperforms previous works in terms of\nrealistic synthesis, demonstrating its practicality. More demos are available\nat https://ingra14m.github.io/gs_stitching_website.\n","authors":["Xinyu Gao","Ziyi Yang","Bingchen Gong","Xiaoguang Han","Sipeng Yang","Xiaogang Jin"],"pdf_url":"https://arxiv.org/pdf/2408.15708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10534v2","updated":"2024-08-28T11:12:35Z","published":"2024-07-15T08:42:10Z","title":"Automated Label Unification for Multi-Dataset Semantic Segmentation with\n GNNs","summary":" Deep supervised models possess significant capability to assimilate extensive\ntraining data, thereby presenting an opportunity to enhance model performance\nthrough training on multiple datasets. However, conflicts arising from\ndifferent label spaces among datasets may adversely affect model performance.\nIn this paper, we propose a novel approach to automatically construct a unified\nlabel space across multiple datasets using graph neural networks. This enables\nsemantic segmentation models to be trained simultaneously on multiple datasets,\nresulting in performance improvements. Unlike existing methods, our approach\nfacilitates seamless training without the need for additional manual\nreannotation or taxonomy reconciliation. This significantly enhances the\nefficiency and effectiveness of multi-dataset segmentation model training. The\nresults demonstrate that our method significantly outperforms other\nmulti-dataset training methods when trained on seven datasets simultaneously,\nand achieves state-of-the-art performance on the WildDash 2 benchmark.\n","authors":["Rong Ma","Jie Chen","Xiangyang Xue","Jian Pu"],"pdf_url":"https://arxiv.org/pdf/2407.10534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11982v2","updated":"2024-08-28T11:01:16Z","published":"2024-08-21T20:32:45Z","title":"AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and\n Results","summary":" Video quality assessment (VQA) is a crucial task in the development of video\ncompression standards, as it directly impacts the viewer experience.
This paper\npresents the results of the Compressed Video Quality Assessment challenge, held\nin conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV\n2024. The challenge aimed to evaluate the performance of VQA methods on a\ndiverse dataset of 459 videos, encoded with 14 codecs of various compression\nstandards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a\ncomprehensive collection of compression artifacts. To measure the methods\nperformance, we employed traditional correlation coefficients between their\npredictions and subjective scores, which were collected via large-scale\ncrowdsourced pairwise human comparisons. For training purposes, participants\nwere provided with the Compressed Video Quality Assessment Dataset (CVQAD), a\npreviously developed dataset of 1022 videos. Up to 30 participating teams\nregistered for the challenge, while we report the results of 6 teams, which\nsubmitted valid final solutions and code for reproducing the results. Moreover,\nwe calculated and present the performance of state-of-the-art VQA methods on\nthe developed dataset, providing a comprehensive benchmark for future research.\nThe dataset, results, and online leaderboard are publicly available at\nhttps://challenges.videoprocessing.ai/challenges/compressedvideo-quality-assessment.html.\n","authors":["Maksim Smirnov","Aleksandr Gushchin","Anastasia Antsiferova","Dmitry Vatolin","Radu Timofte","Ziheng Jia","Zicheng Zhang","Wei Sun","Jiaying Qian","Yuqin Cao","Yinan Sun","Yuxin Zhu","Xiongkuo Min","Guangtao Zhai","Kanjar De","Qing Luo","Ao-Xiang Zhang","Peng Zhang","Haibo Lei","Linyan Jiang","Yaqing Li","Wenhui Meng","Xiaoheng Tan","Haiqiang Wang","Xiaozhong Xu","Shan Liu","Zhenzhong Chen","Zhengxue Cheng","Jiahao Xiao","Jun Xu","Chenlong He","Qi Zheng","Ruoxi Zhu","Min Li","Yibo Fan","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2408.11982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17550v2","updated":"2024-08-28T10:52:32Z","published":"2024-03-26T09:58:06Z","title":"DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping","summary":" Recently, significant progress has been achieved in sensing real large-scale\noutdoor 3D environments, particularly by using modern acquisition equipment\nsuch as LiDAR sensors. Unfortunately, they are fundamentally limited in their\nability to produce dense, complete 3D scenes. To address this issue, recent\nlearning-based methods integrate neural implicit representations and\noptimizable feature grids to approximate surfaces of 3D scenes. However,\nnaively fitting samples along raw LiDAR rays leads to noisy 3D mapping results\ndue to the nature of sparse, conflicting LiDAR measurements. Instead, in this\nwork we depart from fitting LiDAR data exactly, instead letting the network\noptimize a non-metric monotonic implicit field defined in 3D space. To fit our\nfield, we design a learning system integrating a monotonicity loss that enables\noptimizing neural monotonic fields and leverages recent progress in large-scale\n3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as\ncaptured by multiple quantitative and perceptual measures and visual results\nobtained for Mai City, Newer College, and KITTI benchmarks. 
The code of our\napproach will be made publicly available.\n","authors":["Kutay Yılmaz","Matthias Nießner","Anastasiia Kornilova","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2403.17550v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.15695v1","updated":"2024-08-28T10:43:42Z","published":"2024-08-28T10:43:42Z","title":"G-Style: Stylized Gaussian Splatting","summary":" We introduce G-Style, a novel algorithm designed to transfer the style of an\nimage onto a 3D scene represented using Gaussian Splatting. Gaussian Splatting\nis a powerful 3D representation for novel view synthesis, as -- compared to\nother approaches based on Neural Radiance Fields -- it provides fast scene\nrenderings and user control over the scene. Recent pre-prints have demonstrated\nthat the style of Gaussian Splatting scenes can be modified using an image\nexemplar. However, since the scene geometry remains fixed during the\nstylization process, current solutions fall short of producing satisfactory\nresults. Our algorithm aims to address these limitations by following a\nthree-step process: In a pre-processing step, we remove undesirable Gaussians\nwith large projection areas or highly elongated shapes. Subsequently, we\ncombine several losses carefully designed to preserve different scales of the\nstyle in the image, while maintaining as much as possible the integrity of the\noriginal scene content. During the stylization process and following the\noriginal design of Gaussian Splatting, we split Gaussians where additional\ndetail is necessary within our scene by tracking the gradient of the stylized\ncolor. Our experiments demonstrate that G-Style generates high-quality\nstylizations within just a few minutes, outperforming existing methods both\nqualitatively and quantitatively.\n","authors":["Áron Samuel Kovács","Pedro Hermosilla","Renata G. Raidou"],"pdf_url":"https://arxiv.org/pdf/2408.15695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15693v1","updated":"2024-08-28T10:33:00Z","published":"2024-08-28T10:33:00Z","title":"Synthetic Forehead-creases Biometric Generation for Reliable User\n Verification","summary":" Recent studies have emphasized the potential of forehead-crease patterns as\nan alternative for face, iris, and periocular recognition, presenting\ncontactless and convenient solutions, particularly in situations where faces\nare covered by surgical masks. However, collecting forehead data presents\nchallenges, including cost and time constraints, as developing and optimizing\nforehead verification methods requires a substantial number of high-quality\nimages. To tackle these challenges, the generation of synthetic biometric data\nhas gained traction due to its ability to protect privacy while enabling\neffective training of deep learning-based biometric verification methods. In\nthis paper, we present a new framework to synthesize forehead-crease image data\nwhile maintaining important features, such as uniqueness and realism. The\nproposed framework consists of two main modules: a Subject-Specific Generation\nModule (SSGM), based on an image-to-image Brownian Bridge Diffusion Model\n(BBDM), which learns a one-to-many mapping between image pairs to generate\nidentity-aware synthetic forehead creases corresponding to real subjects, and a\nSubject-Agnostic Generation Module (SAGM), which samples new synthetic\nidentities with assistance from the SSGM. 
We evaluate the diversity and realism\nof the generated forehead-crease images primarily using the Fr\\'echet Inception\nDistance (FID) and the Structural Similarity Index Measure (SSIM). In addition,\nwe assess the utility of synthetically generated forehead-crease images using a\nforehead-crease verification system (FHCVS). The results indicate an\nimprovement in the verification accuracy of the FHCVS by utilizing synthetic\ndata.\n","authors":["Abhishek Tandon","Geetanjali Sharma","Gaurav Jaswal","Aditya Nigam","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2408.15693v1.pdf","comment":"Accepted at Generative AI for Futuristic Biometrics - IJCB'24 Special\n Session"},{"id":"http://arxiv.org/abs/2408.15682v1","updated":"2024-08-28T10:12:44Z","published":"2024-08-28T10:12:44Z","title":"A quantitative model of takeover request time budget for conditionally\n automated driving","summary":" In conditional automation, the automated driving system assumes full control\nand only issues a takeover request to a human driver to resume driving in\ncritical situations. Previous studies have concluded that the time budget\nrequired by drivers to resume driving after a takeover request varies with\nsituations and different takeover variables. However, no comprehensive\ngeneralized approaches for estimating in advance the time budget required by\ndrivers to takeover have been provided. In this contribution, fixed (7 s) and\nvariable time budgets (6 s, 5 s, and 4 s) with and without visual imagery\nassistance were investigated for suitability in three takeover scenarios using\nperformance measures such as average lateral displacement. The results indicate\nthat 7 s is suitable for two of the studied scenarios based on their\ncharacteristics. Using the obtained results and known relations between\ntakeover variables, a mathematical formula for estimating takeover request time\nbudget is proposed. The proposed formula integrates individual stimulus\nresponse time, driving experience, scenario specific requirements and allows\nincreased safety for takeover maneuvers. Furthermore, the visual imagery\nresulted in increased takeover time which invariably increases the time budget.\nThus the time demand of the visualized information if applicable (such as\nvisual imagery) should be included in the time budget.\n","authors":["Foghor Tanshi","Dirk Söffker"],"pdf_url":"https://arxiv.org/pdf/2408.15682v1.pdf","comment":"Manuscript: 12 pages, 12 figures, 7 tables"},{"id":"http://arxiv.org/abs/2408.15679v1","updated":"2024-08-28T10:08:38Z","published":"2024-08-28T10:08:38Z","title":"DEAR: Depth-Enhanced Action Recognition","summary":" Detecting actions in videos, particularly within cluttered scenes, poses\nsignificant challenges due to the limitations of 2D frame analysis from a\ncamera perspective. Unlike human vision, which benefits from 3D understanding,\nrecognizing actions in such environments can be difficult. This research\nintroduces a novel approach integrating 3D features and depth maps alongside\nRGB features to enhance action recognition accuracy. Our method involves\nprocessing estimated depth maps through a separate branch from the RGB feature\nencoder and fusing the features to understand the scene and actions\ncomprehensively. Using the Side4Video framework and VideoMamba, which employ\nCLIP and VisionMamba for spatial feature extraction, our approach outperformed\nour implementation of the Side4Video network on the Something-Something V2\ndataset. 
Our code is available at: https://github.com/SadeghRahmaniB/DEAR\n","authors":["Sadegh Rahmaniboldaji","Filip Rybansky","Quoc Vuong","Frank Guerin","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2408.15679v1.pdf","comment":"5 pages, 1 figure, 1 table, accepted at Human-inspired Computer\n Vision, ECCV"},{"id":"http://arxiv.org/abs/2408.15678v1","updated":"2024-08-28T10:07:17Z","published":"2024-08-28T10:07:17Z","title":"Deep Learning Based Speckle Filtering for Polarimetric SAR Images.\n Application to Sentinel-1","summary":" Speckle suppression in synthetic aperture radar (SAR) images is a key\nprocessing step which continues to be a research topic. A wide variety of\nmethods, using either spatially-based approaches or transform-based strategies,\nhave been developed and have shown to provide outstanding results. However,\nrecent advances in deep learning techniques and their application to SAR image\ndespeckling have been demonstrated to offer state-of-the-art results.\nUnfortunately, they have been mostly applied to single-polarimetric images. The\nextension of a deep learning-based approach for speckle removal to polarimetric\nSAR (PolSAR) images is complicated because of the complex nature of the\nmeasured covariance matrices for every image pixel, the properties of which\nmust be preserved during filtering. In this work, we propose a complete\nframework to remove speckle in polarimetric SAR images using a convolutional\nneural network. The methodology includes a reversible transformation of the\noriginal complex covariance matrix to obtain a set of real-valued intensity\nbands which are fed to the neural network. In addition, the proposed method\nincludes a change detection strategy to avoid the neural network to learn\nerroneous features in areas strongly affected by temporal changes, so that the\nnetwork only learns the underlying speckle component present in the data. The\nmethod is implemented and tested with dual-polarimetric images acquired by\nSentinel-1. Experiments show that the proposed approach offers exceptional\nresults in both speckle reduction and resolution preservation. More\nimportantly, it is also shown that the neural network is not generating\nartifacts or introducing bias in the filtered images, making them suitable for\nfurther polarimetric processing and exploitation.\n","authors":["Alejandro Mestre-Quereda","Juan M. Lopez-Sanchez"],"pdf_url":"https://arxiv.org/pdf/2408.15678v1.pdf","comment":"23 pages, 32 figures"},{"id":"http://arxiv.org/abs/2312.03187v3","updated":"2024-08-28T10:00:01Z","published":"2023-12-05T23:33:49Z","title":"FERGI: Automatic Annotation of User Preferences for Text-to-Image\n Generation from Spontaneous Facial Expression Reaction","summary":" Researchers have proposed to use data of human preference feedback to\nfine-tune text-to-image generative models. However, the scalability of human\nfeedback collection has been limited by its reliance on manual annotation.\nTherefore, we develop and test a method to automatically score user preferences\nfrom their spontaneous facial expression reaction to the generated images. We\ncollect a dataset of Facial Expression Reaction to Generated Images (FERGI) and\nshow that the activations of multiple facial action units (AUs) are highly\ncorrelated with user evaluations of the generated images. 
We develop an FAU-Net\n(Facial Action Units Neural Network), which receives inputs from an AU\nestimation model, to automatically score user preferences for text-to-image\ngeneration based on their facial expression reactions, which is complementary\nto the pre-trained scoring models based on the input text prompts and generated\nimages. Integrating our FAU-Net valence score with the pre-trained scoring\nmodels improves their consistency with human preferences. This method of\nautomatic annotation with facial expression analysis can be potentially\ngeneralized to other generation tasks. The code is available at\nhttps://github.com/ShuangquanFeng/FERGI, and the dataset is also available at\nthe same link for research purposes.\n","authors":["Shuangquan Feng","Junhua Ma","Virginia R. de Sa"],"pdf_url":"https://arxiv.org/pdf/2312.03187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14930v2","updated":"2024-08-28T09:50:00Z","published":"2024-08-27T10:09:17Z","title":"CMTA: Cross-Modal Temporal Alignment for Event-guided Video Deblurring","summary":" Video deblurring aims to enhance the quality of restored results in\nmotion-blurred videos by effectively gathering information from adjacent video\nframes to compensate for the insufficient data in a single blurred frame.\nHowever, when faced with consecutively severe motion blur situations,\nframe-based video deblurring methods often fail to find accurate temporal\ncorrespondence among neighboring video frames, leading to diminished\nperformance. To address this limitation, we aim to solve the video deblurring\ntask by leveraging an event camera with micro-second temporal resolution. To\nfully exploit the dense temporal resolution of the event camera, we propose two\nmodules: 1) Intra-frame feature enhancement operates within the exposure time\nof a single blurred frame, iteratively enhancing cross-modality features in a\nrecurrent manner to better utilize the rich temporal information of events, 2)\nInter-frame temporal feature alignment gathers valuable long-range temporal\ninformation to target frames, aggregating sharp features leveraging the\nadvantages of the events. In addition, we present a novel dataset composed of\nreal-world blurred RGB videos, corresponding sharp videos, and event data. This\ndataset serves as a valuable resource for evaluating event-guided deblurring\nmethods. We demonstrate that our proposed methods outperform state-of-the-art\nframe-based and event-based motion deblurring methods through extensive\nexperiments conducted on both synthetic and real-world deblurring datasets. The\ncode and dataset are available at https://github.com/intelpro/CMTA.\n","authors":["Taewoo Kim","Hoonhee Cho","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2408.14930v2.pdf","comment":"Accepted in ECCV2024"},{"id":"http://arxiv.org/abs/2401.12471v2","updated":"2024-08-28T09:48:24Z","published":"2024-01-23T03:45:05Z","title":"Training-Free Action Recognition and Goal Inference with Dynamic Frame\n Selection","summary":" We introduce VidTFS, a Training-free, open-vocabulary video goal and action\ninference framework that combines the frozen vision foundational model (VFM)\nand large language model (LLM) with a novel dynamic Frame Selection module. Our\nexperiments demonstrate that the proposed frame selection module improves the\nperformance of the framework significantly. 
We validate the performance of the\nproposed VidTFS on four widely used video datasets, including CrossTask, COIN,\nUCF101, and ActivityNet, covering goal inference and action recognition tasks\nunder open-vocabulary settings without requiring any training or fine-tuning.\nThe results show that VidTFS outperforms pretrained and instruction-tuned\nmultimodal language models that directly stack LLM and VFM for downstream video\ninference tasks. Our VidTFS with its adaptability shows the future potential\nfor generalizing to new training-free video inference tasks.\n","authors":["Ee Yeo Keat","Zhang Hao","Alexander Matyasko","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.12471v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15048v2","updated":"2024-08-28T09:42:44Z","published":"2024-01-26T18:20:53Z","title":"Unrecognizable Yet Identifiable: Image Distortion with Preserved\n Embeddings","summary":" Biometric authentication systems play a crucial role in modern security\nsystems. However, maintaining the balance of privacy and integrity of stored\nbiometrics derivative data while achieving high recognition accuracy is often\nchallenging. Addressing this issue, we introduce an innovative image\ntransformation technique that effectively renders facial images unrecognizable\nto the eye while maintaining their identifiability by neural network models,\nwhich allows the distorted photo version to be stored for further verification.\nWhile initially intended for biometrics systems, the proposed methodology can\nbe used in various artificial intelligence applications to distort the visual\ndata and keep the derived features close. By experimenting with widely used\ndatasets LFW and MNIST, we show that it is possible to build the distortion\nthat changes the image content by more than 70% while maintaining the same\nrecognition accuracy. We compare our method with previously state-of-the-art\napproaches. We publically release the source code.\n","authors":["Dmytro Zakharov","Oleksandr Kuznetsov","Emanuele Frontoni"],"pdf_url":"https://arxiv.org/pdf/2401.15048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15667v1","updated":"2024-08-28T09:40:40Z","published":"2024-08-28T09:40:40Z","title":"Towards reliable respiratory disease diagnosis based on cough sounds and\n vision transformers","summary":" Recent advancements in deep learning techniques have sparked performance\nboosts in various real-world applications including disease diagnosis based on\nmulti-modal medical data. Cough sound data-based respiratory disease (e.g.,\nCOVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also\nattracted much attention. However, existing works usually utilise traditional\nmachine learning or deep models of moderate scales. On the other hand, the\ndeveloped approaches are trained and evaluated on small-scale data due to the\ndifficulty of curating and annotating clinical data on scale. To address these\nissues in prior works, we create a unified framework to evaluate various deep\nmodels from lightweight Convolutional Neural Networks (e.g., ResNet18) to\nmodern vision transformers and compare their performance in respiratory disease\nclassification. Based on the observations from such an extensive empirical\nstudy, we propose a novel approach to cough-based disease classification based\non both self-supervised and supervised learning on a large-scale cough data\nset. 
Experimental results demonstrate our proposed approach outperforms prior\narts consistently on two benchmark datasets for COVID-19 diagnosis and a\nproprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%.\n","authors":["Qian Wang","Zhaoyang Bu","Jiaxuan Mao","Wenyu Zhu","Jingya Zhao","Wei Du","Guochao Shi","Min Zhou","Si Chen","Jieming Qu"],"pdf_url":"https://arxiv.org/pdf/2408.15667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02044v4","updated":"2024-08-28T09:34:33Z","published":"2023-10-03T13:35:49Z","title":"How Physics and Background Attributes Impact Video Transformers in\n Robotic Manipulation: A Case Study on Planar Pushing","summary":" As model and dataset sizes continue to scale in robot learning, the need to\nunderstand how the composition and properties of a dataset affect model\nperformance becomes increasingly urgent to ensure cost-effective data\ncollection and model performance. In this work, we empirically investigate how\nphysics attributes (color, friction coefficient, shape) and scene background\ncharacteristics, such as the complexity and dynamics of interactions with\nbackground objects, influence the performance of Video Transformers in\npredicting planar pushing trajectories. We investigate three primary questions:\nHow do physics attributes and background scene characteristics influence model\nperformance? What kind of changes in attributes are most detrimental to model\ngeneralization? What proportion of fine-tuning data is required to adapt models\nto novel scenarios? To facilitate this research, we present\nCloudGripper-Push-1K, a large real-world vision-based robot pushing dataset\ncomprising 1278 hours and 460,000 videos of planar pushing interactions with\nobjects with different physics and background attributes. We also propose Video\nOcclusion Transformer (VOT), a generic modular video-transformer-based\ntrajectory prediction framework which features 3 choices of 2D-spatial encoders\nas the subject of our case study. The dataset and source code are available at\nhttps://cloudgripper.org.\n","authors":["Shutong Jin","Ruiyu Wang","Muhammad Zahid","Florian T. Pokorny"],"pdf_url":"https://arxiv.org/pdf/2310.02044v4.pdf","comment":"IEEE/RSJ IROS 2024"},{"id":"http://arxiv.org/abs/2408.15660v1","updated":"2024-08-28T09:22:32Z","published":"2024-08-28T09:22:32Z","title":"Merging and Splitting Diffusion Paths for Semantically Coherent\n Panoramas","summary":" Diffusion models have become the State-of-the-Art for text-to-image\ngeneration, and increasing research effort has been dedicated to adapting the\ninference process of pretrained diffusion models to achieve zero-shot\ncapabilities. An example is the generation of panorama images, which has been\ntackled in recent works by combining independent diffusion paths over\noverlapping latent features, which is referred to as joint diffusion, obtaining\nperceptually aligned panoramas. However, these methods often yield semantically\nincoherent outputs and trade-off diversity for uniformity. To overcome this\nlimitation, we propose the Merge-Attend-Diffuse operator, which can be plugged\ninto different types of pretrained diffusion models used in a joint diffusion\nsetting to improve the perceptual and semantical coherence of the generated\npanorama images. Specifically, we merge the diffusion paths, reprogramming\nself- and cross-attention to operate on the aggregated latent space. 
Extensive\nquantitative and qualitative experimental analysis, together with a user study,\ndemonstrate that our method maintains compatibility with the input prompt and\nvisual quality of the generated images while increasing their semantic\ncoherence. We release the code at https://github.com/aimagelab/MAD.\n","authors":["Fabio Quattrini","Vittorio Pippi","Silvia Cascianelli","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2408.15660v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2408.15657v1","updated":"2024-08-28T09:18:36Z","published":"2024-08-28T09:18:36Z","title":"TeFF: Tracking-enhanced Forgetting-free Few-shot 3D LiDAR Semantic\n Segmentation","summary":" In autonomous driving, 3D LiDAR plays a crucial role in understanding the\nvehicle's surroundings. However, the newly emerged, unannotated objects\npresents few-shot learning problem for semantic segmentation. This paper\naddresses the limitations of current few-shot semantic segmentation by\nexploiting the temporal continuity of LiDAR data. Employing a tracking model to\ngenerate pseudo-ground-truths from a sequence of LiDAR frames, our method\nsignificantly augments the dataset, enhancing the model's ability to learn on\nnovel classes. However, this approach introduces a data imbalance biased to\nnovel data that presents a new challenge of catastrophic forgetting. To\nmitigate this, we incorporate LoRA, a technique that reduces the number of\ntrainable parameters, thereby preserving the model's performance on base\nclasses while improving its adaptability to novel classes. This work represents\na significant step forward in few-shot 3D LiDAR semantic segmentation for\nautonomous driving. Our code is available at\nhttps://github.com/junbao-zhou/Track-no-forgetting.\n","authors":["Junbao Zhou","Jilin Mei","Pengze Wu","Liang Chen","Fangzhou Zhao","Xijun Zhao","Yu Hu"],"pdf_url":"https://arxiv.org/pdf/2408.15657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13123v2","updated":"2024-08-28T09:18:00Z","published":"2024-08-23T14:50:49Z","title":"Evidential Deep Partial Multi-View Classification With Discount Fusion","summary":" Incomplete multi-view data classification poses significant challenges due to\nthe common issue of missing views in real-world scenarios. Despite\nadvancements, existing methods often fail to provide reliable predictions,\nlargely due to the uncertainty of missing views and the inconsistent quality of\nimputed data. To tackle these problems, we propose a novel framework called\nEvidential Deep Partial Multi-View Classification (EDP-MVC). Initially, we use\nK-means imputation to address missing views, creating a complete set of\nmulti-view data. However, the potential conflicts and uncertainties within this\nimputed data can affect the reliability of downstream inferences. To manage\nthis, we introduce a Conflict-Aware Evidential Fusion Network (CAEFN), which\ndynamically adjusts based on the reliability of the evidence, ensuring\ntrustworthy discount fusion and producing reliable inference outcomes.\nComprehensive experiments on various benchmark datasets reveal EDP-MVC not only\nmatches but often surpasses the performance of state-of-the-art methods.\n","authors":["Haojian Huang","Zhe Liu","Sukumar Letchmunan","Muhammet Deveci","Mingwei Lin","Weizhong Wang"],"pdf_url":"https://arxiv.org/pdf/2408.13123v2.pdf","comment":"Ongoing work. 
13 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.15656v1","updated":"2024-08-28T09:17:25Z","published":"2024-08-28T09:17:25Z","title":"Realigned Softmax Warping for Deep Metric Learning","summary":" Deep Metric Learning (DML) loss functions traditionally aim to control the\nforces of separability and compactness within an embedding space so that the\nsame class data points are pulled together and different class ones are pushed\napart. Within the context of DML, a softmax operation will typically normalize\ndistances into a probability for optimization, thus coupling all the push/pull\nforces together. This paper proposes a potential new class of loss functions\nthat operate within a euclidean domain and aim to take full advantage of the\ncoupled forces governing embedding space formation under a softmax. These\nforces of compactness and separability can be boosted or mitigated within\ncontrolled locations at will by using a warping function. In this work, we\nprovide a simple example of a warping function and use it to achieve\ncompetitive, state-of-the-art results on various metric learning benchmarks.\n","authors":["Michael G. DeMoor","John J. Prevost"],"pdf_url":"https://arxiv.org/pdf/2408.15656v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.15119v2","updated":"2024-08-28T09:11:55Z","published":"2024-08-27T14:58:13Z","title":"Urdu Digital Text Word Optical Character Recognition Using Permuted Auto\n Regressive Sequence Modeling","summary":" This research paper presents a novel word-level Optical Character Recognition\n(OCR) model developed specifically for digital Urdu text. The model utilizes\ntransformer-based architectures and attention mechanisms to address the unique\nchallenges of recognizing Urdu script, which includes handling a diverse range\nof text styles, fonts, and variations. Trained on a comprehensive dataset of\napproximately 160,000 Urdu text images, the model incorporates a permuted\nautoregressive sequence (PARSeq) architecture. This design enables\ncontext-aware inference and iterative refinement by leveraging bidirectional\ncontext information, significantly enhancing its ability to accurately\nrecognize Urdu characters. The model achieves a character error rate (CER) of\n0.178, highlighting its effectiveness and precision in real-world applications.\nHowever, the model has some limitations, such as difficulties with blurred\nimages, non-horizontal orientations, and the presence of trailing punctuation\nmarks, which can introduce noise into the recognition process. Addressing these\nchallenges will be a key focus of future work. Future research will aim to\nfurther refine the model through advanced data augmentation techniques,\noptimization of hyperparameters, and the integration of context-aware language\nmodels, ultimately enhancing the model's performance and robustness in Urdu\ntext recognition.\n","authors":["Ahmed Mustafa","Muhammad Tahir Rafique","Muhammad Ijlal Baig","Hasan Sajid","Muhammad Jawad Khan","Karam Dad Kallu"],"pdf_url":"https://arxiv.org/pdf/2408.15119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00467v2","updated":"2024-08-28T09:11:40Z","published":"2024-03-01T11:45:29Z","title":"When ControlNet Meets Inexplicit Masks: A Case Study of ControlNet on\n its Contour-following Ability","summary":" ControlNet excels at creating content that closely matches precise contours\nin user-provided masks. 
However, when these masks contain noise, as a frequent\noccurrence with non-expert users, the output would include unwanted artifacts.\nThis paper first highlights the crucial role of controlling the impact of these\ninexplicit masks with diverse deterioration levels through in-depth analysis.\nSubsequently, to enhance controllability with inexplicit masks, an advanced\nShape-aware ControlNet consisting of a deterioration estimator and a\nshape-prior modulation block is devised. The deterioration estimator assesses\nthe deterioration factor of the provided masks. Then this factor is utilized in\nthe modulation block to adaptively modulate the model's contour-following\nability, which helps it dismiss the noise part in the inexplicit masks.\nExtensive experiments prove its effectiveness in encouraging ControlNet to\ninterpret inaccurate spatial conditions robustly rather than blindly following\nthe given contours. We showcase application scenarios like modifying shape\npriors and composable shape-controllable generation. Codes are soon available.\n","authors":["Wenjie Xuan","Yufei Xu","Shanshan Zhao","Chaoyue Wang","Juhua Liu","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2403.00467v2.pdf","comment":"Accepted by ACM-MM 2024"},{"id":"http://arxiv.org/abs/2401.11790v2","updated":"2024-08-28T09:09:34Z","published":"2024-01-22T09:40:52Z","title":"Deep Learning for Computer Vision based Activity Recognition and Fall\n Detection of the Elderly: a Systematic Review","summary":" As the percentage of elderly people in developed countries increases\nworldwide, the healthcare of this collective is a worrying matter, especially\nif it includes the preservation of their autonomy. In this direction, many\nstudies are being published on Ambient Assisted Living (AAL) systems, which\nhelp to reduce the preoccupations raised by the independent living of the\nelderly. In this study, a systematic review of the literature is presented on\nfall detection and Human Activity Recognition (HAR) for the elderly, as the two\nmain tasks to solve to guarantee the safety of elderly people living alone. To\naddress the current tendency to perform these two tasks, the review focuses on\nthe use of Deep Learning (DL) based approaches on computer vision data. In\naddition, different collections of data like DL models, datasets or hardware\n(e.g. depth or thermal cameras) are gathered from the reviewed studies and\nprovided for reference in future studies. Strengths and weaknesses of existing\napproaches are also discussed and, based on them, our recommendations for\nfuture works are provided.\n","authors":["F. Xavier Gaya-Morey","Cristina Manresa-Yee","Jose M. Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11790v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15651v1","updated":"2024-08-28T09:07:40Z","published":"2024-08-28T09:07:40Z","title":"Online pre-training with long-form videos","summary":" In this study, we investigate the impact of online pre-training with\ncontinuous video clips. We will examine three methods for pre-training (masked\nimage modeling, contrastive learning, and knowledge distillation), and assess\nthe performance on downstream action recognition tasks. As a result, online\npre-training with contrast learning showed the highest performance in\ndownstream tasks. 
Our findings suggest that learning from long-form videos can\nbe helpful for action recognition with short videos.\n","authors":["Itsuki Kato","Kodai Kamiya","Toru Tamaki"],"pdf_url":"https://arxiv.org/pdf/2408.15651v1.pdf","comment":"GCCE2024"},{"id":"http://arxiv.org/abs/2408.04957v3","updated":"2024-08-28T09:05:01Z","published":"2024-08-09T09:22:40Z","title":"LLaVA-VSD: Large Language-and-Vision Assistant for Visual Spatial\n Description","summary":" Visual Spatial Description (VSD) aims to generate texts that describe the\nspatial relationships between objects within images. Traditional visual spatial\nrelationship classification (VSRC) methods typically output the spatial\nrelationship between two objects in an image, often neglecting world knowledge\nand lacking general language capabilities. In this paper, we propose a Large\nLanguage-and-Vision Assistant for Visual Spatial Description, named LLaVA-VSD,\nwhich is designed for the classification, description, and open-ended\ndescription of visual spatial relationships. Specifically, the model first\nconstructs a VSD instruction-following dataset using given figure-caption pairs\nfor the three tasks. It then employs LoRA to fine-tune a Large Language and\nVision Assistant for VSD, which has 13 billion parameters and supports\nhigh-resolution images. Finally, a large language model (Qwen-2) is used to\nrefine the generated sentences, enhancing their diversity and accuracy.\nLLaVA-VSD demonstrates excellent multimodal conversational capabilities and can\nfollow open-ended instructions to assist with inquiries about object\nrelationships in images.\n","authors":["Yizhang Jin","Jian Li","Jiangning Zhang","Jianlong Hu","Zhenye Gan","Xin Tan","Yong Liu","Yabiao Wang","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2408.04957v3.pdf","comment":"We have discovered a significant error in the paper that affects the\n main conclusions. To ensure the accuracy of our research, we have decided to\n withdraw this paper and will resubmit it after making the necessary\n corrections"},{"id":"http://arxiv.org/abs/2408.15647v1","updated":"2024-08-28T09:01:55Z","published":"2024-08-28T09:01:55Z","title":"Leveraging Persistent Homology for Differential Diagnosis of Mild\n Cognitive Impairment","summary":" Mild cognitive impairment (MCI) is characterized by subtle changes in\ncognitive functions, often associated with disruptions in brain connectivity.\nThe present study introduces a novel fine-grained analysis to examine\ntopological alterations in neurodegeneration pertaining to six different brain\nnetworks of MCI subjects (Early/Late MCI). To achieve this, fMRI time series\nfrom two distinct populations are investigated: (i) the publicly accessible\nADNI dataset and (ii) our in-house dataset. The study utilizes sliding window\nembedding to convert each fMRI time series into a sequence of 3-dimensional\nvectors, facilitating the assessment of changes in regional brain topology.\nDistinct persistence diagrams are computed for Betti descriptors of\ndimension-0, 1, and 2. Wasserstein distance metric is used to quantify\ndifferences in topological characteristics. We have examined both (i)\nROI-specific inter-subject interactions and (ii) subject-specific inter-ROI\ninteractions. Further, a new deep learning model is proposed for\nclassification, achieving a maximum classification accuracy of 95% for the ADNI\ndataset and 85% for the in-house dataset. 
This methodology is further adapted\nfor the differential diagnosis of MCI sub-types, resulting in a peak accuracy\nof 76.5%, 91.1% and 80% in classifying HC Vs. EMCI, HC Vs. LMCI and EMCI Vs.\nLMCI, respectively. We showed that the proposed approach surpasses current\nstate-of-the-art techniques designed for classifying MCI and its sub-types\nusing fMRI.\n","authors":["Ninad Aithal","Debanjali Bhattacharya","Neelam Sinha","Thomas Gregor Issac"],"pdf_url":"https://arxiv.org/pdf/2408.15647v1.pdf","comment":"16 pages, 6 figures, 3 tables, accepted at International Conference\n on Pattern Recognition 2024"},{"id":"http://arxiv.org/abs/2402.09066v2","updated":"2024-08-28T09:01:37Z","published":"2024-02-14T10:24:04Z","title":"Solid Waste Detection, Monitoring and Mapping in Remote Sensing Images:\n A Survey","summary":" The detection and characterization of illegal solid waste disposal sites are\nessential for environmental protection, particularly for mitigating pollution\nand health hazards. Improperly managed landfills contaminate soil and\ngroundwater via rainwater infiltration, posing threats to both animals and\nhumans. Traditional landfill identification approaches, such as on-site\ninspections, are time-consuming and expensive. Remote sensing is a\ncost-effective solution for the identification and monitoring of solid waste\ndisposal sites that enables broad coverage and repeated acquisitions over time.\nEarth Observation (EO) satellites, equipped with an array of sensors and\nimaging capabilities, have been providing high-resolution data for several\ndecades. Researchers proposed specialized techniques that leverage remote\nsensing imagery to perform a range of tasks such as waste site detection,\ndumping site monitoring, and assessment of suitable locations for new\nlandfills. This review aims to provide a detailed illustration of the most\nrelevant proposals for the detection and monitoring of solid waste sites by\ndescribing and comparing the approaches, the implemented techniques, and the\nemployed data. Furthermore, since the data sources are of the utmost importance\nfor developing an effective solid waste detection model, a comprehensive\noverview of the satellites and publicly available data sets is presented.\nFinally, this paper identifies the open issues in the state-of-the-art and\ndiscusses the relevant research directions for reducing the costs and improving\nthe effectiveness of novel solid waste detection methods.\n","authors":["Piero Fraternali","Luca Morandini","Sergio Luis Herrera González"],"pdf_url":"https://arxiv.org/pdf/2402.09066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15646v1","updated":"2024-08-28T09:01:18Z","published":"2024-08-28T09:01:18Z","title":"μgat: Improving Single-Page Document Parsing by Providing Multi-Page\n Context","summary":" Regesta are catalogs of summaries of other documents and, in some cases, are\nthe only source of information about the content of such full-length documents.\nFor this reason, they are of great interest to scholars in many social and\nhumanities fields. In this work, we focus on Regesta Pontificum Romanum, a\nlarge collection of papal registers. 
Regesta are visually rich documents, where\nthe layout is as important as the text content to convey the contained\ninformation through the structure, and are inherently multi-page documents.\nAmong Digital Humanities techniques that can help scholars efficiently exploit\nregesta and other documental sources in the form of scanned documents, Document\nParsing has emerged as a task to process document images and convert them into\nmachine-readable structured representations, usually markup language. However,\ncurrent models focus on scientific and business documents, and most of them\nconsider only single-paged documents. To overcome this limitation, in this\nwork, we propose {\\mu}gat, an extension of the recently proposed Document\nparsing Nougat architecture, which can handle elements spanning over the single\npage limits. Specifically, we adapt Nougat to process a larger, multi-page\ncontext, consisting of the previous and the following page, while parsing the\ncurrent page. Experimental results, both qualitative and quantitative,\ndemonstrate the effectiveness of our proposed approach also in the case of the\nchallenging Regesta Pontificum Romanorum.\n","authors":["Fabio Quattrini","Carmine Zaccagnino","Silvia Cascianelli","Laura Righi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2408.15646v1.pdf","comment":"Accepted at ECCV Workshop \"AI4DH: Artificial Intelligence for Digital\n Humanities\""},{"id":"http://arxiv.org/abs/2408.15643v1","updated":"2024-08-28T08:53:33Z","published":"2024-08-28T08:53:33Z","title":"RIDE: Boosting 3D Object Detection for LiDAR Point Clouds via\n Rotation-Invariant Analysis","summary":" The rotation robustness property has drawn much attention to point cloud\nanalysis, whereas it still poses a critical challenge in 3D object detection.\nWhen subjected to arbitrary rotation, most existing detectors fail to produce\nexpected outputs due to the poor rotation robustness. In this paper, we present\nRIDE, a pioneering exploration of Rotation-Invariance for the 3D\nLiDAR-point-based object DEtector, with the key idea of designing\nrotation-invariant features from LiDAR scenes and then effectively\nincorporating them into existing 3D detectors. Specifically, we design a\nbi-feature extractor that extracts (i) object-aware features though sensitive\nto rotation but preserve geometry well, and (ii) rotation-invariant features,\nwhich lose geometric information to a certain extent but are robust to\nrotation. These two kinds of features complement each other to decode 3D\nproposals that are robust to arbitrary rotations. Particularly, our RIDE is\ncompatible and easy to plug into the existing one-stage and two-stage 3D\ndetectors, and boosts both detection performance and rotation robustness.\nExtensive experiments on the standard benchmarks showcase that the mean average\nprecision (mAP) and rotation robustness can be significantly boosted by\nintegrating with our RIDE, with +5.6% mAP and 53% rotation robustness\nimprovement on KITTI, +5.1% and 28% improvement correspondingly on nuScenes.\nThe code will be available soon.\n","authors":["Zhaoxuan Wang","Xu Han","Hongxin Liu","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2408.15643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15642v1","updated":"2024-08-28T08:53:20Z","published":"2024-08-28T08:53:20Z","title":"Can SAR improve RSVQA performance?","summary":" Remote sensing visual question answering (RSVQA) has been involved in several\nresearch in recent years, leading to an increase in new methods. 
RSVQA\nautomatically extracts information from satellite images, so far only optical,\nand a question to automatically search for the answer in the image and provide\nit in a textual form. In our research, we study whether Synthetic Aperture\nRadar (SAR) images can be beneficial to this field. We divide our study into\nthree phases which include classification methods and VQA. In the first one, we\nexplore the classification results of SAR alone and investigate the best method\nto extract information from SAR data. Then, we study the combination of SAR and\noptical data. In the last phase, we investigate how SAR images and a\ncombination of different modalities behave in RSVQA compared to a method only\nusing optical images. We conclude that adding the SAR modality leads to\nimproved performances, although further research on using SAR data to\nautomatically answer questions is needed as well as more balanced datasets.\n","authors":["Lucrezia Tosato","Sylvain Lobry","Flora Weissgerber","Laurent Wendling"],"pdf_url":"https://arxiv.org/pdf/2408.15642v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.15641v1","updated":"2024-08-28T08:52:33Z","published":"2024-08-28T08:52:33Z","title":"MMDRFuse: Distilled Mini-Model with Dynamic Refresh for Multi-Modality\n Image Fusion","summary":" In recent years, Multi-Modality Image Fusion (MMIF) has been applied to many\nfields, which has attracted many scholars to endeavour to improve the fusion\nperformance. However, the prevailing focus has predominantly been on the\narchitecture design, rather than the training strategies. As a low-level vision\ntask, image fusion is supposed to quickly deliver output images for observation\nand supporting downstream tasks. Thus, superfluous computational and storage\noverheads should be avoided. In this work, a lightweight Distilled Mini-Model\nwith a Dynamic Refresh strategy (MMDRFuse) is proposed to achieve this\nobjective. To pursue model parsimony, an extremely small convolutional network\nwith a total of 113 trainable parameters (0.44 KB) is obtained by three\ncarefully designed supervisions. First, digestible distillation is constructed\nby emphasising external spatial feature consistency, delivering soft\nsupervision with balanced details and saliency for the target network. Second,\nwe develop a comprehensive loss to balance the pixel, gradient, and perception\nclues from the source images. Third, an innovative dynamic refresh training\nstrategy is used to collaborate history parameters and current supervision\nduring training, together with an adaptive adjust function to optimise the\nfusion network. Extensive experiments on several public datasets demonstrate\nthat our method exhibits promising advantages in terms of model efficiency and\ncomplexity, with superior performance in multiple image fusion tasks and\ndownstream pedestrian detection application. 
The code of this work is publicly\navailable at https://github.com/yanglinDeng/MMDRFuse.\n","authors":["Yanglin Deng","Tianyang Xu","Chunyang Cheng","Xiao-Jun Wu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2408.15641v1.pdf","comment":"10 pages, 8 figures, accpeted by ACM International Conference on\n Multimedia 2024(Oral)"},{"id":"http://arxiv.org/abs/2407.02392v4","updated":"2024-08-28T08:49:57Z","published":"2024-07-02T16:10:55Z","title":"TokenPacker: Efficient Visual Projector for Multimodal LLM","summary":" The visual projector serves as an essential bridge between the visual encoder\nand the Large Language Model (LLM) in a Multimodal LLM (MLLM). Typically, MLLMs\nadopt a simple MLP to preserve all visual contexts via one-to-one\ntransformation. However, the visual tokens are redundant and can be\nconsiderably increased when dealing with high-resolution images, impairing the\nefficiency of MLLMs significantly. Some recent works have introduced resampler\nor abstractor to reduce the number of resulting visual tokens. Unfortunately,\nthey fail to capture finer details and undermine the visual reasoning\ncapabilities of MLLMs. In this work, we propose a novel visual projector, which\nadopts a coarse-to-fine scheme to inject the enriched characteristics to\ngenerate the condensed visual tokens. In specific, we first interpolate the\nvisual features as a low-resolution point query, providing the overall visual\nrepresentation as the foundation. Then, we introduce a region-to-point\ninjection module that utilizes high-resolution, multi-level region-based cues\nas fine-grained reference keys and values, allowing them to be fully absorbed\nwithin the corresponding local context region. This step effectively updates\nthe coarse point query, transforming it into an enriched one for the subsequent\nLLM reasoning. Extensive experiments demonstrate that our approach compresses\nthe visual tokens by 75%~89%, while achieves comparable or even better\nperformance across diverse benchmarks with significantly higher efficiency. The\nsource codes can be found at https://github.com/CircleRadon/TokenPacker.\n","authors":["Wentong Li","Yuqian Yuan","Jian Liu","Dongqi Tang","Song Wang","Jie Qin","Jianke Zhu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.02392v4.pdf","comment":"16 pages, Codes:https://github.com/CircleRadon/TokenPacker"},{"id":"http://arxiv.org/abs/2408.15637v1","updated":"2024-08-28T08:44:58Z","published":"2024-08-28T08:44:58Z","title":"Transfer Learning from Simulated to Real Scenes for Monocular 3D Object\n Detection","summary":" Accurately detecting 3D objects from monocular images in dynamic roadside\nscenarios remains a challenging problem due to varying camera perspectives and\nunpredictable scene conditions. This paper introduces a two-stage training\nstrategy to address these challenges. Our approach initially trains a model on\nthe large-scale synthetic dataset, RoadSense3D, which offers a diverse range of\nscenarios for robust feature learning. Subsequently, we fine-tune the model on\na combination of real-world datasets to enhance its adaptability to practical\nconditions. Experimental results of the Cube R-CNN model on challenging public\nbenchmarks show a remarkable improvement in detection performance, with a mean\naverage precision rising from 0.26 to 12.76 on the TUM Traffic A9 Highway\ndataset and from 2.09 to 6.60 on the DAIR-V2X-I dataset when performing\ntransfer learning. 
Code, data, and qualitative video results are available on\nthe project website: https://roadsense3d.github.io.\n","authors":["Sondos Mohamed","Walter Zimmer","Ross Greer","Ahmed Alaaeldin Ghita","Modesto Castrillón-Santana","Mohan Trivedi","Alois Knoll","Salvatore Mario Carta","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2408.15637v1.pdf","comment":"18 pages. Accepted for ECVA European Conference on Computer Vision\n 2024 (ECCV'24)"},{"id":"http://arxiv.org/abs/2401.11835v2","updated":"2024-08-28T08:38:43Z","published":"2024-01-22T10:52:02Z","title":"Unveiling the Human-like Similarities of Automatic Facial Expression\n Recognition: An Empirical Exploration through Explainable AI","summary":" Facial expression recognition is vital for human behavior analysis, and deep\nlearning has enabled models that can outperform humans. However, it is unclear\nhow closely they mimic human processing. This study aims to explore the\nsimilarity between deep neural networks and human perception by comparing\ntwelve different networks, including both general object classifiers and\nFER-specific models. We employ an innovative global explainable AI method to\ngenerate heatmaps, revealing crucial facial regions for the twelve networks\ntrained on six facial expressions. We assess these results both quantitatively\nand qualitatively, comparing them to ground truth masks based on Friesen and\nEkman's description and among them. We use Intersection over Union (IoU) and\nnormalized correlation coefficients for comparisons. We generate 72 heatmaps to\nhighlight critical regions for each expression and architecture. Qualitatively,\nmodels with pre-trained weights show more similarity in heatmaps compared to\nthose without pre-training. Specifically, eye and nose areas influence certain\nfacial expressions, while the mouth is consistently important across all models\nand expressions. Quantitatively, we find low average IoU values (avg. 0.2702)\nacross all expressions and architectures. The best-performing architecture\naverages 0.3269, while the worst-performing one averages 0.2066. Dendrograms,\nbuilt with the normalized correlation coefficient, reveal two main clusters for\nmost expressions: models with pre-training and models without pre-training.\nFindings suggest limited alignment between human and AI facial expression\nrecognition, with network architectures influencing the similarity, as similar\narchitectures prioritize similar facial regions.\n","authors":["F. Xavier Gaya-Morey","Silvia Ramis-Guarinos","Cristina Manresa-Yee","Jose M. Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11835v2.pdf","comment":"Multimed Tools Appl (2024)"},{"id":"http://arxiv.org/abs/2408.15045v2","updated":"2024-08-28T08:32:44Z","published":"2024-08-27T13:13:38Z","title":"DocLayLLM: An Efficient and Effective Multi-modal Extension of Large\n Language Models for Text-rich Document Understanding","summary":" Text-rich document understanding (TDU) refers to analyzing and comprehending\ndocuments containing substantial textual content. With the rapid evolution of\nlarge language models (LLMs), they have been widely leveraged for TDU due to\ntheir remarkable versatility and generalization. In this paper, we introduce\nDocLayLLM, an efficient and effective multi-modal extension of LLMs\nspecifically designed for TDU. 
By integrating visual patch tokens and 2D\npositional tokens into LLMs and encoding the document content using the LLMs\nthemselves, we fully take advantage of the document comprehension capability of\nLLMs and enhance their perception of OCR information. We have also deeply\nconsidered the role of the chain-of-thought (CoT) and innovatively proposed the\ntechniques of CoT Pre-training and CoT Annealing. Our DocLayLLM can achieve\nremarkable performances with lightweight training settings, showcasing its\nefficiency and effectiveness. Experimental results demonstrate that our\nDocLayLLM surpasses existing OCR-dependent methods and also outperforms\nOCR-free competitors.\n","authors":["Wenhui Liao","Jiapeng Wang","Hongliang Li","Chengyu Wang","Jun Huang","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2408.15045v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08454v2","updated":"2024-08-28T08:31:28Z","published":"2024-08-15T23:34:04Z","title":"Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention","summary":" The Transformer architecture has revolutionized deep learning through its\nSelf-Attention mechanism, which effectively captures contextual information.\nHowever, the memory footprint of Self-Attention presents significant challenges\nfor long-sequence tasks. Grouped Query Attention (GQA) addresses this issue by\ngrouping queries and mean-pooling the corresponding key-value heads - reducing\nthe number of overall parameters and memory requirements in a flexible manner\nwithout adversely compromising model accuracy. In this work, we introduce\nenhancements to GQA, focusing on two novel approaches that deviate from the\nstatic nature of grouping: Key-Distributed GQA (KDGQA) and Dynamic\nKey-Distributed GQA (DGQA), which leverage information from the norms of the\nkey heads to inform query allocation. Specifically, KDGQA looks at the ratios\nof the norms of the key heads during each forward pass, while DGQA examines the\nratios of the norms as they evolve through training. Additionally, we present\nPerturbed GQA (PGQA) as a case-study, which introduces variability in (static)\ngroup formation via subtracting noise from the attention maps. Our experiments\nwith up-trained Vision Transformers, for Image Classification on datasets such\nas CIFAR-10, CIFAR-100, Food101, and Tiny ImageNet, demonstrate the promise of\nthese variants in improving upon the original GQA through more informed and\nadaptive grouping mechanisms: specifically ViT-L experiences accuracy gains of\nup to 8% when utilizing DGQA in comparison to GQA and other variants. We\nfurther analyze the impact of the number of Key-Value Heads on performance,\nunderscoring the importance of utilizing query-key affinities. Code is\navailable on GitHub.\n","authors":["Zohaib Khan","Muhammad Khaquan","Omer Tafveez","Burhanuddin Samiwala","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08454v2.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15063v2","updated":"2024-08-28T08:28:50Z","published":"2024-08-27T13:47:31Z","title":"Adapting Segment Anything Model to Multi-modal Salient Object Detection\n with Semantic Feature Fusion Guidance","summary":" Although most existing multi-modal salient object detection (SOD) methods\ndemonstrate effectiveness through training models from scratch, the limited\nmulti-modal data hinders these methods from reaching optimality. 
In this paper,\nwe propose a novel framework to explore and exploit the powerful feature\nrepresentation and zero-shot generalization ability of the pre-trained Segment\nAnything Model (SAM) for multi-modal SOD. Despite serving as a recent vision\nfundamental model, driving the class-agnostic SAM to comprehend and detect\nsalient objects accurately is non-trivial, especially in challenging scenes. To\nthis end, we develop \\underline{SAM} with se\\underline{m}antic\nf\\underline{e}ature fu\\underline{s}ion guidanc\\underline{e} (Sammese), which\nincorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to\nmulti-modal SOD tasks. However, it is difficult for SAM trained on single-modal\ndata to directly mine the complementary benefits of multi-modal inputs and\ncomprehensively utilize them to achieve accurate saliency prediction.To address\nthese issues, we first design a multi-modal complementary fusion module to\nextract robust multi-modal semantic features by integrating information from\nvisible and thermal or depth image pairs. Then, we feed the extracted\nmulti-modal semantic features into both the SAM image encoder and mask decoder\nfor fine-tuning and prompting, respectively. Specifically, in the image\nencoder, a multi-modal adapter is proposed to adapt the single-modal SAM to\nmulti-modal information. In the mask decoder, a semantic-geometric prompt\ngeneration strategy is proposed to produce corresponding embeddings with\nvarious saliency cues. Extensive experiments on both RGB-D and RGB-T SOD\nbenchmarks show the effectiveness of the proposed framework.\n","authors":["Kunpeng Wang","Danying Lin","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15063v2.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15628v1","updated":"2024-08-28T08:27:41Z","published":"2024-08-28T08:27:41Z","title":"CSAD: Unsupervised Component Segmentation for Logical Anomaly Detection","summary":" To improve logical anomaly detection, some previous works have integrated\nsegmentation techniques with conventional anomaly detection methods. Although\nthese methods are effective, they frequently lead to unsatisfactory\nsegmentation results and require manual annotations. To address these\ndrawbacks, we develop an unsupervised component segmentation technique that\nleverages foundation models to autonomously generate training labels for a\nlightweight segmentation network without human labeling. Integrating this new\nsegmentation technique with our proposed Patch Histogram module and the\nLocal-Global Student-Teacher (LGST) module, we achieve a detection AUROC of\n95.3% in the MVTec LOCO AD dataset, which surpasses previous SOTA methods.\nFurthermore, our proposed method provides lower latency and higher throughput\nthan most existing approaches.\n","authors":["Yu-Hsuan Hsieh","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.15628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15626v1","updated":"2024-08-28T08:25:41Z","published":"2024-08-28T08:25:41Z","title":"Can Visual Language Models Replace OCR-Based Visual Question Answering\n Pipelines in Production? A Case Study in Retail","summary":" Most production-level deployments for Visual Question Answering (VQA) tasks\nare still build as processing pipelines of independent steps including image\npre-processing, object- and text detection, Optical Character Recognition (OCR)\nand (mostly supervised) object classification. 
However, the recent advances in\nvision Foundation Models [25] and Vision Language Models (VLMs) [23] raise the\nquestion if these custom trained, multi-step approaches can be replaced with\npre-trained, single-step VLMs. This paper analyzes the performance and limits\nof various VLMs in the context of VQA and OCR [5, 9, 12] tasks in a\nproduction-level scenario. Using data from the Retail-786k [10] dataset, we\ninvestigate the capabilities of pre-trained VLMs to answer detailed questions\nabout advertised products in images. Our study includes two commercial models,\nGPT-4V [16] and GPT-4o [17], as well as four open-source models: InternVL [5],\nLLaVA 1.5 [12], LLaVA-NeXT [13], and CogAgent [9]. Our initial results show,\nthat there is in general no big performance gap between open-source and\ncommercial models. However, we observe a strong task dependent variance in VLM\nperformance: while most models are able to answer questions regarding the\nproduct brand and price with high accuracy, they completely fail at the same\ntime to correctly identity the specific product name or discount. This\nindicates the problem of VLMs to solve fine-grained classification tasks as\nwell to model the more abstract concept of discounts.\n","authors":["Bianca Lamm","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2408.15626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15608v1","updated":"2024-08-28T08:02:47Z","published":"2024-08-28T08:02:47Z","title":"Geometry-guided Feature Learning and Fusion for Indoor Scene\n Reconstruction","summary":" In addition to color and textural information, geometry provides important\ncues for 3D scene reconstruction. However, current reconstruction methods only\ninclude geometry at the feature level thus not fully exploiting the geometric\ninformation.\n In contrast, this paper proposes a novel geometry integration mechanism for\n3D scene reconstruction. Our approach incorporates 3D geometry at three levels,\ni.e. feature learning, feature fusion, and network supervision. First,\ngeometry-guided feature learning encodes geometric priors to contain\nview-dependent information. Second, a geometry-guided adaptive feature fusion\nis introduced which utilizes the geometric priors as a guidance to adaptively\ngenerate weights for multiple views. Third, at the supervision level, taking\nthe consistency between 2D and 3D normals into account, a consistent 3D normal\nloss is designed to add local constraints.\n Large-scale experiments are conducted on the ScanNet dataset, showing that\nvolumetric methods with our geometry integration mechanism outperform\nstate-of-the-art methods quantitatively as well as qualitatively. Volumetric\nmethods with ours also show good generalization on the 7-Scenes and TUM RGB-D\ndatasets.\n","authors":["Ruihong Yin","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2408.15608v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2408.15605v1","updated":"2024-08-28T07:56:28Z","published":"2024-08-28T07:56:28Z","title":"ES-PTAM: Event-based Stereo Parallel Tracking and Mapping","summary":" Visual Odometry (VO) and SLAM are fundamental components for spatial\nperception in mobile robots. Despite enormous progress in the field, current\nVO/SLAM systems are limited by their sensors' capability. 
Event cameras are\nnovel visual sensors that offer advantages to overcome the limitations of\nstandard cameras, enabling robots to expand their operating range to\nchallenging scenarios, such as high-speed motion and high dynamic range\nillumination. We propose a novel event-based stereo VO system by combining two\nideas: a correspondence-free mapping module that estimates depth by maximizing\nray density fusion and a tracking module that estimates camera poses by\nmaximizing edge-map alignment. We evaluate the system comprehensively on five\nreal-world datasets, spanning a variety of camera types (manufacturers and\nspatial resolutions) and scenarios (driving, flying drone, hand-held,\negocentric, etc). The quantitative and qualitative results demonstrate that our\nmethod outperforms the state of the art in majority of the test sequences by a\nmargin, e.g., trajectory error reduction of 45% on RPG dataset, 61% on DSEC\ndataset, and 21% on TUM-VIE dataset. To benefit the community and foster\nresearch on event-based perception systems, we release the source code and\nresults: https://github.com/tub-rip/ES-PTAM\n","authors":["Suman Ghosh","Valentina Cavinato","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2408.15605v1.pdf","comment":"17 pages, 7 figures, 4 tables, https://github.com/tub-rip/ES-PTAM"},{"id":"http://arxiv.org/abs/2408.08091v2","updated":"2024-08-28T07:51:34Z","published":"2024-08-15T11:34:33Z","title":"HAIR: Hypernetworks-based All-in-One Image Restoration","summary":" Image restoration aims to recover a high-quality clean image from its\ndegraded version. Recent progress in image restoration has demonstrated the\neffectiveness of All-in-One image restoration models in addressing various\ndegradations simultaneously. However, these existing methods typically utilize\nthe same parameters to tackle images with different degradation types, thus\nforcing the model to balance the performance between different tasks and\nlimiting its performance on each task. To alleviate this issue, we propose\nHAIR, a \\textbf{H}ypernetworks-based \\textbf{A}ll-in-One \\textbf{I}mage\n\\textbf{R}estoration method that dynamically generates parameters based on\ninput images. Specifically, HAIR consists of two main components, i.e.,\nClassifier and Hyper Selecting Net (HSN). The Classifier is a simple image\nclassification network used to generate a Global Information Vector (GIV) that\ncontains the degradation information of the input image, and the HSN is a\nsimple fully-connected neural network that receives the GIV and outputs\nparameters for the corresponding modules. Extensive experiments demonstrate\nthat HAIR can significantly improve the performance of existing image\nrestoration models in a plug-and-play manner, both in single-task and\nall-in-one settings. Notably, our innovative model, Res-HAIR, which integrates\nHAIR into the well-known Restormer, can obtain superior or comparable\nperformance compared with current state-of-the-art methods. Moreover, we\ntheoretically demonstrate that our proposed HAIR requires fewer parameters in\ncontrast to the prevalent All-in-One methodologies. 
The code is available at\n\\textcolor{blue}{\\href{https://github.com/toummHus/HAIR}{https://github.com/toummHus/HAIR}.}\n","authors":["Jin Cao","Yi Cao","Li Pang","Deyu Meng","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2408.08091v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2408.15602v1","updated":"2024-08-28T07:49:30Z","published":"2024-08-28T07:49:30Z","title":"On the Benefits of Visual Stabilization for Frame- and Event-based\n Perception","summary":" Vision-based perception systems are typically exposed to large orientation\nchanges in different robot applications. In such conditions, their performance\nmight be compromised due to the inherent complexity of processing data captured\nunder challenging motion. Integration of mechanical stabilizers to compensate\nfor the camera rotation is not always possible due to the robot payload\nconstraints. This paper presents a processing-based stabilization approach to\ncompensate the camera's rotational motion both on events and on frames (i.e.,\nimages). Assuming that the camera's attitude is available, we evaluate the\nbenefits of stabilization in two perception applications: feature tracking and\nestimating the translation component of the camera's ego-motion. The validation\nis performed using synthetic data and sequences from well-known event-based\nvision datasets. The experiments unveil that stabilization can improve feature\ntracking and camera ego-motion estimation accuracy in 27.37% and 34.82%,\nrespectively. Concurrently, stabilization can reduce the processing time of\ncomputing the camera's linear velocity by at least 25%. Code is available at\nhttps://github.com/tub-rip/visual_stabilization\n","authors":["Juan Pablo Rodriguez-Gomez","Jose Ramiro Martinez-de Dios","Anibal Ollero","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2408.15602v1.pdf","comment":"8 pages, 4 figures, 4 tables,\n https://github.com/tub-rip/visual_stabilization"},{"id":"http://arxiv.org/abs/2407.10389v2","updated":"2024-08-28T07:49:14Z","published":"2024-07-15T01:58:54Z","title":"Boost Your NeRF: A Model-Agnostic Mixture of Experts Framework for High\n Quality and Efficient Rendering","summary":" Since the introduction of NeRFs, considerable attention has been focused on\nimproving their training and inference times, leading to the development of\nFast-NeRFs models. Despite demonstrating impressive rendering speed and\nquality, the rapid convergence of such models poses challenges for further\nimproving reconstruction quality. Common strategies to improve rendering\nquality involves augmenting model parameters or increasing the number of\nsampled points. However, these computationally intensive approaches encounter\nlimitations in achieving significant quality enhancements. This study\nintroduces a model-agnostic framework inspired by Sparsely-Gated Mixture of\nExperts to enhance rendering quality without escalating computational\ncomplexity. Our approach enables specialization in rendering different scene\ncomponents by employing a mixture of experts with varying resolutions. We\npresent a novel gate formulation designed to maximize expert capabilities and\npropose a resolution-based routing technique to effectively induce sparsity and\ndecompose scenes. 
Our work significantly improves reconstruction quality while\nmaintaining competitive performance.\n","authors":["Francesco Di Sario","Riccardo Renzulli","Enzo Tartaglione","Marco Grangetto"],"pdf_url":"https://arxiv.org/pdf/2407.10389v2.pdf","comment":"The paper has been accepted to the ECCV 2024 conference"},{"id":"http://arxiv.org/abs/2407.20495v2","updated":"2024-08-28T07:40:18Z","published":"2024-07-30T01:39:30Z","title":"Enhancing Quantitative Image Synthesis through Pretraining and\n Resolution Scaling for Bone Mineral Density Estimation from a Plain X-ray\n Image","summary":" While most vision tasks are essentially visual in nature (for recognition),\nsome important tasks, especially in the medical field, also require\nquantitative analysis (for quantification) using quantitative images. Unlike in\nvisual analysis, pixel values in quantitative images correspond to physical\nmetrics measured by specific devices (e.g., a depth image). However, recent\nwork has shown that it is sometimes possible to synthesize accurate\nquantitative values from visual ones (e.g., depth from visual cues or defocus).\nThis research aims to improve quantitative image synthesis (QIS) by exploring\npretraining and image resolution scaling. We propose a benchmark for evaluating\npretraining performance using the task of QIS-based bone mineral density (BMD)\nestimation from plain X-ray images, where the synthesized quantitative image is\nused to derive BMD. Our results show that appropriate pretraining can improve\nQIS performance, significantly raising the correlation of BMD estimation from\n0.820 to 0.898, while others do not help or even hinder it. Scaling-up the\nresolution can further boost the correlation up to 0.923, a significant\nenhancement over conventional methods. Future work will include exploring more\npretraining strategies and validating them on other image synthesis tasks.\n","authors":["Yi Gu","Yoshito Otake","Keisuke Uemura","Masaki Takao","Mazen Soufi","Seiji Okada","Nobuhiko Sugano","Hugues Talbot","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2407.20495v2.pdf","comment":"SASHIMI, 2024 (MICCAI workshop). 13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.14668v4","updated":"2024-08-28T07:28:15Z","published":"2023-05-24T03:20:09Z","title":"NOVUM: Neural Object Volumes for Robust Object Classification","summary":" Discriminative models for object classification typically learn image-based\nrepresentations that do not capture the compositional and 3D nature of objects.\nIn this work, we show that explicitly integrating 3D compositional object\nrepresentations into deep networks for image classification leads to a largely\nenhanced generalization in out-of-distribution scenarios. In particular, we\nintroduce a novel architecture, referred to as NOVUM, that consists of a\nfeature extractor and a neural object volume for every target object class.\nEach neural object volume is a composition of 3D Gaussians that emit feature\nvectors. This compositional object representation allows for a highly robust\nand fast estimation of the object class by independently matching the features\nof the 3D Gaussians of each category to features extracted from an input image.\nAdditionally, the object pose can be estimated via inverse rendering of the\ncorresponding neural object volume. 
To enable the classification of objects,\nthe neural features at each 3D Gaussian are trained discriminatively to be\ndistinct from (i) the features of 3D Gaussians in other categories, (ii)\nfeatures of other 3D Gaussians of the same object, and (iii) the background\nfeatures. Our experiments show that NOVUM offers intriguing advantages over\nstandard architectures due to the 3D compositional structure of the object\nrepresentation, namely: (1) An exceptional robustness across a spectrum of\nreal-world and synthetic out-of-distribution shifts and (2) an enhanced human\ninterpretability compared to standard models, all while maintaining real-time\ninference and a competitive accuracy on in-distribution data.\n","authors":["Artur Jesslen","Guofeng Zhang","Angtian Wang","Wufei Ma","Alan Yuille","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2305.14668v4.pdf","comment":"14 pages, 4 figures, accepted at ECCV 2024, code is accessible at\n https://github.com/GenIntel/NOVUM"},{"id":"http://arxiv.org/abs/2405.15239v3","updated":"2024-08-28T07:07:06Z","published":"2024-05-24T06:06:11Z","title":"Brain3D: Generating 3D Objects from fMRI","summary":" Understanding the hidden mechanisms behind human's visual perception is a\nfundamental question in neuroscience. To that end, investigating into the\nneural responses of human mind activities, such as functional Magnetic\nResonance Imaging (fMRI), has been a significant research vehicle. However,\nanalyzing fMRI signals is challenging, costly, daunting, and demanding for\nprofessional training. Despite remarkable progress in fMRI analysis, existing\napproaches are limited to generating 2D images and far away from being\nbiologically meaningful and practically useful. Under this insight, we propose\nto generate visually plausible and functionally more comprehensive 3D outputs\ndecoded from brain signals, enabling more sophisticated modeling of fMRI data.\nConceptually, we reformulate this task as a {\\em fMRI conditioned 3D object\ngeneration} problem. We design a novel 3D object representation learning\nmethod, Brain3D, that takes as input the fMRI data of a subject who was\npresented with a 2D image, and yields as output the corresponding 3D object\nimages. The key capabilities of this model include tackling the noises with\nhigh-level semantic signals and a two-stage architecture design for progressive\nhigh-level information integration. Extensive experiments validate the superior\ncapability of our model over previous state-of-the-art 3D object generation\nmethods. Importantly, we show that our model captures the distinct\nfunctionalities of each region of human vision system as well as their\nintricate interplay relationships, aligning remarkably with the established\ndiscoveries in neuroscience. Further, preliminary evaluations indicate that\nBrain3D can successfully identify the disordered brain regions in simulated\nscenarios, such as V1, V2, V3, V4, and the medial temporal lobe (MTL) within\nthe human visual system. 
Our data and code will be available at\nhttps://brain-3d.github.io/.\n","authors":["Yuankun Yang","Li Zhang","Ziyang Xie","Zhiyuan Yuan","Jianfeng Feng","Xiatian Zhu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2405.15239v3.pdf","comment":"20 pages, 11 figures, project page: https://brain-3d.github.io/"},{"id":"http://arxiv.org/abs/2408.15580v1","updated":"2024-08-28T07:05:46Z","published":"2024-08-28T07:05:46Z","title":"Hierarchical Visual Categories Modeling: A Joint Representation Learning\n and Density Estimation Framework for Out-of-Distribution Detection","summary":" Detecting out-of-distribution inputs for visual recognition models has become\ncritical in safe deep learning. This paper proposes a novel hierarchical visual\ncategory modeling scheme to separate out-of-distribution data from\nin-distribution data through joint representation learning and statistical\nmodeling. We learn a mixture of Gaussian models for each in-distribution\ncategory. There are many Gaussian mixture models to model different visual\ncategories. With these Gaussian models, we design an in-distribution score\nfunction by aggregating multiple Mahalanobis-based metrics. We don't use any\nauxiliary outlier data as training samples, which may hurt the generalization\nability of out-of-distribution detection algorithms. We split the ImageNet-1k\ndataset into ten folds randomly. We use one fold as the in-distribution dataset\nand the others as out-of-distribution datasets to evaluate the proposed method.\nWe also conduct experiments on seven popular benchmarks, including CIFAR,\niNaturalist, SUN, Places, Textures, ImageNet-O, and OpenImage-O. Extensive\nexperiments indicate that the proposed method outperforms state-of-the-art\nalgorithms clearly. Meanwhile, we find that our visual representation has a\ncompetitive performance when compared with features learned by classical\nmethods. These results demonstrate that the proposed method hasn't weakened the\ndiscriminative ability of visual recognition models and keeps high efficiency\nin detecting out-of-distribution samples.\n","authors":["Jinglun Li","Xinyu Zhou","Pinxue Guo","Yixuan Sun","Yiwen Huang","Weifeng Ge","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15580v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2408.13509v2","updated":"2024-08-28T06:53:46Z","published":"2024-08-24T08:09:32Z","title":"DualAnoDiff: Dual-Interrelated Diffusion Model for Few-Shot Anomaly\n Image Generation","summary":" The performance of anomaly inspection in industrial manufacturing is\nconstrained by the scarcity of anomaly data. To overcome this challenge,\nresearchers have started employing anomaly generation approaches to augment the\nanomaly dataset. However, existing anomaly generation methods suffer from\nlimited diversity in the generated anomalies and struggle to achieve a seamless\nblending of this anomaly with the original image. In this paper, we overcome\nthese challenges from a new perspective, simultaneously generating a pair of\nthe overall image and the corresponding anomaly part. We propose DualAnoDiff, a\nnovel diffusion-based few-shot anomaly image generation model, which can\ngenerate diverse and realistic anomaly images by using a dual-interrelated\ndiffusion model, where one of them is employed to generate the whole image\nwhile the other one generates the anomaly part. Moreover, we extract background\nand shape information to mitigate the distortion and blurriness phenomenon in\nfew-shot image generation. 
Extensive experiments demonstrate the superiority of\nour proposed model over state-of-the-art methods in terms of both realism and\ndiversity. Overall, our approach significantly improves the performance of\ndownstream anomaly detection tasks, including anomaly detection, anomaly\nlocalization, and anomaly classification tasks.\n","authors":["Ying Jin","Jinlong Peng","Qingdong He","Teng Hu","Hao Chen","Jiafu Wu","Wenbing Zhu","Mingmin Chi","Jun Liu","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.13509v2.pdf","comment":"Code: https://github.com/yinyjin/DualAnoDiff"},{"id":"http://arxiv.org/abs/2408.15569v1","updated":"2024-08-28T06:53:08Z","published":"2024-08-28T06:53:08Z","title":"Temporal Attention for Cross-View Sequential Image Localization","summary":" This paper introduces a novel approach to enhancing cross-view localization,\nfocusing on the fine-grained, sequential localization of street-view images\nwithin a single known satellite image patch, a significant departure from\ntraditional one-to-one image retrieval methods. By expanding to sequential\nimage fine-grained localization, our model, equipped with a novel Temporal\nAttention Module (TAM), leverages contextual information to significantly\nimprove sequential image localization accuracy. Our method shows substantial\nreductions in both mean and median localization errors on the Cross-View Image\nSequence (CVIS) dataset, outperforming current state-of-the-art single-image\nlocalization techniques. Additionally, by adapting the KITTI-CVL dataset into\nsequential image sets, we not only offer a more realistic dataset for future\nresearch but also demonstrate our model's robust generalization capabilities\nacross varying times and areas, evidenced by a 75.3% reduction in mean distance\nerror in cross-view sequential image localization.\n","authors":["Dong Yuan","Frederic Maire","Feras Dayoub"],"pdf_url":"https://arxiv.org/pdf/2408.15569v1.pdf","comment":"Accepted to IROS 2024"},{"id":"http://arxiv.org/abs/2408.15566v1","updated":"2024-08-28T06:37:59Z","published":"2024-08-28T06:37:59Z","title":"TagOOD: A Novel Approach to Out-of-Distribution Detection via\n Vision-Language Representations and Class Center Learning","summary":" Multimodal fusion, leveraging data like vision and language, is rapidly\ngaining traction. This enriched data representation improves performance across\nvarious tasks. Existing methods for out-of-distribution (OOD) detection, a\ncritical area where AI models encounter unseen data in real-world scenarios,\nrely heavily on whole-image features. These image-level features can include\nirrelevant information that hinders the detection of OOD samples, ultimately\nlimiting overall performance. In this paper, we propose \\textbf{TagOOD}, a\nnovel approach for OOD detection that leverages vision-language representations\nto achieve label-free object feature decoupling from whole images. This\ndecomposition enables a more focused analysis of object semantics, enhancing\nOOD detection performance. Subsequently, TagOOD trains a lightweight network on\nthe extracted object features to learn representative class centers. These\ncenters capture the central tendencies of IND object classes, minimizing the\ninfluence of irrelevant image features during OOD detection. Finally, our\napproach efficiently detects OOD samples by calculating distance-based metrics\nas OOD scores between learned centers and test samples. 
We conduct extensive\nexperiments to evaluate TagOOD on several benchmark datasets and demonstrate\nits superior performance compared to existing OOD detection methods. This work\npresents a novel perspective for further exploration of multimodal information\nutilization in OOD detection, with potential applications across various tasks.\n","authors":["Jinglun Li","Xinyu Zhou","Kaixun Jiang","Lingyi Hong","Pinxue Guo","Zhaoyu Chen","Weifeng Ge","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15566v1.pdf","comment":"Accepted by ACMMM2024"},{"id":"http://arxiv.org/abs/2408.15557v1","updated":"2024-08-28T06:18:55Z","published":"2024-08-28T06:18:55Z","title":"Generalization Capabilities of Neural Cellular Automata for Medical\n Image Segmentation: A Robust and Lightweight Approach","summary":" In the field of medical imaging, the U-Net architecture, along with its\nvariants, has established itself as a cornerstone for image segmentation tasks,\nparticularly due to its strong performance when trained on limited datasets.\nDespite its impressive performance on identically distributed (in-domain) data,\nU-Nets exhibit a significant decline in performance when tested on data that\ndeviates from the training distribution, out-of-distribution (out-of-domain)\ndata. Current methodologies predominantly address this issue by employing\ngeneralization techniques that hinge on various forms of regularization, which\nhave demonstrated moderate success in specific scenarios. This paper, however,\nventures into uncharted territory by investigating the implications of\nutilizing models that are smaller by three orders of magnitude (i.e., x1000)\ncompared to a conventional U-Net. A reduction of this size in U-net parameters\ntypically adversely affects both in-domain and out-of-domain performance,\npossibly due to a significantly reduced receptive field. To circumvent this\nissue, we explore the concept of Neural Cellular Automata (NCA), which, despite\nits simpler model structure, can attain larger receptive fields through\nrecursive processes. Experimental results on two distinct datasets reveal that\nNCA outperforms traditional methods in terms of generalization, while still\nmaintaining a commendable IID performance.\n","authors":["Steven Korevaar","Ruwan Tennakoon","Alireza Bab-Hadiashar"],"pdf_url":"https://arxiv.org/pdf/2408.15557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13134v2","updated":"2024-08-28T06:15:04Z","published":"2023-11-22T03:41:13Z","title":"Lightweight High-Speed Photography Built on Coded Exposure and Implicit\n Neural Representation of Videos","summary":" The demand for compact cameras capable of recording high-speed scenes with\nhigh resolution is steadily increasing. However, achieving such capabilities\noften entails high bandwidth requirements, resulting in bulky, heavy systems\nunsuitable for low-capacity platforms. To address this challenge, leveraging a\ncoded exposure setup to encode a frame sequence into a blurry snapshot and\nsubsequently retrieve the latent sharp video presents a lightweight solution.\nNevertheless, restoring motion from blur remains a formidable challenge due to\nthe inherent ill-posedness of motion blur decomposition, the intrinsic\nambiguity in motion direction, and the diverse motions present in natural\nvideos. In this study, we propose a novel approach to address these challenges\nby combining the classical coded exposure imaging technique with the emerging\nimplicit neural representation for videos. 
We strategically embed motion\ndirection cues into the blurry image during the imaging process. Additionally,\nwe develop a novel implicit neural representation based blur decomposition\nnetwork to sequentially extract the latent video frames from the blurry image,\nleveraging the embedded motion direction cues. To validate the effectiveness\nand efficiency of our proposed framework, we conduct extensive experiments\nusing benchmark datasets and real-captured blurry images. The results\ndemonstrate that our approach significantly outperforms existing methods in\nterms of both quality and flexibility. The code for our work is available at\n.https://github.com/zhihongz/BDINR\n","authors":["Zhihong Zhang","Runzhao Yang","Jinli Suo","Yuxiao Cheng","Qionghai Dai"],"pdf_url":"https://arxiv.org/pdf/2311.13134v2.pdf","comment":"Accepted by IJCV"},{"id":"http://arxiv.org/abs/2408.15556v1","updated":"2024-08-28T06:09:02Z","published":"2024-08-28T06:09:02Z","title":"Divide, Conquer and Combine: A Training-Free Framework for\n High-Resolution Image Perception in Multimodal Large Language Models","summary":" Multimodal large language models (MLLMs) have experienced significant\nadvancements recently, but still struggle to recognize and interpret intricate\ndetails in high-resolution (HR) images effectively. While state-of-the-art\n(SOTA) MLLMs claim to process images at 4K resolution, existing MLLM benchmarks\nonly support up to 2K, leaving the capabilities of SOTA models on true HR\nimages largely untested. Furthermore, existing methods for enhancing HR image\nperception in MLLMs rely on computationally expensive visual instruction\ntuning. To address these limitations, we introduce HR-Bench, the first\ndeliberately designed benchmark to rigorously evaluate MLLM performance on\n4K&8K images. Through extensive experiments, we demonstrate that while\ndownsampling HR images leads to vision information loss, leveraging\ncomplementary modalities, e.g., text, can effectively compensate for this loss.\nBuilding upon this insight, we propose Divide, Conquer and Combine (DC$^2$), a\nnovel training-free framework for enhancing MLLM perception of HR images.\nDC$^2$ follows a three-staged approach: 1) Divide: recursively partitioning the\nHR image into patches and merging similar patches to minimize computational\noverhead, 2) Conquer: leveraging the MLLM to generate accurate textual\ndescriptions for each image patch, and 3) Combine: utilizing the generated text\ndescriptions to enhance the MLLM's understanding of the overall HR image.\nExtensive experiments show that: 1) the SOTA MLLM achieves 63% accuracy, which\nis markedly lower than the 87% accuracy achieved by humans on HR-Bench; 2) our\nDC$^2$ brings consistent and significant improvements (a relative increase of\n+6% on HR-Bench and +8% on general multimodal benchmarks). 
The benchmark and\ncode will be released to facilitate the multimodal R&D community.\n","authors":["Wenbin Wang","Liang Ding","Minyan Zeng","Xiabin Zhou","Li Shen","Yong Luo","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.15556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15555v1","updated":"2024-08-28T06:08:46Z","published":"2024-08-28T06:08:46Z","title":"Latent Relationship Mining of Glaucoma Biomarkers: a TRI-LSTM based Deep\n Learning","summary":" In recent years, a significant amount of research has been conducted on\napplying deep learning methods for glaucoma classification and detection.\nHowever, the explainability of those established machine learning models\nremains a big concern. In this research, in contrast, we learn from cognitive\nscience concepts and study how ophthalmologists judge glaucoma detection.\nSimulating experts' efforts, we propose a hierarchical decision-making system,\ncentered around a holistic set of carefully designed biomarker-oriented machine\nlearning models. While biomarkers represent the key indicators of how\nophthalmologists identify glaucoma, they usually exhibit latent\ninter-relations. We thus construct a time series model, named TRI-LSTM, capable\nof calculating and uncovering potential and latent relationships among various\nbiomarkers of glaucoma. Our model is among the first efforts to explore the\nintrinsic connections among glaucoma biomarkers. We monitor temporal\nrelationships in patients' disease states over time to capture and retain\nthe progression of disease-relevant clinical information from prior visits,\nthereby enriching the biomarkers' potential relationships. Extensive experiments\nover a real-world dataset have demonstrated the effectiveness of the proposed\nmodel.\n","authors":["Cheng Huang","Junhao Shen","Qiuyu Luo","Karanjit Kooner","Tsengdar Lee","Yishen Liu","Jia Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15555v1.pdf","comment":"9 pages, 4 images"},{"id":"http://arxiv.org/abs/2408.15548v1","updated":"2024-08-28T05:53:30Z","published":"2024-08-28T05:53:30Z","title":"ConsistencyTrack: A Robust Multi-Object Tracker with a Generation\n Strategy of Consistency Model","summary":" Multi-object tracking (MOT) is a critical technology in computer vision,\ndesigned to detect multiple targets in video sequences and assign each target a\nunique ID per frame. Existing MOT methods excel at accurately tracking multiple\nobjects in real-time across various scenarios. However, these methods still\nface challenges such as poor noise resistance and frequent ID switches. In this\nresearch, we propose ConsistencyTrack, a novel joint detection and\ntracking (JDT) framework that formulates detection and association as a\ndenoising diffusion process on perturbed bounding boxes. This progressive\ndenoising strategy significantly improves the model's noise resistance. During\nthe training phase, paired object boxes within two adjacent frames are diffused\nfrom ground-truth boxes to a random distribution, and then the model learns to\ndetect and track by reversing this process. In inference, the model refines\nrandomly generated boxes into detection and tracking results through minimal\ndenoising steps. ConsistencyTrack also introduces an innovative target\nassociation strategy to address target occlusion. Experiments on the MOT17 and\nDanceTrack datasets demonstrate that ConsistencyTrack outperforms the other\ncompared methods, in particular surpassing DiffusionTrack in inference speed and\nother performance metrics. 
Our code is available at\nhttps://github.com/Tankowa/ConsistencyTrack.\n","authors":["Lifan Jiang","Zhihui Wang","Siqi Yin","Guangxiao Ma","Peng Zhang","Boxi Wu"],"pdf_url":"https://arxiv.org/pdf/2408.15548v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2308.09905 by other authors"},{"id":"http://arxiv.org/abs/2406.18967v2","updated":"2024-08-28T05:40:55Z","published":"2024-06-27T07:59:25Z","title":"Structural Attention: Rethinking Transformer for Unpaired Medical Image\n Synthesis","summary":" Unpaired medical image synthesis aims to provide complementary information\nfor accurate clinical diagnostics and to address challenges in obtaining\naligned multi-modal medical scans. Transformer-based models excel in imaging\ntranslation tasks thanks to their ability to capture long-range dependencies.\nAlthough effective in supervised training settings, their performance falters\nin unpaired image synthesis, particularly in synthesizing structural details.\nThis paper empirically demonstrates that, lacking strong inductive biases,\nTransformers can converge to non-optimal solutions in the absence of paired\ndata. To address this, we introduce UNet Structured Transformer (UNest), a\nnovel architecture incorporating structural inductive biases for unpaired\nmedical image synthesis. We leverage the foundational Segment-Anything Model to\nprecisely extract the foreground structure and perform structural attention\nwithin the main anatomy. This guides the model to learn key anatomical regions,\nthus improving structural synthesis under the lack of supervision in unpaired\ntraining. Evaluated on two public datasets, spanning three modalities, i.e.,\nMR, CT, and PET, UNest improves recent methods by up to 19.30% across six\nmedical image synthesis tasks. Our code is released at\nhttps://github.com/HieuPhan33/MICCAI2024-UNest.\n","authors":["Vu Minh Hieu Phan","Yutong Xie","Bowen Zhang","Yuankai Qi","Zhibin Liao","Antonios Perperidis","Son Lam Phung","Johan W. Verjans","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2406.18967v2.pdf","comment":"MICCAI version before camera ready"},{"id":"http://arxiv.org/abs/2408.15542v1","updated":"2024-08-28T05:34:14Z","published":"2024-08-28T05:34:14Z","title":"Kangaroo: A Powerful Video-Language Model Supporting Long-context Video\n Input","summary":" Rapid advancements have been made in extending Large Language Models (LLMs)\nto Large Multi-modal Models (LMMs). However, extending the input modality of LLMs\nto video data remains a challenging endeavor, especially for long videos. Due\nto insufficient access to large-scale high-quality video data and the excessive\ncompression of visual features, current methods exhibit limitations in\neffectively processing long videos. In this paper, we introduce Kangaroo, a\npowerful Video LMM aimed at addressing these challenges. Confronted with the issue\nof inadequate training data, we develop a data curation system to build a\nlarge-scale dataset with high-quality annotations for vision-language\npre-training and instruction tuning. In addition, we design a curriculum\ntraining pipeline with gradually increasing resolution and number of input\nframes to accommodate long videos. 
Evaluation results demonstrate that, with 8B\nparameters, Kangaroo achieves state-of-the-art performance across a variety of\nvideo understanding benchmarks while exhibiting competitive results on others.\nIn particular, on benchmarks specialized for long videos, Kangaroo surpasses some\nlarger models with over 10B parameters and proprietary models.\n","authors":["Jiajun Liu","Yibing Wang","Hanghang Ma","Xiaoping Wu","Xiaoqi Ma","Xiaoming Wei","Jianbin Jiao","Enhua Wu","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2408.15542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08872v2","updated":"2024-08-28T05:03:34Z","published":"2024-08-16T17:57:01Z","title":"xGen-MM (BLIP-3): A Family of Open Large Multimodal Models","summary":" This report introduces xGen-MM (also known as BLIP-3), a framework for\ndeveloping Large Multimodal Models (LMMs). The framework comprises meticulously\ncurated datasets, a training recipe, model architectures, and a resulting suite\nof LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen\ninitiative on foundation AI models. Our models undergo rigorous evaluation\nacross a range of tasks, including both single and multi-image benchmarks. Our\npre-trained base model exhibits strong in-context learning capabilities and the\ninstruction-tuned model demonstrates competitive performance among open-source\nLMMs with similar model sizes. In addition, we introduce a safety-tuned model\nwith DPO, aiming to mitigate harmful behaviors such as hallucinations and\nimprove safety. We open-source our models, curated large-scale datasets, and\nour fine-tuning codebase to facilitate further advancements in LMM research.\nAssociated resources will be available on our project page above.\n","authors":["Le Xue","Manli Shu","Anas Awadalla","Jun Wang","An Yan","Senthil Purushwalkam","Honglu Zhou","Viraj Prabhu","Yutong Dai","Michael S Ryoo","Shrikant Kendre","Jieyu Zhang","Can Qin","Shu Zhang","Chia-Chih Chen","Ning Yu","Juntao Tan","Tulika Manoj Awalgaonkar","Shelby Heinecke","Huan Wang","Yejin Choi","Ludwig Schmidt","Zeyuan Chen","Silvio Savarese","Juan Carlos Niebles","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19698v3","updated":"2024-08-28T04:41:09Z","published":"2024-07-29T04:43:58Z","title":"Classification Matters: Improving Video Action Detection with\n Class-Specific Attention","summary":" Video action detection (VAD) aims to detect actors and classify their actions\nin a video. We find that VAD suffers more from classification than from\nlocalization of actors. Hence, we analyze how prevailing methods form features\nfor classification and find that they prioritize actor regions, yet often\noverlook the essential contextual information necessary for accurate\nclassification. Accordingly, we propose to reduce the bias toward the actor and\nencourage paying attention to the context that is relevant to each action\nclass. By assigning a class-dedicated query to each action class, our model can\ndynamically determine where to focus for effective classification. 
The proposed\nmodel demonstrates superior performance on three challenging benchmarks with\nsignificantly fewer parameters and less computation.\n","authors":["Jinsung Lee","Taeoh Kim","Inwoong Lee","Minho Shim","Dongyoon Wee","Minsu Cho","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2407.19698v3.pdf","comment":"31 pages, accepted to ECCV 2024 (oral)"},{"id":"http://arxiv.org/abs/2208.06561v3","updated":"2024-08-28T04:36:23Z","published":"2022-08-13T03:25:50Z","title":"Drone Referring Localization: An Efficient Heterogeneous Spatial Feature\n Interaction Method For UAV Self-Localization","summary":" Image retrieval (IR) has emerged as a promising approach for\nself-localization in unmanned aerial vehicles (UAVs). However, IR-based methods\nface several challenges: 1) Pre- and post-processing incur significant\ncomputational and storage overhead; 2) The lack of interaction between\ndual-source features impairs precise spatial perception. In this paper, we\npropose an efficient heterogeneous spatial feature interaction method, termed\nDrone Referring Localization (DRL), which aims to localize UAV-view images\nwithin satellite imagery. Unlike conventional methods that treat different data\nsources in isolation, followed by cosine similarity computations, DRL\nfacilitates the learnable interaction of heterogeneous features. To implement\nthe proposed DRL, we design two transformer-based frameworks, Post-Fusion and\nMix-Fusion, enabling end-to-end training and inference. Furthermore, we\nintroduce random scale cropping and weight balance loss techniques to augment\npaired data and optimize the balance between positive and negative sample\nweights. Additionally, we construct a new dataset, UL14, and establish a\nbenchmark tailored to the DRL framework. Compared to traditional IR methods,\nDRL achieves superior localization accuracy (MA@20 +9.4\\%) while significantly\nreducing computational time (1/7) and storage overhead (1/3). The dataset and\ncode will be made publicly available. The dataset and code are available at\n\\url{https://github.com/Dmmm1997/DRL} .\n","authors":["Ming Dai","Enhui Zheng","Jiahao Chen","Lei Qi","Zhenhua Feng","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2208.06561v3.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.15524v1","updated":"2024-08-28T04:19:14Z","published":"2024-08-28T04:19:14Z","title":"Ray-Distance Volume Rendering for Neural Scene Reconstruction","summary":" Existing methods in neural scene reconstruction utilize the Signed Distance\nFunction (SDF) to model the density function. However, in indoor scenes, the\ndensity computed from the SDF for a sampled point may not consistently reflect\nits real importance in volume rendering, often due to the influence of\nneighboring objects. To tackle this issue, our work proposes a novel approach\nfor indoor scene reconstruction, which instead parameterizes the density\nfunction with the Signed Ray Distance Function (SRDF). Firstly, the SRDF is\npredicted by the network and transformed to a ray-conditioned density function\nfor volume rendering. We argue that the ray-specific SRDF only considers the\nsurface along the camera ray, from which the derived density function is more\nconsistent to the real occupancy than that from the SDF. 
Secondly, although\nSRDF and SDF represent different aspects of scene geometries, their values\nshould share the same sign indicating the underlying spatial occupancy.\nTherefore, this work introduces a SRDF-SDF consistency loss to constrain the\nsigns of the SRDF and SDF outputs. Thirdly, this work proposes a\nself-supervised visibility task, introducing the physical visibility geometry\nto the reconstruction task. The visibility task combines prior from predicted\nSRDF and SDF as pseudo labels, and contributes to generating more accurate 3D\ngeometry. Our method implemented with different representations has been\nvalidated on indoor datasets, achieving improved performance in both\nreconstruction and view synthesis.\n","authors":["Ruihong Yin","Yunlu Chen","Sezer Karaoglu","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2408.15524v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2408.15521v1","updated":"2024-08-28T04:14:01Z","published":"2024-08-28T04:14:01Z","title":"A Simple Baseline with Single-encoder for Referring Image Segmentation","summary":" Referring image segmentation (RIS) requires dense vision-language\ninteractions between visual pixels and textual words to segment objects based\non a given description. However, commonly adapted dual-encoders in RIS, e.g.,\nSwin transformer and BERT (uni-modal encoders) or CLIP (a multi-modal\ndual-encoder), lack dense multi-modal interactions during pre-training, leading\nto a gap with a pixel-level RIS task. To bridge this gap, existing RIS methods\noften rely on multi-modal fusion modules that interact two encoders, but this\napproach leads to high computational costs. In this paper, we present a novel\nRIS method with a single-encoder, i.e., BEiT-3, maximizing the potential of\nshared self-attention across all framework components. This enables seamless\ninteractions of two modalities from input to final prediction, producing\ngranularly aligned multi-modal features. Furthermore, we propose lightweight\nyet effective decoder modules, a Shared FPN and a Shared Mask Decoder, which\ncontribute to the high efficiency of our model. Our simple baseline with a\nsingle encoder achieves outstanding performances on the RIS benchmark datasets\nwhile maintaining computational efficiency, compared to the most recent SoTA\nmethods based on dual-encoders.\n","authors":["Seonghoon Yu","Ilchae Jung","Byeongju Han","Taeoh Kim","Yunho Kim","Dongyoon Wee","Jeany Son"],"pdf_url":"https://arxiv.org/pdf/2408.15521v1.pdf","comment":"ArXiv pre-print"},{"id":"http://arxiv.org/abs/2408.15519v1","updated":"2024-08-28T04:12:07Z","published":"2024-08-28T04:12:07Z","title":"Depth-Weighted Detection of Behaviours of Risk in People with Dementia\n using Cameras","summary":" The behavioural and psychological symptoms of dementia, such as agitation and\naggression, present a significant health and safety risk in residential care\nsettings. Many care facilities have video cameras in place for digital\nmonitoring of public spaces, which can be leveraged to develop an automated\nbehaviours of risk detection system that can alert the staff to enable timely\nintervention and prevent the situation from escalating. However, one of the\nchallenges in our previous study was the presence of false alarms due to\nobstruction of view by activities happening close to the camera. 
To address\nthis issue, we proposed a novel depth-weighted loss function to train a\ncustomized convolutional autoencoder to enforce equivalent importance to the\nevents happening both near and far from the cameras; thus, helping to reduce\nfalse alarms and making the method more suitable for real-world deployment. The\nproposed method was trained using data from nine participants with dementia\nacross three cameras situated in a specialized dementia unit and achieved an\narea under the curve of receiver operating characteristic of $0.852$, $0.81$\nand $0.768$ for the three cameras. Ablation analysis was conducted for the\nindividual components of the proposed method and the performance of the\nproposed method was investigated for participant-specific and sex-specific\nbehaviours of risk detection. The proposed method performed reasonably well in\ndetecting behaviours of risk in people with dementia motivating further\nresearch toward the development of a behaviours of risk detection system\nsuitable for deployment in video surveillance systems in care facilities.\n","authors":["Pratik K. Mishra","Irene Ballester","Andrea Iaboni","Bing Ye","Kristine Newman","Alex Mihailidis","Shehroz S. Khan"],"pdf_url":"https://arxiv.org/pdf/2408.15519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03691v3","updated":"2024-08-28T03:57:26Z","published":"2024-03-06T13:17:41Z","title":"MolNexTR: A Generalized Deep Learning Model for Molecular Image\n Recognition","summary":" In the field of chemical structure recognition, the task of converting\nmolecular images into machine-readable data formats such as SMILES string\nstands as a significant challenge, primarily due to the varied drawing styles\nand conventions prevalent in chemical literature. To bridge this gap, we\nproposed MolNexTR, a novel image-to-graph deep learning model that collaborates\nto fuse the strengths of ConvNext, a powerful Convolutional Neural Network\nvariant, and Vision-TRansformer. This integration facilitates a more detailed\nextraction of both local and global features from molecular images. MolNexTR\ncan predict atoms and bonds simultaneously and understand their layout rules.\nIt also excels at flexibly integrating symbolic chemistry principles to discern\nchirality and decipher abbreviated structures. We further incorporate a series\nof advanced algorithms, including an improved data augmentation module, an\nimage contamination module, and a post-processing module for getting the final\nSMILES output. These modules cooperate to enhance the model's robustness to\ndiverse styles of molecular images found in real literature. In our test sets,\nMolNexTR has demonstrated superior performance, achieving an accuracy rate of\n81-97%, marking a significant advancement in the domain of molecular structure\nrecognition.\n","authors":["Yufan Chen","Ching Ting Leung","Yong Huang","Jianwei Sun","Hao Chen","Hanyu Gao"],"pdf_url":"https://arxiv.org/pdf/2403.03691v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15513v1","updated":"2024-08-28T03:50:04Z","published":"2024-08-28T03:50:04Z","title":"Continual-learning-based framework for structural damage recognition","summary":" Multi-damage is common in reinforced concrete structures and leads to the\nrequirement of large number of neural networks, parameters and data storage, if\nconvolutional neural network (CNN) is used for damage recognition. 
In addition,\nconventional CNNs experience catastrophic forgetting and training inefficiency\nas the number of tasks increases during continual learning, leading to a large\naccuracy decrease on previously learned tasks. To address these problems, this\nstudy proposes a continual-learning-based damage recognition model (CLDRM) which\nintegrates the Learning without Forgetting continual learning method into the\nResNet-34 architecture for the recognition of damages in RC structures as well\nas relevant structural components. Three experiments for four recognition tasks\nwere designed to validate the feasibility and effectiveness of the CLDRM\nframework. In this way, it reduces both the prediction time and data storage by\nabout 75% in four tasks of continuous learning. By gradual feature fusion, CLDRM outperformed other\nmethods and achieved high accuracy in damage recognition and\nclassification. As the number of recognition tasks increased, CLDRM also\nexperienced a smaller accuracy decrease on previously learned tasks. Results indicate\nthat the CLDRM framework successfully performs damage recognition and\nclassification with reasonable accuracy and effectiveness.\n","authors":["Jiangpeng Shu","Jiawei Zhang","Reachsak Ly","Fangzheng Lin","Yuanfeng Duan"],"pdf_url":"https://arxiv.org/pdf/2408.15513v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2307.12622v6","updated":"2024-08-28T03:45:08Z","published":"2023-07-24T08:51:49Z","title":"Phase Matching for Out-of-Distribution Generalization","summary":" The Fourier transform, an explicit decomposition method for visual signals,\nhas been employed to explain the out-of-distribution generalization behaviors\nof Deep Neural Networks (DNNs). Previous studies indicate that the amplitude\nspectrum is susceptible to the disturbance caused by distribution shifts,\nwhereas the phase spectrum preserves highly-structured spatial information that\nis crucial for robust visual representation learning. Inspired by this insight,\nthis paper is dedicated to clarifying the relationships between Domain\nGeneralization (DG) and the frequency components. Specifically, we provide\ndistribution analysis and empirical experiments for the frequency components.\nBased on these observations, we propose a Phase Matching approach, termed\nPhaMa, to address DG problems. To this end, PhaMa introduces perturbations on\nthe amplitude spectrum and establishes spatial relationships to match the phase\ncomponents with patch contrastive learning. Experiments on multiple benchmarks\ndemonstrate that our proposed method achieves state-of-the-art performance in\ndomain generalization and out-of-distribution robustness tasks. Beyond vanilla\nanalysis and experiments, we further clarify the relationships between the\nFourier components and DG problems by introducing a Fourier-based Structural\nCausal Model (SCM).\n","authors":["Chengming Hu","Yeqian Du","Rui Wang","Hao Chen","Congcong Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.12622v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06207v5","updated":"2024-08-28T03:39:10Z","published":"2023-09-12T13:21:12Z","title":"SGNet: Salient Geometric Network for Point Cloud Registration","summary":" Point Cloud Registration (PCR) is a critical and challenging task in computer\nvision. 
One of the primary difficulties in PCR is identifying salient and\nmeaningful points that exhibit consistent semantic and geometric properties\nacross different scans. Previous methods have encountered challenges with\nambiguous matching due to the similarity among patch blocks throughout the\nentire point cloud and the lack of consideration for efficient global geometric\nconsistency. To address these issues, we propose a new framework that includes\nseveral novel techniques. Firstly, we introduce a semantic-aware geometric\nencoder that combines object-level and patch-level semantic information. This\nencoder significantly improves registration recall by reducing ambiguity in\npatch-level superpoint matching. Additionally, we incorporate a prior knowledge\napproach that utilizes an intrinsic shape signature to identify salient points.\nThis enables us to extract the most salient super points and meaningful dense\npoints in the scene. Secondly, we introduce an innovative transformer that\nencodes High-Order (HO) geometric features. These features are crucial for\nidentifying salient points within initial overlap regions while considering\nglobal high-order geometric consistency. To optimize this high-order\ntransformer further, we introduce an anchor node selection strategy. By\nencoding inter-frame triangle or polyhedron consistency features based on these\nanchor nodes, we can effectively learn high-order geometric features of salient\nsuper points. These high-order features are then propagated to dense points and\nutilized by a Sinkhorn matching module to identify key correspondences for\nsuccessful registration. In our experiments conducted on well-known datasets\nsuch as 3DMatch/3DLoMatch and KITTI, our approach has shown promising results,\nhighlighting the effectiveness of our novel method.\n","authors":["Qianliang Wu","Yaqing Ding","Lei Luo","Haobo Jiang","Shuo Gu","Chuanwei Zhou","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06207v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09460v2","updated":"2024-08-28T03:29:42Z","published":"2024-08-18T12:48:48Z","title":"Fine-Grained Building Function Recognition from Street-View Images via\n Geometry-Aware Semi-Supervised Learning","summary":" In this work, we propose a geometry-aware semi-supervised method for\nfine-grained building function recognition. This method leverages the geometric\nrelationships between multi-source data to improve the accuracy of pseudo\nlabels in semi-supervised learning, extending the task's scope and making it\napplicable to cross-categorization systems of building function recognition.\nFirstly, we design an online semi-supervised pre-training stage, which\nfacilitates the precise acquisition of building facade location information in\nstreet-view images. In the second stage, we propose a geometry-aware coarse\nannotation generation module. This module effectively combines GIS data and\nstreet-view data based on the geometric relationships, improving the accuracy\nof pseudo annotations. In the third stage, we combine the newly generated\ncoarse annotations with the existing labeled dataset to achieve fine-grained\nfunctional recognition of buildings across multiple cities at a large scale.\nExtensive experiments demonstrate that our proposed framework exhibits superior\nperformance in fine-grained functional recognition of buildings. 
Within the\nsame categorization system, it achieves improvements of 7.6% and 4.8% compared\nto fully-supervised methods and state-of-the-art semi-supervised methods,\nrespectively. Additionally, our method also performs well in cross-city tasks,\ni.e., extending the model trained on OmniCity (New York) to new areas (i.e.,\nLos Angeles and Boston). This study provides a novel solution for the\nfine-grained function recognition of large-scale buildings across multiple\ncities, offering essential data for understanding urban infrastructure\nplanning, human activity patterns, and the interactions between humans and\nbuildings.\n","authors":["Weijia Li","Jinhua Yu","Dairong Chen","Yi Lin","Runmin Dong","Xiang Zhang","Conghui He","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.09460v2.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2408.15503v1","updated":"2024-08-28T03:17:40Z","published":"2024-08-28T03:17:40Z","title":"RoboSense: Large-scale Dataset and Benchmark for Multi-sensor Low-speed\n Autonomous Driving","summary":" Robust object detection and tracking under arbitrary sight of view is\nchallenging yet essential for the development of Autonomous Vehicle technology.\nWith the growing demand of unmanned function vehicles, near-field scene\nunderstanding becomes an important research topic in the areas of low-speed\nautonomous driving. Due to the complexity of driving conditions and diversity\nof near obstacles such as blind spots and high occlusion, the perception\ncapability of near-field environment is still inferior than its farther\ncounterpart. To further enhance the intelligent ability of unmanned vehicles,\nin this paper, we construct a multimodal data collection platform based on 3\nmain types of sensors (Camera, LiDAR and Fisheye), which supports flexible\nsensor configurations to enable dynamic sight of view for ego vehicle, either\nglobal view or local view. Meanwhile, a large-scale multi-sensor dataset is\nbuilt, named RoboSense, to facilitate near-field scene understanding. RoboSense\ncontains more than 133K synchronized data with 1.4M 3D bounding box and IDs\nannotated in the full $360^{\\circ}$ view, forming 216K trajectories across 7.6K\ntemporal sequences. It has $270\\times$ and $18\\times$ as many annotations of\nnear-field obstacles within 5$m$ as the previous single-vehicle datasets such\nas KITTI and nuScenes. Moreover, we define a novel matching criterion for\nnear-field 3D perception and prediction metrics. Based on RoboSense, we\nformulate 6 popular tasks to facilitate the future development of related\nresearch, where the detailed data analysis as well as benchmarks are also\nprovided accordingly.\n","authors":["Haisheng Su","Feixiang Song","Cong Ma","Panpan Cai","Wei Wu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2408.15503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02408v2","updated":"2024-08-28T02:53:22Z","published":"2024-08-05T12:09:38Z","title":"Multi-weather Cross-view Geo-localization Using Denoising Diffusion\n Models","summary":" Cross-view geo-localization in GNSS-denied environments aims to determine an\nunknown location by matching drone-view images with the correct geo-tagged\nsatellite-view images from a large gallery. Recent research shows that learning\ndiscriminative image representations under specific weather conditions can\nsignificantly enhance performance. However, the frequent occurrence of unseen\nextreme weather conditions hinders progress. 
This paper introduces MCGF, a\nMulti-weather Cross-view Geo-localization Framework designed to dynamically\nadapt to unseen weather conditions. MCGF establishes a joint optimization\nbetween image restoration and geo-localization using denoising diffusion\nmodels. For image restoration, MCGF incorporates a shared encoder and a\nlightweight restoration module to help the backbone eliminate weather-specific\ninformation. For geo-localization, MCGF uses EVA-02 as a backbone for feature\nextraction, with cross-entropy loss for training and cosine distance for\ntesting. Extensive experiments on University160k-WX demonstrate that MCGF\nachieves competitive results for geo-localization in varying weather\nconditions.\n","authors":["Tongtong Feng","Qing Li","Xin Wang","Mingzi Wang","Guangyao Li","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.02408v2.pdf","comment":"Accepted by ACM MM24 workshop"},{"id":"http://arxiv.org/abs/2408.15484v1","updated":"2024-08-28T02:17:58Z","published":"2024-08-28T02:17:58Z","title":"NAS-BNN: Neural Architecture Search for Binary Neural Networks","summary":" Binary Neural Networks (BNNs) have gained extensive attention for their\nsuperior inferencing efficiency and compression ratio compared to traditional\nfull-precision networks. However, due to the unique characteristics of BNNs,\ndesigning a powerful binary architecture is challenging and often requires\nsignificant manpower. A promising solution is to utilize Neural Architecture\nSearch (NAS) to assist in designing BNNs, but current NAS methods for BNNs are\nrelatively straightforward and leave a performance gap between the searched\nmodels and manually designed ones. To address this gap, we propose a novel\nneural architecture search scheme for binary neural networks, named NAS-BNN. We\nfirst carefully design a search space based on the unique characteristics of\nBNNs. Then, we present three training strategies, which significantly enhance\nthe training of supernet and boost the performance of all subnets. Our\ndiscovered binary model family outperforms previous BNNs for a wide range of\noperations (OPs) from 20M to 200M. For instance, we achieve 68.20% top-1\naccuracy on ImageNet with only 57M OPs. In addition, we validate the\ntransferability of these searched BNNs on the object detection task, and our\nbinary detectors with the searched BNNs achieve a novel state-of-the-art\nresult, e.g., 31.6% mAP with 370M OPs, on MS COCO dataset. The source code and\nmodels will be released at https://github.com/VDIGPKU/NAS-BNN.\n","authors":["Zhihao Lin","Yongtao Wang","Jinhe Zhang","Xiaojie Chu","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2408.15484v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2408.14895v2","updated":"2024-08-28T01:56:33Z","published":"2024-08-27T09:18:57Z","title":"VHAKG: A Multi-modal Knowledge Graph Based on Synchronized Multi-view\n Videos of Daily Activities","summary":" Multi-modal knowledge graphs (MMKGs), which ground various non-symbolic data\n(e.g., images and videos) into symbols, have attracted attention as resources\nenabling knowledge processing and machine learning across modalities. However,\nthe construction of MMKGs for videos consisting of multiple events, such as\ndaily activities, is still in the early stages. 
In this paper, we construct an\nMMKG based on synchronized multi-view simulated videos of daily activities.\nBesides representing the content of daily life videos as event-centric\nknowledge, our MMKG also includes frame-by-frame fine-grained changes, such as\nbounding boxes within video frames. In addition, we provide support tools for\nquerying our MMKG. As an application example, we demonstrate that our MMKG\nfacilitates benchmarking vision-language models by providing the necessary\nvision-language datasets for a tailored task.\n","authors":["Shusaku Egami","Takahiro Ugai","Swe Nwe Nwe Htun","Ken Fukuda"],"pdf_url":"https://arxiv.org/pdf/2408.14895v2.pdf","comment":"5 pages, 4 figures, accepted by CIKM2024 Resource Track"},{"id":"http://arxiv.org/abs/2402.14780v3","updated":"2024-08-28T01:13:44Z","published":"2024-02-22T18:38:48Z","title":"Customize-A-Video: One-Shot Motion Customization of Text-to-Video\n Diffusion Models","summary":" Image customization has been extensively studied in text-to-image (T2I)\ndiffusion models, leading to impressive outcomes and applications. With the\nemergence of text-to-video (T2V) diffusion models, its temporal counterpart,\nmotion customization, has not yet been well investigated. To address the\nchallenge of one-shot video motion customization, we propose Customize-A-Video\nthat models the motion from a single reference video and adapts it to new\nsubjects and scenes with both spatial and temporal varieties. It leverages\nlow-rank adaptation (LoRA) on temporal attention layers to tailor the\npre-trained T2V diffusion model for specific motion modeling. To disentangle\nthe spatial and temporal information during training, we introduce a novel\nconcept of appearance absorbers that detach the original appearance from the\nreference video prior to motion learning. The proposed modules are trained in a\nstaged pipeline and inferred in a plug-and-play fashion, enabling easy\nextensions to various downstream tasks such as custom video generation and\nediting, video appearance customization and multiple motion combination. Our\nproject page can be found at https://customize-a-video.github.io.\n","authors":["Yixuan Ren","Yang Zhou","Jimei Yang","Jing Shi","Difan Liu","Feng Liu","Mingi Kwon","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2402.14780v3.pdf","comment":"Accepted by ECCV 2024. Project page:\n https://customize-a-video.github.io"},{"id":"http://arxiv.org/abs/2408.15465v1","updated":"2024-08-28T01:06:19Z","published":"2024-08-28T01:06:19Z","title":"Dynamic Reconstruction from Neuromorphic Data","summary":" Unlike traditional cameras which synchronously register pixel intensity,\nneuromorphic sensors only register `changes' at pixels where a change is\noccurring asynchronously. This enables neuromorphic sensors to sample at a\nmicro-second level and efficiently capture the dynamics. Since only sequences\nof asynchronous event changes are recorded rather than brightness intensities\nover time, many traditional image processing techniques cannot be directly\napplied. Furthermore, existing approaches, including the ones recently\nintroduced by the authors, use traditional images combined with neuromorphic\nevent data to carry out reconstructions. The aim of this work is to introduce an\noptimization-based approach to reconstruct images and dynamics only from the\nneuromorphic event data without any additional knowledge of the events. Each\npixel is modeled temporally. 
The experimental results on real data highlight\nthe efficacy of the presented approach, paving the way for efficient and\naccurate processing of neuromorphic sensor data in real-world applications.\n","authors":["Harbir Antil","Daniel Blauvelt","David Sayre"],"pdf_url":"https://arxiv.org/pdf/2408.15465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15461v1","updated":"2024-08-28T00:54:51Z","published":"2024-08-28T00:54:51Z","title":"Hand1000: Generating Realistic Hands from Text with Only 1,000 Images","summary":" Text-to-image generation models have achieved remarkable advancements in\nrecent years, aiming to produce realistic images from textual descriptions.\nHowever, these models often struggle with generating anatomically accurate\nrepresentations of human hands. The resulting images frequently exhibit issues\nsuch as incorrect numbers of fingers, unnatural twisting or interlacing of\nfingers, or blurred and indistinct hands. These issues stem from the inherent\ncomplexity of hand structures and the difficulty in aligning textual\ndescriptions with precise visual depictions of hands. To address these\nchallenges, we propose a novel approach named Hand1000 that enables the\ngeneration of realistic hand images with a target gesture using only 1,000\ntraining samples. The training of Hand1000 is divided into three stages with\nthe first stage aiming to enhance the model's understanding of hand anatomy by\nusing a pre-trained hand gesture recognition model to extract gesture\nrepresentation. The second stage further optimizes text embedding by\nincorporating the extracted hand gesture representation, to improve alignment\nbetween the textual descriptions and the generated hand images. The third stage\nutilizes the optimized embedding to fine-tune the Stable Diffusion model to\ngenerate realistic hand images. In addition, we construct the first publicly\navailable dataset specifically designed for text-to-hand image generation.\nBased on the existing hand gesture recognition dataset, we adopt advanced image\ncaptioning models and LLaMA3 to generate high-quality textual descriptions\nenriched with detailed gesture information. Extensive experiments demonstrate\nthat Hand1000 significantly outperforms existing models in producing\nanatomically correct hand images while faithfully representing other details in\nthe text, such as faces, clothing, and colors.\n","authors":["Haozhuo Zhang","Bin Zhu","Yu Cao","Yanbin Hao"],"pdf_url":"https://arxiv.org/pdf/2408.15461v1.pdf","comment":"Project page https://haozhuo-zhang.github.io/Hand1000-project-page/"},{"id":"http://arxiv.org/abs/2408.15450v1","updated":"2024-08-28T00:07:51Z","published":"2024-08-28T00:07:51Z","title":"Avoiding Generative Model Writer's Block With Embedding Nudging","summary":" Generative image models, since their introduction, have become a global phenomenon.\nFrom new forms of art becoming possible to new vectors of abuse, many new capabilities\nhave become available. One of the challenging issues with generative models is\ncontrolling the generation process, especially to prevent specific generation\nclasses or instances. There are several reasons why one may want to control\nthe output of generative models, ranging from privacy and safety concerns to\napplication limitations or user preferences.\n To address memorization and privacy challenges, there has been considerable\nresearch dedicated to filtering prompts or filtering the outputs of these\nmodels. 
What all these solutions have in common is that they ultimately\nstop the model from producing anything, hence limiting the usability of\nthe model. In this paper, we propose a method for addressing this usability\nissue by making it possible to steer away from unwanted concepts (when detected\nin the model's output) while still generating outputs. In particular, we focus on\nlatent diffusion image generative models and how one can prevent them from\ngenerating particular images while generating similar images with limited\noverhead.\n We focus on mitigating issues like image memorization, demonstrating our\ntechnique's effectiveness through qualitative and quantitative evaluations. Our\nmethod successfully prevents the generation of memorized training images while\nmaintaining comparable image quality and relevance to the unmodified model.\n","authors":["Ali Zand","Milad Nasr"],"pdf_url":"https://arxiv.org/pdf/2408.15450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16176v1","updated":"2024-08-28T23:53:57Z","published":"2024-08-28T23:53:57Z","title":"VLM4Bio: A Benchmark Dataset to Evaluate Pretrained Vision-Language\n Models for Trait Discovery from Biological Images","summary":" Images are increasingly becoming the currency for documenting biodiversity on\nthe planet, providing novel opportunities for accelerating scientific\ndiscoveries in the field of organismal biology, especially with the advent of\nlarge vision-language models (VLMs). We ask if pre-trained VLMs can aid\nscientists in answering a range of biologically relevant questions without any\nadditional fine-tuning. In this paper, we evaluate the effectiveness of 12\nstate-of-the-art (SOTA) VLMs in the field of organismal biology using a novel\ndataset, VLM4Bio, consisting of 469K question-answer pairs involving 30K images\nfrom three groups of organisms: fishes, birds, and butterflies, covering five\nbiologically relevant tasks. We also explore the effects of applying prompting\ntechniques and tests for reasoning hallucination on the performance of VLMs,\nshedding new light on the capabilities of current SOTA VLMs in answering\nbiologically relevant questions using images. The code and datasets for running\nall the analyses reported in this paper can be found at\nhttps://github.com/sammarfy/VLM4Bio.\n","authors":["M. Maruf","Arka Daw","Kazi Sajeed Mehrab","Harish Babu Manogaran","Abhilash Neog","Medha Sawhney","Mridul Khurana","James P. Balhoff","Yasin Bakis","Bahadir Altintas","Matthew J. Thompson","Elizabeth G. Campolongo","Josef C. Uyeda","Hilmar Lapp","Henry L. Bart","Paula M. Mabee","Yu Su","Wei-Lun Chao","Charles Stewart","Tanya Berger-Wolf","Wasila Dahdul","Anuj Karpatne"],"pdf_url":"https://arxiv.org/pdf/2408.16176v1.pdf","comment":"36 pages, 37 figures, 7 tables"},{"id":"http://arxiv.org/abs/2408.16154v1","updated":"2024-08-28T22:14:44Z","published":"2024-08-28T22:14:44Z","title":"Does Data-Efficient Generalization Exacerbate Bias in Foundation Models?","summary":" Foundation models have emerged as robust models with label efficiency in\ndiverse domains. In medical imaging, these models contribute to the advancement\nof medical diagnoses due to the difficulty in obtaining labeled data. However,\nit is unclear whether using a large amount of unlabeled data, biased by the\npresence of sensitive attributes during pre-training, influences the fairness\nof the model. 
This research examines the bias in the Foundation model\n(RetFound) when it is applied to fine-tune the Brazilian Multilabel\nOphthalmological Dataset (BRSET), which has a different population than the\npre-training dataset. The model evaluation, in comparison with supervised\nlearning, shows that the Foundation Model has the potential to reduce the gap\nbetween the maximum AUC and minimum AUC evaluations across gender and age\ngroups. However, in a data-efficient generalization, the model increases the\nbias when the data amount decreases. These findings suggest that when deploying\na Foundation Model in real-life scenarios with limited data, the possibility of\nfairness issues should be considered.\n","authors":["Dilermando Queiroz","Anderson Carlos","Maíra Fatoretto","André Anjos","Lilian Berton","Luis Filipe Nakayama"],"pdf_url":"https://arxiv.org/pdf/2408.16154v1.pdf","comment":"Preprint of paper to be presented at Fairness and Ethics Towards\n Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during\n ECCV 2024"},{"id":"http://arxiv.org/abs/2408.16150v1","updated":"2024-08-28T22:02:38Z","published":"2024-08-28T22:02:38Z","title":"Single-Photon 3D Imaging with Equi-Depth Photon Histograms","summary":" Single-photon cameras present a promising avenue for high-resolution 3D\nimaging. They have ultra-high sensitivity -- down to individual photons -- and\ncan record photon arrival times with extremely high (sub-nanosecond)\nresolution. Single-photon 3D cameras estimate the round-trip time of a laser\npulse by forming equi-width (EW) histograms of detected photon timestamps.\nAcquiring and transferring such EW histograms requires high bandwidth and\nin-pixel memory, making SPCs less attractive in resource-constrained settings\nsuch as mobile devices and AR/VR headsets. In this work we propose a 3D sensing\ntechnique based on equi-depth (ED) histograms. ED histograms compress timestamp\ndata more efficiently than EW histograms, reducing the bandwidth requirement.\nMoreover, to reduce the in-pixel memory requirement, we propose a lightweight\nalgorithm to estimate ED histograms in an online fashion without explicitly\nstoring the photon timestamps. This algorithm is amenable to future in-pixel\nimplementations. We propose algorithms that process ED histograms to perform 3D\ncomputer-vision tasks of estimating scene distance maps and performing visual\nodometry under challenging conditions such as high ambient light. Our work\npaves the way towards lower bandwidth and reduced in-pixel memory requirements\nfor SPCs, making them attractive for resource-constrained 3D vision\napplications. Project page:\n$\\href{https://www.computational.camera/pedh}{https://www.computational.camera/pedh}$\n","authors":["Kaustubh Sadekar","David Maier","Atul Ingle"],"pdf_url":"https://arxiv.org/pdf/2408.16150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16318v2","updated":"2024-08-28T21:07:49Z","published":"2024-03-24T22:53:16Z","title":"AutoInst: Automatic Instance-Based Segmentation of LiDAR 3D Scans","summary":" Recently, progress in acquisition equipment such as LiDAR sensors has enabled\nsensing increasingly spacious outdoor 3D environments. Making sense of such 3D\nacquisitions requires fine-grained scene understanding, such as constructing\ninstance-based 3D scene segmentations. Commonly, a neural network is trained\nfor this task; however, this requires access to a large, densely annotated\ndataset, which is widely known to be challenging to obtain. 
To address this\nissue, in this work we propose to predict instance segmentations for 3D scenes\nin an unsupervised way, without relying on ground-truth annotations. To this\nend, we construct a learning framework consisting of two components: (1) a\npseudo-annotation scheme for generating initial unsupervised pseudo-labels; and\n(2) a self-training algorithm for instance segmentation to fit robust, accurate\ninstances from initial noisy proposals. To enable generating 3D instance mask\nproposals, we construct a weighted proxy-graph by connecting 3D points with\nedges integrating multi-modal image- and point-based self-supervised features,\nand perform graph-cuts to isolate individual pseudo-instances. We then build on\na state-of-the-art point-based architecture and train a 3D instance\nsegmentation model, resulting in significant refinement of initial proposals.\nTo scale to arbitrary complexity 3D scenes, we design our algorithm to operate\non local 3D point chunks and construct a merging step to generate scene-level\ninstance segmentations. Experiments on the challenging SemanticKITTI benchmark\ndemonstrate the potential of our approach, where it attains 13.3% higher\nAverage Precision and 9.1% higher F1 score compared to the best-performing\nbaseline. The code will be made publicly available at\nhttps://github.com/artonson/autoinst.\n","authors":["Cedric Perauer","Laurenz Adrian Heidrich","Haifan Zhang","Matthias Nießner","Anastasiia Kornilova","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2403.16318v2.pdf","comment":"8 pages, 7 figures, to be published in IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS) 2024"},{"id":"http://arxiv.org/abs/2408.16130v1","updated":"2024-08-28T20:35:38Z","published":"2024-08-28T20:35:38Z","title":"Using Backbone Foundation Model for Evaluating Fairness in Chest\n Radiography Without Demographic Data","summary":" Ensuring consistent performance across diverse populations and incorporating\nfairness into machine learning models are crucial for advancing medical image\ndiagnostics and promoting equitable healthcare. However, many databases do not\nprovide protected attributes or contain unbalanced representations of\ndemographic groups, complicating the evaluation of model performance across\ndifferent demographics and the application of bias mitigation techniques that\nrely on these attributes. This study aims to investigate the effectiveness of\nusing the backbone of Foundation Models as an embedding extractor for creating\ngroups that represent protected attributes, such as gender and age. We propose\nutilizing these groups in different stages of bias mitigation, including\npre-processing, in-processing, and evaluation. Using databases in and\nout-of-distribution scenarios, it is possible to identify that the method can\ncreate groups that represent gender in both databases and reduce in 4.44% the\ndifference between the gender attribute in-distribution and 6.16% in\nout-of-distribution. However, the model lacks robustness in handling age\nattributes, underscoring the need for more fundamentally fair and robust\nFoundation models. 
These findings suggest a role in promoting fairness\nassessment in scenarios where we lack knowledge of attributes, contributing to\nthe development of more equitable medical diagnostics.\n","authors":["Dilermando Queiroz","André Anjos","Lilian Berton"],"pdf_url":"https://arxiv.org/pdf/2408.16130v1.pdf","comment":"Preprint of paper to be presented at Fairness of AI in Medical\n Imaging (FAIMI) during MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.15113v2","updated":"2024-08-28T20:31:42Z","published":"2024-08-27T14:51:34Z","title":"AnomalousPatchCore: Exploring the Use of Anomalous Samples in Industrial\n Anomaly Detection","summary":" Visual inspection, or industrial anomaly detection, is one of the most common\nquality control types in manufacturing. The task is to identify the presence of\nan anomaly given an image, e.g., a missing component on an image of a circuit\nboard, for subsequent manual inspection. While industrial anomaly detection has\nseen a surge in recent years, most anomaly detection methods still utilize\nknowledge only from normal samples, failing to leverage the information from\nthe frequently available anomalous samples. Additionally, they heavily rely on\nvery general feature extractors pre-trained on common image classification\ndatasets. In this paper, we address these shortcomings and propose the new\nanomaly detection system AnomalousPatchCore~(APC) based on a feature extractor\nfine-tuned with normal and anomalous in-domain samples and a subsequent memory\nbank for identifying unusual features. To fine-tune the feature extractor in\nAPC, we propose three auxiliary tasks that address the different aspects of\nanomaly detection~(classification vs. localization) and mitigate the effect of\nthe imbalance between normal and anomalous samples. Our extensive evaluation on\nthe MVTec dataset shows that APC outperforms state-of-the-art systems in\ndetecting anomalies, which is especially important in industrial anomaly\ndetection given the subsequent manual inspection. In detailed ablation studies,\nwe further investigate the properties of our APC.\n","authors":["Mykhailo Koshil","Tilman Wegener","Detlef Mentrup","Simone Frintrop","Christian Wilms"],"pdf_url":"https://arxiv.org/pdf/2408.15113v2.pdf","comment":"Accepted at the 2nd workshop on Vision-based InduStrial InspectiON\n (VISION) @ ECCV"},{"id":"http://arxiv.org/abs/2408.15077v2","updated":"2024-08-28T20:30:29Z","published":"2024-08-27T14:05:48Z","title":"MMASD+: A Novel Dataset for Privacy-Preserving Behavior Analysis of\n Children with Autism Spectrum Disorder","summary":" Autism spectrum disorder (ASD) is characterized by significant challenges in\nsocial interaction and comprehending communication signals. Recently,\ntherapeutic interventions for ASD have increasingly utilized Deep learning\npowered-computer vision techniques to monitor individual progress over time.\nThese models are trained on private, non-public datasets from the autism\ncommunity, creating challenges in comparing results across different models due\nto privacy-preserving data-sharing issues. This work introduces MMASD+, an\nenhanced version of the novel open-source dataset called Multimodal ASD\n(MMASD). MMASD+ consists of diverse data modalities, including 3D-Skeleton, 3D\nBody Mesh, and Optical Flow data. It integrates the capabilities of Yolov8 and\nDeep SORT algorithms to distinguish between the therapist and children,\naddressing a significant barrier in the original dataset. 
Additionally, a\nMultimodal Transformer framework is proposed to predict 11 action types and the\npresence of ASD. This framework achieves an accuracy of 95.03% for predicting\naction types and 96.42% for predicting ASD presence, demonstrating over a 10%\nimprovement compared to models trained on single data modalities. These\nfindings highlight the advantages of integrating multiple data modalities\nwithin the Multimodal Transformer framework.\n","authors":["Pavan Uttej Ravva","Behdokht Kiafar","Pinar Kullu","Jicheng Li","Anjana Bhat","Roghayeh Leila Barmaki"],"pdf_url":"https://arxiv.org/pdf/2408.15077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03652v2","updated":"2024-08-28T20:29:12Z","published":"2024-05-06T17:23:42Z","title":"Field-of-View Extension for Brain Diffusion MRI via Deep Generative\n Models","summary":" Purpose: In diffusion MRI (dMRI), the volumetric and bundle analyses of\nwhole-brain tissue microstructure and connectivity can be severely impeded by\nan incomplete field-of-view (FOV). This work aims to develop a method for\nimputing the missing slices directly from existing dMRI scans with an\nincomplete FOV. We hypothesize that the imputed image with complete FOV can\nimprove the whole-brain tractography for corrupted data with incomplete FOV.\nTherefore, our approach provides a desirable alternative to discarding the\nvaluable dMRI data, enabling subsequent tractography analyses that would\notherwise be challenging or unattainable with corrupted data. Approach: We\npropose a framework based on a deep generative model that estimates the absent\nbrain regions in dMRI scans with incomplete FOV. The model is capable of\nlearning both the diffusion characteristics in diffusion-weighted images (DWI)\nand the anatomical features evident in the corresponding structural images for\nefficiently imputing missing slices of DWI outside of incomplete FOV. Results:\nFor evaluating the imputed slices, on the WRAP dataset the proposed framework\nachieved PSNRb0=22.397, SSIMb0=0.905, PSNRb1300=22.479, SSIMb1300=0.893; on the\nNACC dataset it achieved PSNRb0=21.304, SSIMb0=0.892, PSNRb1300=21.599,\nSSIMb1300= 0.877. The proposed framework improved the tractography accuracy, as\ndemonstrated by an increased average Dice score for 72 tracts (p < 0.001) on\nboth the WRAP and NACC datasets. Conclusions: Results suggest that the proposed\nframework achieved sufficient imputation performance in dMRI data with\nincomplete FOV for improving whole-brain tractography, thereby repairing the\ncorrupted data. Our approach achieved more accurate whole-brain tractography\nresults with extended and complete FOV and reduced the uncertainty when\nanalyzing bundles associated with Alzheimer's Disease.\n","authors":["Chenyu Gao","Shunxing Bao","Michael Kim","Nancy Newlin","Praitayini Kanakaraj","Tianyuan Yao","Gaurav Rudravaram","Yuankai Huo","Daniel Moyer","Kurt Schilling","Walter Kukull","Arthur Toga","Derek Archer","Timothy Hohman","Bennett Landman","Zhiyuan Li"],"pdf_url":"https://arxiv.org/pdf/2405.03652v2.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.16123v1","updated":"2024-08-28T20:22:39Z","published":"2024-08-28T20:22:39Z","title":"ChartEye: A Deep Learning Framework for Chart Information Extraction","summary":" The widespread use of charts and infographics as a means of data\nvisualization in various domains has inspired recent research in automated\nchart understanding. 
However, information extraction from chart images is a\ncomplex multi-task process due to style variations and, as a consequence, it\nis challenging to design an end-to-end system. In this study, we propose a deep\nlearning-based framework that provides a solution for key steps in the chart\ninformation extraction pipeline. The proposed framework utilizes hierarchical\nvision transformers for the tasks of chart-type and text-role classification,\nwhile YOLOv7 is used for text detection. The detected text is then enhanced using Super\nResolution Generative Adversarial Networks to improve the recognition output of\nthe OCR. Experimental results on a benchmark dataset show that our proposed\nframework achieves excellent performance at every stage with F1-scores of 0.97\nfor chart-type classification, 0.91 for text-role classification, and a mean\nAverage Precision of 0.95 for text detection.\n","authors":["Osama Mustafa","Muhammad Khizer Ali","Momina Moetesum","Imran Siddiqi"],"pdf_url":"https://arxiv.org/pdf/2408.16123v1.pdf","comment":"8 Pages, and 11 Figures"},{"id":"http://arxiv.org/abs/2408.16117v1","updated":"2024-08-28T20:05:36Z","published":"2024-08-28T20:05:36Z","title":"Alternating Direction Method of Multipliers for Negative Binomial Model\n with The Weighted Difference of Anisotropic and Isotropic Total Variation","summary":" In many applications such as medical imaging, the measurement data represent\ncounts of photons hitting a detector. Such counts in low-photon settings are\noften modeled using a Poisson distribution. However, this model assumes that\nthe mean and variance of the signal's noise distribution are equal. For\noverdispersed data where the variance is greater than the mean, the negative\nbinomial distribution is a more appropriate statistical model. In this paper,\nwe propose an optimization approach for recovering images corrupted by\noverdispersed Poisson noise. In particular, we incorporate a weighted\nanisotropic-isotropic total variation regularizer, which avoids staircasing\nartifacts that are introduced by a regular total variation penalty. We use an\nalternating direction method of multipliers, where each subproblem has a\nclosed-form solution. Numerical experiments demonstrate the effectiveness of\nour proposed approach, especially in very photon-limited settings.\n","authors":["Yu Lu","Kevin Bui","Roummel F. Marcia"],"pdf_url":"https://arxiv.org/pdf/2408.16117v1.pdf","comment":"6 pages, Accepted by the IEEE International Conference on Multimedia\n and Expo (ICME)"},{"id":"http://arxiv.org/abs/2408.03393v2","updated":"2024-08-28T19:56:19Z","published":"2024-08-06T18:38:55Z","title":"Biomedical Image Segmentation: A Systematic Literature Review of Deep\n Learning Based Object Detection Methods","summary":" Biomedical image segmentation plays a vital role in the diagnosis of diseases\nacross various organs. Deep learning-based object detection methods are\ncommonly used for such segmentation. There exists extensive research on this\ntopic. However, there is no standard review on this topic. Existing surveys\noften lack a standardized approach or focus on broader segmentation techniques.\nIn this paper, we conducted a systematic literature review (SLR), collected and\nanalysed 148 articles that explore deep learning object detection methods for\nbiomedical image segmentation. We critically analyzed these methods, identified\nthe key challenges, and discussed the future directions. 
From the selected\narticles we extracted the results including the deep learning models, targeted\nimaging modalities, targeted diseases, and the metrics for the analysis of the\nmethods. The results have been presented in tabular and/or charted forms. The\nresults are presented in three major categories including two stage detection\nmodels, one stage detection models and point-based detection models. Each\narticle is individually analyzed along with its pros and cons. Finally, we\ndiscuss open challenges, potential benefits, and future research directions.\nThis SLR aims to provide the research community with a quick yet deeper\nunderstanding of these segmentation models, ultimately facilitating the\ndevelopment of more powerful solutions for biomedical image analysis.\n","authors":["Fazli Wahid","Yingliang Ma","Dawar Khan","Muhammad Aamir","Syed U. K. Bukhari"],"pdf_url":"https://arxiv.org/pdf/2408.03393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16113v1","updated":"2024-08-28T19:43:48Z","published":"2024-08-28T19:43:48Z","title":"Negative Binomial Matrix Completion","summary":" Matrix completion focuses on recovering missing or incomplete information in\nmatrices. This problem arises in various applications, including image\nprocessing and network analysis. Previous research proposed Poisson matrix\ncompletion for count data with noise that follows a Poisson distribution, which\nassumes that the mean and variance are equal. Since overdispersed count data,\nwhose variance is greater than the mean, is more likely to occur in realistic\nsettings, we assume that the noise follows the negative binomial (NB)\ndistribution, which can be more general than the Poisson distribution. In this\npaper, we introduce NB matrix completion by proposing a nuclear-norm\nregularized model that can be solved by proximal gradient descent. In our\nexperiments, we demonstrate that the NB model outperforms Poisson matrix\ncompletion in various noise and missing data settings on real data.\n","authors":["Yu Lu","Kevin Bui","Roummel F. Marcia"],"pdf_url":"https://arxiv.org/pdf/2408.16113v1.pdf","comment":"6 pages, Accepted by the IEEE International Workshop on Machine\n Learning for Signal Processing (MLSP)"},{"id":"http://arxiv.org/abs/2408.13724v2","updated":"2024-08-28T18:09:49Z","published":"2024-08-25T04:56:09Z","title":"PhysPart: Physically Plausible Part Completion for Interactable Objects","summary":" Interactable objects are ubiquitous in our daily lives. Recent advances in 3D\ngenerative models make it possible to automate the modeling of these objects,\nbenefiting a range of applications from 3D printing to the creation of robot\nsimulation environments. However, while significant progress has been made in\nmodeling 3D shapes and appearances, modeling object physics, particularly for\ninteractable objects, remains challenging due to the physical constraints\nimposed by inter-part motions. In this paper, we tackle the problem of\nphysically plausible part completion for interactable objects, aiming to\ngenerate 3D parts that not only fit precisely into the object but also allow\nsmooth part motions. To this end, we propose a diffusion-based part generation\nmodel that utilizes geometric conditioning through classifier-free guidance and\nformulates physical constraints as a set of stability and mobility losses to\nguide the sampling process. 
Additionally, we demonstrate the generation of\ndependent parts, paving the way toward sequential part generation for objects\nwith complex part-whole hierarchies. Experimentally, we introduce a new metric\nfor measuring physical plausibility based on motion success rates. Our model\noutperforms existing baselines over shape and physical metrics, especially\nthose that do not adequately model physical constraints. We also demonstrate\nour applications in 3D printing, robot manipulation, and sequential part\ngeneration, showing our strength in realistic tasks with the demand for high\nphysical plausibility.\n","authors":["Rundong Luo","Haoran Geng","Congyue Deng","Puhao Li","Zan Wang","Baoxiong Jia","Leonidas Guibas","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2408.13724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14028v2","updated":"2024-08-28T18:06:50Z","published":"2024-08-26T05:38:27Z","title":"SurGen: Text-Guided Diffusion Model for Surgical Video Generation","summary":" Diffusion-based video generation models have made significant strides,\nproducing outputs with improved visual fidelity, temporal coherence, and user\ncontrol. These advancements hold great promise for improving surgical education\nby enabling more realistic, diverse, and interactive simulation environments.\nIn this study, we introduce SurGen, a text-guided diffusion model tailored for\nsurgical video synthesis, producing the highest resolution and longest duration\nvideos among existing surgical video generation models. We validate the visual\nand temporal quality of the outputs using standard image and video generation\nmetrics. Additionally, we assess their alignment to the corresponding text\nprompts through a deep learning classifier trained on surgical data. Our\nresults demonstrate the potential of diffusion models to serve as valuable\neducational tools for surgical trainees.\n","authors":["Joseph Cho","Samuel Schmidgall","Cyril Zakka","Mrudang Mathur","Rohan Shad","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2408.14028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16061v1","updated":"2024-08-28T18:01:00Z","published":"2024-08-28T18:01:00Z","title":"3D Reconstruction with Spatial Memory","summary":" We present Spann3R, a novel approach for dense 3D reconstruction from ordered\nor unordered image collections. Built on the DUSt3R paradigm, Spann3R uses a\ntransformer-based architecture to directly regress pointmaps from images\nwithout any prior knowledge of the scene or camera parameters. Unlike DUSt3R,\nwhich predicts per image-pair pointmaps each expressed in its local coordinate\nframe, Spann3R can predict per-image pointmaps expressed in a global coordinate\nsystem, thus eliminating the need for optimization-based global alignment. The\nkey idea of Spann3R is to manage an external spatial memory that learns to keep\ntrack of all previous relevant 3D information. Spann3R then queries this\nspatial memory to predict the 3D structure of the next frame in a global\ncoordinate system. Taking advantage of DUSt3R's pre-trained weights, and\nfurther fine-tuning on a subset of datasets, Spann3R shows competitive\nperformance and generalization ability on various unseen datasets and can\nprocess ordered image collections in real time. 
Project page:\n\\url{https://hengyiwang.github.io/projects/spanner}\n","authors":["Hengyi Wang","Lourdes Agapito"],"pdf_url":"https://arxiv.org/pdf/2408.16061v1.pdf","comment":"Project page: \\url{https://hengyiwang.github.io/projects/spanner}"},{"id":"http://arxiv.org/abs/2011.08388v3","updated":"2024-08-28T22:05:07Z","published":"2020-11-17T02:55:16Z","title":"Interpretable Image Emotion Recognition: A Domain Adaptation Approach\n Using Facial Expressions","summary":" This paper proposes a feature-based domain adaptation technique for\nidentifying emotions in generic images, encompassing both facial and non-facial\nobjects, as well as non-human components. This approach addresses the challenge\nof the limited availability of pre-trained models and well-annotated datasets\nfor Image Emotion Recognition (IER). Initially, a deep-learning-based Facial\nExpression Recognition (FER) system is developed, classifying facial images\ninto discrete emotion classes. Maintaining the same network architecture, this\nFER system is then adapted to recognize emotions in generic images through the\napplication of discrepancy loss, enabling the model to effectively learn IER\nfeatures while classifying emotions into categories such as 'happy,' 'sad,'\n'hate,' and 'anger.' Additionally, a novel interpretability method, Divide and\nConquer based Shap (DnCShap), is introduced to elucidate the visual features\nmost relevant for emotion recognition. The proposed IER system demonstrated\nemotion classification accuracies of 60.98% for the IAPSa dataset, 58.86% for\nthe ArtPhoto dataset, 69.13% for the FI dataset, and 58.06% for the EMOTIC\ndataset. The system effectively identifies the important visual features\nleading to specific emotion classifications and provides detailed embedding\nplots to explain the predictions, enhancing the understanding and trust in\nAI-driven emotion recognition systems.\n","authors":["Puneet Kumar","Balasubramanian Raman"],"pdf_url":"https://arxiv.org/pdf/2011.08388v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.15953v1","updated":"2024-08-28T17:12:01Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing the sequence of historical interactions between users and items,\nsequential recommendation models learn user intent and make predictions about\nthe next item of interest. Next to these item interactions, most systems also\nhave interactions with pages not related to specific items, for example\nnavigation pages, account pages, and pages for a specific category, which may\nprovide additional insights into the user's interests. However, while there are\nseveral approaches to integrate additional information about items and users,\nthe topic of integrating non-item pages has been less explored. 
We use the\nhypotheses testing framework HypTrails to show that there is indeed a\nrelationship between these non-item pages and the items of interest and fill\nthis gap by proposing various approaches of representing non-item pages (e.g,\nbased on their content) to use them as an additional information source for the\ntask of sequential next-item prediction.\n We create a synthetic dataset with non-item pages highly related to the\nsubsequent item to show that the models are generally capable of learning from\nthese interactions, and subsequently evaluate the improvements gained by\nincluding non-item pages in two real-world datasets.\n We adapt eight popular sequential recommender models, covering CNN-, RNN- and\ntransformer-based architectures, to integrate non-item pages and investigate\nthe capabilities of these models to leverage their information for next item\nprediction. We also analyze their behavior on noisy data and compare different\nitem representation strategies.\n Our results show that non-item pages are a valuable source of information,\nbut representing such a page well is the key to successfully leverage them. The\ninclusion of non-item pages can increase the performance for next-item\nprediction in all examined model architectures with a varying degree.\n","authors":["Elisabeth Fischer","Daniel Schlör","Albin Zehe","Andreas Hotho"],"pdf_url":"https://arxiv.org/pdf/2408.15953v1.pdf","comment":"36 pages, 19 figures; Work in Progress"},{"id":"http://arxiv.org/abs/2408.15836v1","updated":"2024-08-28T14:48:37Z","published":"2024-08-28T14:48:37Z","title":"Knowledge Navigator: LLM-guided Browsing Framework for Exploratory\n Search in Scientific Literature","summary":" The exponential growth of scientific literature necessitates advanced tools\nfor effective knowledge exploration. We present Knowledge Navigator, a system\ndesigned to enhance exploratory search abilities by organizing and structuring\nthe retrieved documents from broad topical queries into a navigable, two-level\nhierarchy of named and descriptive scientific topics and subtopics. This\nstructured organization provides an overall view of the research themes in a\ndomain, while also enabling iterative search and deeper knowledge discovery\nwithin specific subtopics by allowing users to refine their focus and retrieve\nadditional relevant documents. Knowledge Navigator combines LLM capabilities\nwith cluster-based methods to enable an effective browsing method. We\ndemonstrate our approach's effectiveness through automatic and manual\nevaluations on two novel benchmarks, CLUSTREC-COVID and SCITOC. Our code,\nprompts, and benchmarks are made publicly available.\n","authors":["Uri Katz","Mosh Levy","Yoav Goldberg"],"pdf_url":"https://arxiv.org/pdf/2408.15836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15796v1","updated":"2024-08-28T13:42:28Z","published":"2024-08-28T13:42:28Z","title":"Evaluating Named Entity Recognition Using Few-Shot Prompting with Large\n Language Models","summary":" This paper evaluates Few-Shot Prompting with Large Language Models for Named\nEntity Recognition (NER). Traditional NER systems rely on extensive labeled\ndatasets, which are costly and time-consuming to obtain. Few-Shot Prompting or\nin-context learning enables models to recognize entities with minimal examples.\nWe assess state-of-the-art models like GPT-4 in NER tasks, comparing their\nfew-shot performance to fully supervised benchmarks. 
Results show that while\nthere is a performance gap, large models excel in adapting to new entity types\nand domains with very limited data. We also explore the effects of prompt\nengineering, guided output format and context length on performance. This study\nunderscores Few-Shot Learning's potential to reduce the need for large labeled\ndatasets, enhancing NER scalability and accessibility.\n","authors":["Hédi Zhegidi","Ludovic Moncla"],"pdf_url":"https://arxiv.org/pdf/2408.15796v1.pdf","comment":"Github repo: https://github.com/GEODE-project/ner-llm"},{"id":"http://arxiv.org/abs/2408.15787v1","updated":"2024-08-28T13:29:59Z","published":"2024-08-28T13:29:59Z","title":"Interactive Agents: Simulating Counselor-Client Psychological Counseling\n via Role-Playing LLM-to-LLM Interactions","summary":" Virtual counselors powered by large language models (LLMs) aim to create\ninteractive support systems that effectively assist clients struggling with\nmental health challenges. To replicate counselor-client conversations,\nresearchers have built an online mental health platform that allows\nprofessional counselors to provide clients with text-based counseling services\nfor about an hour per session. Notwithstanding its effectiveness, challenges\nexist as human annotation is time-consuming, cost-intensive, privacy-protected,\nand not scalable. To address this issue and investigate the applicability of\nLLMs in psychological counseling conversation simulation, we propose a\nframework that employs two LLMs via role-playing for simulating\ncounselor-client interactions. Our framework involves two LLMs, one acting as a\nclient equipped with a specific and real-life user profile and the other\nplaying the role of an experienced counselor, generating professional responses\nusing integrative therapy techniques. We implement both the counselor and the\nclient by zero-shot prompting the GPT-4 model. In order to assess the\neffectiveness of LLMs in simulating counselor-client interactions and\nunderstand the disparities between LLM- and human-generated conversations, we\nevaluate the synthetic data from various perspectives. We begin by assessing\nthe client's performance through automatic evaluations. Next, we analyze and\ncompare the disparities between dialogues generated by the LLM and those\ngenerated by professional counselors. Furthermore, we conduct extensive\nexperiments to thoroughly examine the performance of our LLM-based counselor\ntrained with synthetic interactive dialogues by benchmarking against\nstate-of-the-art models for mental health.\n","authors":["Huachuan Qiu","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2408.15787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14432v2","updated":"2024-08-28T12:39:57Z","published":"2024-08-26T17:20:34Z","title":"Contextual Bandit with Herding Effects: Algorithms and Recommendation\n Applications","summary":" Contextual bandits serve as a fundamental algorithmic framework for\noptimizing recommendation decisions online. Though extensive attention has been\npaid to tailoring contextual bandits for recommendation applications, the\n\"herding effects\" in user feedback have been ignored. These herding effects\nbias user feedback toward historical ratings, breaking down the assumption of\nunbiased feedback inherent in contextual bandits. This paper develops a novel\nvariant of the contextual bandit that is tailored to address the feedback bias\ncaused by the herding effects. 
A user feedback model is formulated to capture\nthis feedback bias. We design the TS-Conf (Thompson Sampling under Conformity)\nalgorithm, which employs posterior sampling to balance the exploration and\nexploitation tradeoff. We prove an upper bound for the regret of the algorithm,\nrevealing the impact of herding effects on learning speed. Extensive\nexperiments on datasets demonstrate that TS-Conf outperforms four benchmark\nalgorithms. Analysis reveals that TS-Conf effectively mitigates the negative\nimpact of herding effects, resulting in faster learning and improved\nrecommendation accuracy.\n","authors":["Luyue Xu","Liming Wang","Hong Xie","Mingqiang Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.14432v2.pdf","comment":"Published as a conference paper at PRICAI 2024"},{"id":"http://arxiv.org/abs/2408.15688v1","updated":"2024-08-28T10:25:36Z","published":"2024-08-28T10:25:36Z","title":"PDSR: A Privacy-Preserving Diversified Service Recommendation Method on\n Distributed Data","summary":" The last decade has witnessed a tremendous growth of service computing, while\nefficient service recommendation methods are desired to recommend high-quality\nservices to users. It is well known that collaborative filtering is one of the\nmost popular methods for service recommendation based on QoS, and many existing\nproposals focus on improving recommendation accuracy, i.e., recommending\nhigh-quality redundant services. Nevertheless, users may have different\nrequirements on QoS, and hence diversified recommendation has been attracting\nincreasing attention in recent years to fulfill users' diverse demands and to\nexplore potential services. Unfortunately, the recommendation performances\nrelies on a large volume of data (e.g., QoS data), whereas the data may be\ndistributed across multiple platforms. Therefore, to enable data sharing across\nthe different platforms for diversified service recommendation, we propose a\nPrivacy-preserving Diversified Service Recommendation (PDSR) method.\nSpecifically, we innovate in leveraging the Locality-Sensitive Hashing (LSH)\nmechanism such that privacy-preserved data sharing across different platforms\nis enabled to construct a service similarity graph. Based on the similarity\ngraph, we propose a novel accuracy-diversity metric and design a\n$2$-approximation algorithm to select $K$ services to recommend by maximizing\nthe accuracy-diversity measure. Extensive experiments on real datasets are\nconducted to verify the efficacy of our PDSR method.\n","authors":["Lina Wang","Huan Yang","Yiran Shen","Chao Liu","Lianyong Qi","Xiuzhen Cheng","Feng Li"],"pdf_url":"https://arxiv.org/pdf/2408.15688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11245v4","updated":"2024-08-28T08:51:57Z","published":"2022-05-18T04:38:15Z","title":"PASH at TREC 2021 Deep Learning Track: Generative Enhanced Model for\n Multi-stage Ranking","summary":" This paper describes the PASH participation in TREC 2021 Deep Learning Track.\nIn the recall stage, we adopt a scheme combining sparse and dense retrieval\nmethod. In the multi-stage ranking phase, point-wise and pair-wise ranking\nstrategies are used one after another based on model continual pre-trained on\ngeneral knowledge and document-level data. 
Compared to TREC 2020 Deep Learning\nTrack, we have additionally introduced the generative model T5 to further\nenhance the performance.\n","authors":["Yixuan Qiao","Hao Chen","Jun Wang","Tuozhen Liu","Xianbin Ye","Xin Tang","Rui Fang","Peng Gao","Wenfeng Xie","Guotong Xie"],"pdf_url":"https://arxiv.org/pdf/2205.11245v4.pdf","comment":"TREC 2021"},{"id":"http://arxiv.org/abs/2408.15620v1","updated":"2024-08-28T08:21:56Z","published":"2024-08-28T08:21:56Z","title":"CAPER: Enhancing Career Trajectory Prediction using Temporal Knowledge\n Graph and Ternary Relationship","summary":" The problem of career trajectory prediction (CTP) aims to predict one's\nfuture employer or job position. While several CTP methods have been developed\nfor this problem, we posit that none of these methods (1) jointly considers the\nmutual ternary dependency between three key units (i.e., user, position, and\ncompany) of a career and (2) captures the characteristic shifts of key units in\ncareer over time, leading to an inaccurate understanding of the job movement\npatterns in the labor market. To address the above challenges, we propose a\nnovel solution, named as CAPER, that solves the challenges via sophisticated\ntemporal knowledge graph (TKG) modeling. It enables the utilization of a\ngraph-structured knowledge base with rich expressiveness, effectively\npreserving the changes in job movement patterns. Furthermore, we devise an\nextrapolated career reasoning task on TKG for a realistic evaluation. The\nexperiments on a real-world career trajectory dataset demonstrate that CAPER\nconsistently and significantly outperforms four baselines, two recent TKG\nreasoning methods, and five state-of-the-art CTP methods in predicting one's\nfuture companies and positions-i.e., on average, yielding 6.80% and 34.58% more\naccurate predictions, respectively.\n","authors":["Yeon-Chang Lee","JaeHyun Lee","Michiharu Yamashita","Dongwon Lee","Sang-Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2408.15620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15575v1","updated":"2024-08-28T07:00:19Z","published":"2024-08-28T07:00:19Z","title":"Lyrically Speaking: Exploring the Link Between Lyrical Emotions, Themes\n and Depression Risk","summary":" Lyrics play a crucial role in affecting and reinforcing emotional states by\nproviding meaning and emotional connotations that interact with the acoustic\nproperties of the music. Specific lyrical themes and emotions may intensify\nexisting negative states in listeners and may lead to undesirable outcomes,\nespecially in listeners with mood disorders such as depression. Hence, it is\nimportant for such individuals to be mindful of their listening strategies. In\nthis study, we examine online music consumption of individuals at risk of\ndepression in light of lyrical themes and emotions. Lyrics obtained from the\nlistening histories of 541 Last.fm users, divided into At-Risk and No-Risk\nbased on their mental well-being scores, were analyzed using natural language\nprocessing techniques. Statistical analyses of the results revealed that\nindividuals at risk for depression prefer songs with lyrics associated with low\nvalence and low arousal. Additionally, lyrics associated with themes of denial,\nself-reference, and ambivalence were preferred. In contrast, themes such as\nliberation, familiarity, and activity are not as favored. 
This study opens up\nthe possibility of an approach to assessing depression risk from the digital\nfootprint of individuals and potentially developing personalized recommendation\nsystems.\n","authors":["Pavani Chowdary","Bhavyajeet Singh","Rajat Agarwal","Vinoo Alluri"],"pdf_url":"https://arxiv.org/pdf/2408.15575v1.pdf","comment":"Accepted at the 25th International Society for Music Information\n Retrieval Conference (ISMIR) 2024, San Francisco, United States"},{"id":"http://arxiv.org/abs/2408.07611v2","updated":"2024-08-28T03:47:28Z","published":"2024-08-14T15:19:16Z","title":"WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation\n Integrating Web Search and Knowledge Graphs","summary":" Large Language Models (LLMs) have greatly contributed to the development of\nadaptive intelligent agents and are positioned as an important way to achieve\nArtificial General Intelligence (AGI). However, LLMs are prone to produce\nfactually incorrect information and often produce \"phantom\" content that\nundermines their reliability, which poses a serious challenge for their\ndeployment in real-world scenarios. Enhancing LLMs by combining external\ndatabases and information retrieval mechanisms is an effective path. To address\nthe above challenges, we propose a new approach called WeKnow-RAG, which\nintegrates Web search and Knowledge Graphs into a \"Retrieval-Augmented\nGeneration (RAG)\" system. First, the accuracy and reliability of LLM responses\nare improved by combining the structured representation of Knowledge Graphs\nwith the flexibility of dense vector retrieval. WeKnow-RAG then utilizes\ndomain-specific knowledge graphs to satisfy a variety of queries and domains,\nthereby improving performance on factual information and complex reasoning\ntasks by employing multi-stage web page retrieval techniques using both sparse\nand dense retrieval methods. Our approach effectively balances the efficiency\nand accuracy of information retrieval, thus improving the overall retrieval\nprocess. Finally, we also integrate a self-assessment mechanism for the LLM to\nevaluate the trustworthiness of the answers it generates. Our approach proves\nits outstanding effectiveness in a wide range of offline experiments and online\nsubmissions.\n","authors":["Weijian Xie","Xuefeng Liang","Yuhui Liu","Kaihua Ni","Hong Cheng","Zetian Hu"],"pdf_url":"https://arxiv.org/pdf/2408.07611v2.pdf","comment":"8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta\n KDD Cup 2024 CRAG Challenge"},{"id":"http://arxiv.org/abs/2402.07926v2","updated":"2024-08-28T18:53:17Z","published":"2024-02-05T18:16:04Z","title":"From Data Creator to Data Reuser: Distance Matters","summary":" Sharing research data is necessary, but not sufficient, for data reuse. Open\nscience policies focus more heavily on data sharing than on reuse, yet both are\ncomplex, labor-intensive, expensive, and require infrastructure investments by\nmultiple stakeholders. The value of data reuse lies in relationships between\ncreators and reusers. By addressing knowledge exchange, rather than mere\ntransactions between stakeholders, investments in data management and knowledge\ninfrastructures can be made more wisely. Drawing upon empirical studies of data\nsharing and reuse, we develop the theoretical construct of distance between\ndata creator and data reuser, identifying six distance dimensions that\ninfluence the ability to transfer knowledge effectively: domain, methods,\ncollaboration, curation, purposes, and time and temporality. 
We address the\nsocial and socio-technical aspects of these dimensions, exploring ways in which\nthey may decrease -- or increase -- distances between creators and reusers. Our\ntheoretical framing of the distance between data creators and prospective\nreusers leads to recommendations to four categories of stakeholders on how to\nmake data sharing and reuse more effective: data creators, data reusers, data\narchivists, and funding agencies. 'It takes a village' to share research data\n-- and a village to reuse data. Our aim is to provoke new research questions,\nnew research, and new investments in effective and efficient circulation of\nresearch data; and to identify criteria for investments at each stage of data\nand research life cycles.\n","authors":["Christine L. Borgman","Paul T. Groth"],"pdf_url":"https://arxiv.org/pdf/2402.07926v2.pdf","comment":"74 pages, double-spaced, consisting of Table of Contents, Abstract,\n 45 page narrative, 1 box, 1 figure, 1 table, 27 pages references. Original\n work"},{"id":"http://arxiv.org/abs/2408.16036v1","updated":"2024-08-28T16:16:55Z","published":"2024-08-28T16:16:55Z","title":"Efficient $k$-NN Search in IoT Data: Overlap Optimization in Tree-Based\n Indexing Structures","summary":" The proliferation of interconnected devices in the Internet of Things (IoT)\nhas led to an exponential increase in data, commonly known as Big IoT Data.\nEfficient retrieval of this heterogeneous data demands a robust indexing\nmechanism for effective organization. However, a significant challenge remains:\nthe overlap in data space partitions during index construction. This overlap\nincreases node access during search and retrieval, resulting in higher resource\nconsumption, performance bottlenecks, and impedes system scalability. To\naddress this issue, we propose three innovative heuristics designed to quantify\nand strategically reduce data space partition overlap. The volume-based method\n(VBM) offers a detailed assessment by calculating the intersection volume\nbetween partitions, providing deeper insights into spatial relationships. The\ndistance-based method (DBM) enhances efficiency by using the distance between\npartition centers and radii to evaluate overlap, offering a streamlined yet\naccurate approach. Finally, the object-based method (OBM) provides a practical\nsolution by counting objects across multiple partitions, delivering an\nintuitive understanding of data space dynamics. Experimental results\ndemonstrate the effectiveness of these methods in reducing search time,\nunderscoring their potential to improve data space partitioning and enhance\noverall system performance.\n","authors":["Ala-Eddine Benrazek","Zineddine Kouahla","Brahim Farou","Hamid Seridi","Ibtissem Kemouguette"],"pdf_url":"https://arxiv.org/pdf/2408.16036v1.pdf","comment":"28 pages, 21 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.16032v1","updated":"2024-08-28T10:31:50Z","published":"2024-08-28T10:31:50Z","title":"An Extremely Data-efficient and Generative LLM-based Reinforcement\n Learning Agent for Recommenders","summary":" Recent advancements in large language models (LLMs) have enabled\nunderstanding webpage contexts, product details, and human instructions.\nUtilizing LLMs as the foundational architecture for either reward models or\npolicies in reinforcement learning has gained popularity -- a notable\nachievement is the success of InstructGPT. 
RL algorithms have been instrumental\nin maximizing long-term customer satisfaction and avoiding short-term, myopic\ngoals in industrial recommender systems, which often rely on deep learning\nmodels to predict immediate clicks or purchases.\n In this project, several RL methods are implemented and evaluated using the\nWebShop benchmark environment, data, simulator, and pre-trained model\ncheckpoints. The goal is to train an RL agent to maximize the purchase reward\ngiven a detailed human instruction describing a desired product. The RL agents\nare developed by fine-tuning a pre-trained BERT model with various objectives,\nlearning from preferences without a reward model, and employing contemporary\ntraining techniques such as Proximal Policy Optimization (PPO) as used in\nInstructGPT, and Direct Preference Optimization (DPO). This report also\nevaluates the RL agents trained using generative trajectories. Evaluations were\nconducted using Thompson sampling in the WebShop simulator environment.\n The simulated online experiments demonstrate that agents trained on generated\ntrajectories exhibited comparable task performance to those trained using human\ntrajectories. This has demonstrated an example of an extremely low-cost\ndata-efficient way of training reinforcement learning agents. Also, with\nlimited training time (<2hours), without utilizing any images, a DPO agent\nachieved a 19% success rate after approximately 3000 steps or 30 minutes of\ntraining on T4 GPUs, compared to a PPO agent, which reached a 15% success rate.\n","authors":["Shuang Feng","Grace Feng"],"pdf_url":"https://arxiv.org/pdf/2408.16032v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.15999v1","updated":"2024-08-28T18:05:53Z","published":"2024-08-28T18:05:53Z","title":"Q-MRS: A Deep Learning Framework for Quantitative Magnetic Resonance\n Spectra Analysis","summary":" Magnetic resonance spectroscopy (MRS) is an established technique for\nstudying tissue metabolism, particularly in central nervous system disorders.\nWhile powerful and versatile, MRS is often limited by challenges associated\nwith data quality, processing, and quantification. Existing MRS quantification\nmethods face difficulties in balancing model complexity and reproducibility\nduring spectral modeling, often falling into the trap of either\noversimplification or over-parameterization. To address these limitations, this\nstudy introduces a deep learning (DL) framework that employs transfer learning,\nin which the model is pre-trained on simulated datasets before it undergoes\nfine-tuning on in vivo data. The proposed framework showed promising\nperformance when applied to the Philips dataset from the BIG GABA repository\nand represents an exciting advancement in MRS data analysis.\n","authors":["Christopher J. Wu","Lawrence S. Kegeles","Jia Guo"],"pdf_url":"https://arxiv.org/pdf/2408.15999v1.pdf","comment":"8 pages, 4 figures, and 3 tables for the main body; 9 pages, 4\n figures, and 3 tables for the supplementary material"},{"id":"http://arxiv.org/abs/2408.15998v1","updated":"2024-08-28T17:59:31Z","published":"2024-08-28T17:59:31Z","title":"Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of\n Encoders","summary":" The ability to accurately interpret complex visual information is a crucial\ntopic of multimodal large language models (MLLMs). 
Recent work indicates that\nenhanced visual perception significantly reduces hallucinations and improves\nperformance on resolution-sensitive tasks, such as optical character\nrecognition and document analysis. A number of recent MLLMs achieve this goal\nusing a mixture of vision encoders. Despite their success, there is a lack of\nsystematic comparisons and detailed ablation studies addressing critical\naspects, such as expert selection and the integration of multiple vision\nexperts. This study provides an extensive exploration of the design space for\nMLLMs using a mixture of vision encoders and resolutions. Our findings reveal\nseveral underlying principles common to various existing strategies, leading to\na streamlined yet effective design approach. We discover that simply\nconcatenating visual tokens from a set of complementary vision encoders is as\neffective as more complex mixing architectures or strategies. We additionally\nintroduce Pre-Alignment to bridge the gap between vision-focused encoders and\nlanguage tokens, enhancing model coherence. The resulting family of MLLMs,\nEagle, surpasses other leading open-source models on major MLLM benchmarks.\nModels and code: https://github.com/NVlabs/Eagle\n","authors":["Min Shi","Fuxiao Liu","Shihao Wang","Shijia Liao","Subhashree Radhakrishnan","De-An Huang","Hongxu Yin","Karan Sapra","Yaser Yacoob","Humphrey Shi","Bryan Catanzaro","Andrew Tao","Jan Kautz","Zhiding Yu","Guilin Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15998v1.pdf","comment":"Github: https://github.com/NVlabs/Eagle, HuggingFace:\n https://huggingface.co/NVEagle"},{"id":"http://arxiv.org/abs/2408.15997v1","updated":"2024-08-28T17:59:27Z","published":"2024-08-28T17:59:27Z","title":"Mamba or Transformer for Time Series Forecasting? Mixture of Universals\n (MoU) Is All You Need","summary":" Time series forecasting requires balancing short-term and long-term\ndependencies for accurate predictions. Existing methods mainly focus on\nlong-term dependency modeling, neglecting the complexities of short-term\ndynamics, which may hinder performance. Transformers are superior in modeling\nlong-term dependencies but are criticized for their quadratic computational\ncost. Mamba provides a near-linear alternative but is reported less effective\nin time series longterm forecasting due to potential information loss. Current\narchitectures fall short in offering both high efficiency and strong\nperformance for long-term dependency modeling. To address these challenges, we\nintroduce Mixture of Universals (MoU), a versatile model to capture both\nshort-term and long-term dependencies for enhancing performance in time series\nforecasting. MoU is composed of two novel designs: Mixture of Feature\nExtractors (MoF), an adaptive method designed to improve time series patch\nrepresentations for short-term dependency, and Mixture of Architectures (MoA),\nwhich hierarchically integrates Mamba, FeedForward, Convolution, and\nSelf-Attention architectures in a specialized order to model long-term\ndependency from a hybrid perspective. The proposed approach achieves\nstate-of-the-art performance while maintaining relatively low computational\ncosts. Extensive experiments on seven real-world datasets demonstrate the\nsuperiority of MoU. 
Code is available at https://github.com/lunaaa95/mou/.\n","authors":["Sijia Peng","Yun Xiong","Yangyong Zhu","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2408.15997v1.pdf","comment":"Code at https://github.com/lunaaa95/mou/"},{"id":"http://arxiv.org/abs/2408.15993v1","updated":"2024-08-28T17:58:53Z","published":"2024-08-28T17:58:53Z","title":"ClimDetect: A Benchmark Dataset for Climate Change Detection and\n Attribution","summary":" Detecting and attributing temperature increases due to climate change is\ncrucial for understanding global warming and guiding adaptation strategies. The\ncomplexity of distinguishing human-induced climate signals from natural\nvariability has challenged traditional detection and attribution (D&A)\napproaches, which seek to identify specific \"fingerprints\" in climate response\nvariables. Deep learning offers potential for discerning these complex patterns\nin expansive spatial datasets. However, lack of standard protocols has hindered\nconsistent comparisons across studies. We introduce ClimDetect, a standardized\ndataset of over 816k daily climate snapshots, designed to enhance model\naccuracy in identifying climate change signals. ClimDetect integrates various\ninput and target variables used in past research, ensuring comparability and\nconsistency. We also explore the application of vision transformers (ViT) to\nclimate data, a novel and modernizing approach in this context. Our open-access\ndata and code serve as a benchmark for advancing climate science through\nimproved model evaluations. ClimDetect is publicly accessible via Huggingface\ndataet respository at: https://huggingface.co/datasets/ClimDetect/ClimDetect.\n","authors":["Sungduk Yu","Brian L. White","Anahita Bhiwandiwalla","Musashi Hinck","Matthew Lyle Olson","Tung Nguyen","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2408.15993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15992v1","updated":"2024-08-28T17:58:39Z","published":"2024-08-28T17:58:39Z","title":"CoGen: Learning from Feedback with Coupled Comprehension and Generation","summary":" Systems with both language comprehension and generation capabilities can\nbenefit from the tight connection between the two. This work studies coupling\ncomprehension and generation with focus on continually learning from\ninteraction with users. We propose techniques to tightly integrate the two\ncapabilities for both learning and inference. We situate our studies in\ntwo-player reference games, and deploy various models for thousands of\ninteractions with human users, while learning from interaction feedback\nsignals. We show dramatic improvements in performance over time, with\ncomprehension-generation coupling leading to performance improvements up to 26%\nin absolute terms and up to 17% higher accuracies compared to a non-coupled\nsystem. Our analysis also shows coupling has substantial qualitative impact on\nthe system's language, making it significantly more human-like.\n","authors":["Mustafa Omer Gul","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2408.15992v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.17701v5","updated":"2024-08-28T17:47:35Z","published":"2024-04-26T20:59:23Z","title":"Embedded FPGA Developments in 130nm and 28nm CMOS for Machine Learning\n in Particle Detector Readout","summary":" Embedded field programmable gate array (eFPGA) technology allows the\nimplementation of reconfigurable logic within the design of an\napplication-specific integrated circuit (ASIC). 
This approach offers the low\npower and efficiency of an ASIC along with the ease of FPGA configuration,\nparticularly beneficial for the use case of machine learning in the data\npipeline of next-generation collider experiments. An open-source framework\ncalled \"FABulous\" was used to design eFPGAs using 130 nm and 28 nm CMOS\ntechnology nodes, which were subsequently fabricated and verified through\ntesting. The capability of an eFPGA to act as a front-end readout chip was\nassessed using simulation of high energy particles passing through a silicon\npixel sensor. A machine learning-based classifier, designed for reduction of\nsensor data at the source, was synthesized and configured onto the eFPGA. A\nsuccessful proof-of-concept was demonstrated through reproduction of the\nexpected algorithm result on the eFPGA with perfect accuracy. Further\ndevelopment of the eFPGA technology and its application to collider detector\nreadout is discussed.\n","authors":["Julia Gonski","Aseem Gupta","Haoyi Jia","Hyunjoon Kim","Lorenzo Rota","Larry Ruckman","Angelo Dragone","Ryan Herbst"],"pdf_url":"https://arxiv.org/pdf/2404.17701v5.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.15969v1","updated":"2024-08-28T17:43:18Z","published":"2024-08-28T17:43:18Z","title":"Stability of Primal-Dual Gradient Flow Dynamics for Multi-Block Convex\n Optimization Problems","summary":" We examine stability properties of primal-dual gradient flow dynamics for\ncomposite convex optimization problems with multiple, possibly nonsmooth, terms\nin the objective function under the generalized consensus constraint. The\nproposed dynamics are based on the proximal augmented Lagrangian and they\nprovide a viable alternative to ADMM which faces significant challenges from\nboth analysis and implementation viewpoints in large-scale multi-block\nscenarios. In contrast to customized algorithms with individualized convergence\nguarantees, we provide a systematic approach for solving a broad class of\nchallenging composite optimization problems. We leverage various structural\nproperties to establish global (exponential) convergence guarantees for the\nproposed dynamics. Our assumptions are much weaker than those required to prove\n(exponential) stability of various primal-dual dynamics as well as (linear)\nconvergence of discrete-time methods, e.g., standard two-block and multi-block\nADMM and EXTRA algorithms. Finally, we show necessity of some of our structural\nassumptions for exponential stability and provide computational experiments to\ndemonstrate the convenience of the proposed dynamics for parallel and\ndistributed computing applications.\n","authors":["Ibrahim K. Ozaslan","Panagiotis Patrinos","Mihailo R. Jovanović"],"pdf_url":"https://arxiv.org/pdf/2408.15969v1.pdf","comment":"31 pages; 4 figures"},{"id":"http://arxiv.org/abs/2406.10260v2","updated":"2024-08-28T17:26:03Z","published":"2024-06-11T01:16:10Z","title":"Flextron: Many-in-One Flexible Large Language Model","summary":" Training modern LLMs is extremely resource intensive, and customizing them\nfor various deployment scenarios characterized by limited compute and memory\nresources through repeated training is impractical. In this paper, we introduce\nFlextron, a network architecture and post-training model optimization framework\nsupporting flexible model deployment. 
The Flextron architecture utilizes a\nnested elastic structure to rapidly adapt to specific user-defined latency and\naccuracy targets during inference with no additional fine-tuning required. It\nis also input-adaptive, and can automatically route tokens through its\nsub-networks for improved performance and efficiency. We present a\nsample-efficient training method and associated routing algorithms for\nsystematically transforming an existing trained LLM into a Flextron model. We\nevaluate Flextron on the GPT-3 and LLama-2 family of LLMs, and demonstrate\nsuperior performance over multiple end-to-end trained variants and other\nstate-of-the-art elastic networks, all with a single pretraining run that\nconsumes a mere 7.63% tokens compared to original pretraining.\n","authors":["Ruisi Cai","Saurav Muralidharan","Greg Heinrich","Hongxu Yin","Zhangyang Wang","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2406.10260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15958v1","updated":"2024-08-28T17:20:56Z","published":"2024-08-28T17:20:56Z","title":"Efficient Slice Anomaly Detection Network for 3D Brain MRI Volume","summary":" Current anomaly detection methods excel with benchmark industrial data but\nstruggle with natural images and medical data due to varying definitions of\n'normal' and 'abnormal.' This makes accurate identification of deviations in\nthese fields particularly challenging. Especially for 3D brain MRI data, all\nthe state-of-the-art models are reconstruction-based with 3D convolutional\nneural networks which are memory-intensive, time-consuming and producing noisy\noutputs that require further post-processing. We propose a framework called\nSimple Slice-based Network (SimpleSliceNet), which utilizes a model pre-trained\non ImageNet and fine-tuned on a separate MRI dataset as a 2D slice feature\nextractor to reduce computational cost. We aggregate the extracted features to\nperform anomaly detection tasks on 3D brain MRI volumes. Our model integrates a\nconditional normalizing flow to calculate log likelihood of features and\nemploys the Semi-Push-Pull Mechanism to enhance anomaly detection accuracy. The\nresults indicate improved performance, showcasing our model's remarkable\nadaptability and effectiveness when addressing the challenges exists in brain\nMRI data. In addition, for the large-scale 3D brain volumes, our model\nSimpleSliceNet outperforms the state-of-the-art 2D and 3D models in terms of\naccuracy, memory usage and time consumption. Code is available at:\nhttps://anonymous.4open.science/r/SimpleSliceNet-8EA3.\n","authors":["Zeduo Zhang","Yalda Mohsenzadeh"],"pdf_url":"https://arxiv.org/pdf/2408.15958v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.15956v1","updated":"2024-08-28T17:17:20Z","published":"2024-08-28T17:17:20Z","title":"Generating Binary Species Range Maps","summary":" Accurately predicting the geographic ranges of species is crucial for\nassisting conservation efforts. Traditionally, range maps were manually created\nby experts. However, species distribution models (SDMs) and, more recently,\ndeep learning-based variants offer a potential automated alternative. Deep\nlearning-based SDMs generate a continuous probability representing the\npredicted presence of a species at a given location, which must be binarized by\nsetting per-species thresholds to obtain binary range maps. 
However, selecting\nappropriate per-species thresholds to binarize these predictions is non-trivial\nas different species can require distinct thresholds. In this work, we evaluate\ndifferent approaches for automatically identifying the best thresholds for\nbinarizing range maps using presence-only data. This includes approaches that\nrequire the generation of additional pseudo-absence data, along with ones that\nonly require presence data. We also propose an extension of an existing\npresence-only technique that is more robust to outliers. We perform a detailed\nevaluation of different thresholding techniques on the tasks of binary range\nestimation and large-scale fine-grained visual classification, and we\ndemonstrate improved performance over existing pseudo-absence free approaches\nusing our method.\n","authors":["Filip Dorm","Christian Lange","Scott Loarie","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2408.15956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15953v1","updated":"2024-08-28T17:12:01Z","published":"2024-08-28T17:12:01Z","title":"Modeling and Analyzing the Influence of Non-Item Pages on Sequential\n Next-Item Prediction","summary":" Analyzing the sequence of historical interactions between users and items,\nsequential recommendation models learn user intent and make predictions about\nthe next item of interest. Next to these item interactions, most systems also\nhave interactions with pages not related to specific items, for example\nnavigation pages, account pages, and pages for a specific category, which may\nprovide additional insights into the user's interests. However, while there are\nseveral approaches to integrate additional information about items and users,\nthe topic of integrating non-item pages has been less explored. We use the\nhypotheses testing framework HypTrails to show that there is indeed a\nrelationship between these non-item pages and the items of interest and fill\nthis gap by proposing various approaches of representing non-item pages (e.g,\nbased on their content) to use them as an additional information source for the\ntask of sequential next-item prediction.\n We create a synthetic dataset with non-item pages highly related to the\nsubsequent item to show that the models are generally capable of learning from\nthese interactions, and subsequently evaluate the improvements gained by\nincluding non-item pages in two real-world datasets.\n We adapt eight popular sequential recommender models, covering CNN-, RNN- and\ntransformer-based architectures, to integrate non-item pages and investigate\nthe capabilities of these models to leverage their information for next item\nprediction. We also analyze their behavior on noisy data and compare different\nitem representation strategies.\n Our results show that non-item pages are a valuable source of information,\nbut representing such a page well is the key to successfully leverage them. 
The\ninclusion of non-item pages can increase the performance for next-item\nprediction in all examined model architectures with a varying degree.\n","authors":["Elisabeth Fischer","Daniel Schlör","Albin Zehe","Andreas Hotho"],"pdf_url":"https://arxiv.org/pdf/2408.15953v1.pdf","comment":"36 pages, 19 figures; Work in Progress"},{"id":"http://arxiv.org/abs/2408.15946v1","updated":"2024-08-28T17:04:56Z","published":"2024-08-28T17:04:56Z","title":"Sigma Flows for Image and Data Labeling and Learning Structured\n Prediction","summary":" This paper introduces the sigma flow model for the prediction of structured\nlabelings of data observed on Riemannian manifolds, including Euclidean image\ndomains as special case. The approach combines the Laplace-Beltrami framework\nfor image denoising and enhancement, introduced by Sochen, Kimmel and Malladi\nabout 25 years ago, and the assignment flow approach introduced and studied by\nthe authors.\n The sigma flow arises as Riemannian gradient flow of generalized harmonic\nenergies and thus is governed by a nonlinear geometric PDE which determines a\nharmonic map from a closed Riemannian domain manifold to a statistical\nmanifold, equipped with the Fisher-Rao metric from information geometry. A\nspecific ingredient of the sigma flow is the mutual dependency of the\nRiemannian metric of the domain manifold on the evolving state. This makes the\napproach amenable to machine learning in a specific way, by realizing this\ndependency through a mapping with compact time-variant parametrization that can\nbe learned from data. Proof of concept experiments demonstrate the expressivity\nof the sigma flow model and prediction performance.\n Structural similarities to transformer network architectures and networks\ngenerated by the geometric integration of sigma flows are pointed out, which\nhighlights the connection to deep learning and, conversely, may stimulate the\nuse of geometric design principles for structured prediction in other areas of\nscientific machine learning.\n","authors":["Jonas Cassel","Bastian Boll","Stefania Petra","Peter Albers","Christoph Schnörr"],"pdf_url":"https://arxiv.org/pdf/2408.15946v1.pdf","comment":"51 pages"},{"id":"http://arxiv.org/abs/2402.09786v4","updated":"2024-08-28T16:48:06Z","published":"2024-02-15T08:34:21Z","title":"Examining Pathological Bias in a Generative Adversarial Network\n Discriminator: A Case Study on a StyleGAN3 Model","summary":" Generative adversarial networks (GANs) generate photorealistic faces that are\noften indistinguishable by humans from real faces. While biases in machine\nlearning models are often assumed to be due to biases in training data, we find\npathological internal color and luminance biases in the discriminator of a\npre-trained StyleGAN3-r model that are not explicable by the training data. We\nalso find that the discriminator systematically stratifies scores by both\nimage- and face-level qualities and that this disproportionately affects images\nacross gender, race, and other categories. We examine axes common in research\non stereotyping in social psychology.\n","authors":["Alvin Grissom II","Ryan F. 
Lei","Matt Gusdorff","Jeova Farias Sales Rocha Neto","Bailey Lin","Ryan Trotter"],"pdf_url":"https://arxiv.org/pdf/2402.09786v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15923v1","updated":"2024-08-28T16:36:18Z","published":"2024-08-28T16:36:18Z","title":"Generalized Naive Bayes","summary":" In this paper we introduce the so-called Generalized Naive Bayes structure as\nan extension of the Naive Bayes structure. We give a new greedy algorithm that\nfinds a good fitting Generalized Naive Bayes (GNB) probability distribution. We\nprove that this fits the data at least as well as the probability distribution\ndetermined by the classical Naive Bayes (NB). Then, under a not very\nrestrictive condition, we give a second algorithm for which we can prove that\nit finds the optimal GNB probability distribution, i.e. best fitting structure\nin the sense of KL divergence. Both algorithms are constructed to maximize the\ninformation content and aim to minimize redundancy. Based on these algorithms,\nnew methods for feature selection are introduced. We discuss the similarities\nand differences to other related algorithms in terms of structure, methodology,\nand complexity. Experimental results show, that the algorithms introduced\noutperform the related algorithms in many cases.\n","authors":["Edith Alice Kovács","Anna Ország","Dániel Pfeifer","András Benczúr"],"pdf_url":"https://arxiv.org/pdf/2408.15923v1.pdf","comment":"44 pages, 19 figures"},{"id":"http://arxiv.org/abs/2408.15916v1","updated":"2024-08-28T16:30:41Z","published":"2024-08-28T16:30:41Z","title":"Multi-modal Adversarial Training for Zero-Shot Voice Cloning","summary":" A text-to-speech (TTS) model trained to reconstruct speech given text tends\ntowards predictions that are close to the average characteristics of a dataset,\nfailing to model the variations that make human speech sound natural. This\nproblem is magnified for zero-shot voice cloning, a task that requires training\ndata with high variance in speaking styles. We build off of recent works which\nhave used Generative Advsarial Networks (GAN) by proposing a Transformer\nencoder-decoder architecture to conditionally discriminates between real and\ngenerated speech features. The discriminator is used in a training pipeline\nthat improves both the acoustic and prosodic features of a TTS model. We\nintroduce our novel adversarial training technique by applying it to a\nFastSpeech2 acoustic model and training on Libriheavy, a large multi-speaker\ndataset, for the task of zero-shot voice cloning. Our model achieves\nimprovements over the baseline in terms of speech quality and speaker\nsimilarity. Audio examples from our system are available online.\n","authors":["John Janiczek","Dading Chong","Dongyang Dai","Arlo Faria","Chao Wang","Tao Wang","Yuzong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15916v1.pdf","comment":"Accepted at INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2408.15905v1","updated":"2024-08-28T16:19:35Z","published":"2024-08-28T16:19:35Z","title":"MetaGFN: Exploring Distant Modes with Adapted Metadynamics for\n Continuous GFlowNets","summary":" Generative Flow Networks (GFlowNets) are a class of generative models that\nsample objects in proportion to a specified reward function through a learned\npolicy. They can be trained either on-policy or off-policy, needing a balance\nbetween exploration and exploitation for fast convergence to a target\ndistribution. 
While exploration strategies for discrete GFlowNets have been\nstudied, exploration in the continuous case remains to be investigated, despite\nthe potential for novel exploration algorithms due to the local connectedness\nof continuous domains. Here, we introduce Adapted Metadynamics, a variant of\nmetadynamics that can be applied to arbitrary black-box reward functions on\ncontinuous domains. We use Adapted Metadynamics as an exploration strategy for\ncontinuous GFlowNets. We show three continuous domains where the resulting\nalgorithm, MetaGFN, accelerates convergence to the target distribution and\ndiscovers more distant reward modes than previous off-policy exploration\nstrategies used for GFlowNets.\n","authors":["Dominic Phillips","Flaviu Cipcigan"],"pdf_url":"https://arxiv.org/pdf/2408.15905v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2408.15901v1","updated":"2024-08-28T16:12:55Z","published":"2024-08-28T16:12:55Z","title":"Nexus: Specialization meets Adaptability for Efficiently Training\n Mixture of Experts","summary":" Efficiency, specialization, and adaptability to new data distributions are\nqualities that are hard to combine in current Large Language Models. The\nMixture of Experts (MoE) architecture has been the focus of significant\nresearch because its inherent conditional computation enables such desirable\nproperties. In this work, we focus on \"upcycling\" dense expert models into an\nMoE, aiming to improve specialization while also adding the ability to adapt to\nnew tasks easily. We introduce Nexus, an enhanced MoE architecture with\nadaptive routing where the model learns to project expert embeddings from\ndomain representations. This approach allows Nexus to flexibly add new experts\nafter the initial upcycling through separately trained dense models, without\nrequiring large-scale MoE training for unseen data domains. Our experiments\nshow that Nexus achieves a relative gain of up to 2.1% over the baseline for\ninitial upcycling, and a 18.8% relative gain for extending the MoE with a new\nexpert by using limited finetuning data. This flexibility of Nexus is crucial\nto enable an open-source ecosystem where every user continuously assembles\ntheir own MoE-mix according to their needs.\n","authors":["Nikolas Gritsch","Qizhen Zhang","Acyr Locatelli","Sara Hooker","Ahmet Üstün"],"pdf_url":"https://arxiv.org/pdf/2408.15901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15898v1","updated":"2024-08-28T16:12:16Z","published":"2024-08-28T16:12:16Z","title":"Airfoil Diffusion: Denoising Diffusion Model For Conditional Airfoil\n Generation","summary":" The design of aerodynamic shapes, such as airfoils, has traditionally\nrequired significant computational resources and relied on predefined design\nparameters, which limit the potential for novel shape synthesis. In this work,\nwe introduce a data-driven methodology for airfoil generation using a diffusion\nmodel. Trained on a dataset of preexisting airfoils, our model can generate an\narbitrary number of new airfoils from random vectors, which can be conditioned\non specific aerodynamic performance metrics such as lift and drag, or geometric\ncriteria. Our results demonstrate that the diffusion model effectively produces\nairfoil shapes with realistic aerodynamic properties, offering substantial\nimprovements in efficiency, flexibility, and the potential for discovering\ninnovative airfoil designs. 
This approach significantly expands the design\nspace, facilitating the synthesis of high-performance aerodynamic shapes that\ntranscend the limitations of traditional methods.\n","authors":["Reid Graves","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2408.15898v1.pdf","comment":"12 Pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.15896v1","updated":"2024-08-28T16:06:12Z","published":"2024-08-28T16:06:12Z","title":"A New Method for Cross-Lingual-based Semantic Role Labeling","summary":" Semantic role labeling is a crucial task in natural language processing,\nenabling better comprehension of natural language. However, the lack of\nannotated data in multiple languages has posed a challenge for researchers. To\naddress this, a deep learning algorithm based on model transfer has been\nproposed. The algorithm utilizes a dataset consisting of the English portion of\nCoNLL2009 and a corpus of semantic roles in Persian. To optimize the efficiency\nof training, only ten percent of the educational data from each language is\nused. The results of the proposed model demonstrate significant improvements\ncompared to Niksirt et al.'s model. In monolingual mode, the proposed model\nachieved a 2.05 percent improvement on F1-score, while in cross-lingual mode,\nthe improvement was even more substantial, reaching 6.23 percent. Worth noting\nis that the compared model only trained two of the four stages of semantic role\nlabeling and employed golden data for the remaining two stages. This suggests\nthat the actual superiority of the proposed model surpasses the reported\nnumbers by a significant margin. The development of cross-lingual methods for\nsemantic role labeling holds promise, particularly in addressing the scarcity\nof annotated data for various languages. These advancements pave the way for\nfurther research in understanding and processing natural language across\ndifferent linguistic contexts.\n","authors":["Mohammad Ebrahimi","Behrouz Minaei Bidgoli","Nasim Khozouei"],"pdf_url":"https://arxiv.org/pdf/2408.15896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15895v1","updated":"2024-08-28T16:05:20Z","published":"2024-08-28T16:05:20Z","title":"Bias in LLMs as Annotators: The Effect of Party Cues on Labelling\n Decision by Large Language Models","summary":" Human coders are biased. We test similar biases in Large Language Models\n(LLMs) as annotators. By replicating an experiment run by Ennser-Jedenastik and\nMeyer (2018), we find evidence that LLMs use political information, and\nspecifically party cues, to judge political statements. Not only do LLMs use\nrelevant information to contextualize whether a statement is positive,\nnegative, or neutral based on the party cue, they also reflect the biases of\nthe human-generated data upon which they have been trained. We also find that\nunlike humans, who are only biased when faced with statements from extreme\nparties, LLMs exhibit significant bias even when prompted with statements from\ncenter-left and center-right parties. 
The implications of our findings are\ndiscussed in the conclusion.\n","authors":["Sebastian Vallejo Vera","Hunter Driggers"],"pdf_url":"https://arxiv.org/pdf/2408.15895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15894v1","updated":"2024-08-28T16:04:40Z","published":"2024-08-28T16:04:40Z","title":"The Role of Fibration Symmetries in Geometric Deep Learning","summary":" Geometric Deep Learning (GDL) unifies a broad class of machine learning\ntechniques from the perspectives of symmetries, offering a framework for\nintroducing problem-specific inductive biases like Graph Neural Networks\n(GNNs). However, the current formulation of GDL is limited to global symmetries\nthat are not often found in real-world problems. We propose to relax GDL to\nallow for local symmetries, specifically fibration symmetries in graphs, to\nleverage regularities of realistic instances. We show that GNNs apply the\ninductive bias of fibration symmetries and derive a tighter upper bound for\ntheir expressive power. Additionally, by identifying symmetries in networks, we\ncollapse network nodes, thereby increasing their computational efficiency\nduring both inference and training of deep neural networks. The mathematical\nextension introduced here applies beyond graphs to manifolds, bundles, and\ngrids for the development of models with inductive biases induced by local\nsymmetries that can lead to better generalization.\n","authors":["Osvaldo Velarde","Lucas Parra","Paolo Boldi","Hernan Makse"],"pdf_url":"https://arxiv.org/pdf/2408.15894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14780v2","updated":"2024-08-28T15:48:31Z","published":"2024-08-27T04:57:53Z","title":"GINN-KAN: Interpretability pipelining with applications in Physics\n Informed Neural Networks","summary":" Neural networks are powerful function approximators, yet their ``black-box\"\nnature often renders them opaque and difficult to interpret. While many\npost-hoc explanation methods exist, they typically fail to capture the\nunderlying reasoning processes of the networks. A truly interpretable neural\nnetwork would be trained similarly to conventional models using techniques such\nas backpropagation, but additionally provide insights into the learned\ninput-output relationships. In this work, we introduce the concept of\ninterpretability pipelineing, to incorporate multiple interpretability\ntechniques to outperform each individual technique. To this end, we first\nevaluate several architectures that promise such interpretability, with a\nparticular focus on two recent models selected for their potential to\nincorporate interpretability into standard neural network architectures while\nstill leveraging backpropagation: the Growing Interpretable Neural Network\n(GINN) and Kolmogorov Arnold Networks (KAN). We analyze the limitations and\nstrengths of each and introduce a novel interpretable neural network GINN-KAN\nthat synthesizes the advantages of both models. When tested on the Feynman\nsymbolic regression benchmark datasets, GINN-KAN outperforms both GINN and KAN.\nTo highlight the capabilities and the generalizability of this approach, we\nposition GINN-KAN as an alternative to conventional black-box networks in\nPhysics-Informed Neural Networks (PINNs). We expect this to have far-reaching\nimplications in the application of deep learning pipelines in the natural\nsciences. 
Our experiments with this interpretable PINN on 15 different partial\ndifferential equations demonstrate that GINN-KAN augmented PINNs outperform\nPINNs with black-box networks in solving differential equations and surpass the\ncapabilities of both GINN and KAN.\n","authors":["Nisal Ranasinghe","Yu Xia","Sachith Seneviratne","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2408.14780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04600v2","updated":"2024-08-28T15:46:39Z","published":"2023-11-08T11:02:51Z","title":"A Deep Learning Based Resource Allocator for Communication Systems with\n Dynamic User Utility Demands","summary":" Deep learning (DL) based resource allocation (RA) has recently gained\nsignificant attention due to its performance efficiency. However, most related\nstudies assume an ideal case where the number of users and their utility\ndemands, e.g., data rate constraints, are fixed, and the designed DL-based RA\nscheme exploits a policy trained only for these fixed parameters. Consequently,\ncomputationally complex policy retraining is required whenever these parameters\nchange. In this paper, we introduce a DL-based resource allocator (ALCOR) that\nallows users to adjust their utility demands freely, such as based on their\napplication layer requirements. ALCOR employs deep neural networks (DNNs) as\nthe policy in a time-sharing problem. The underlying optimization algorithm\niteratively optimizes the on-off status of users to satisfy their utility\ndemands in expectation. The policy performs unconstrained RA (URA)--RA without\nconsidering user utility demands--among active users to maximize the sum\nutility (SU) at each time instant. Depending on the chosen URA scheme, ALCOR\ncan perform RA in either a centralized or distributed scenario. Derived\nconvergence analyses provide guarantees for ALCOR's convergence, and numerical\nexperiments corroborate its effectiveness.\n","authors":["Pourya Behmandpoor","Mark Eisen","Panagiotis Patrinos","Marc Moonen"],"pdf_url":"https://arxiv.org/pdf/2311.04600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15874v1","updated":"2024-08-28T15:44:34Z","published":"2024-08-28T15:44:34Z","title":"Robust Statistical Scaling of Outlier Scores: Improving the Quality of\n Outlier Probabilities for Outliers (Extended Version)","summary":" Outlier detection algorithms typically assign an outlier score to each\nobservation in a dataset, indicating the degree to which an observation is an\noutlier. However, these scores are often not comparable across algorithms and\ncan be difficult for humans to interpret. Statistical scaling addresses this\nproblem by transforming outlier scores into outlier probabilities without using\nground-truth labels, thereby improving interpretability and comparability\nacross algorithms. However, the quality of this transformation can be different\nfor outliers and inliers. Missing outliers in scenarios where they are of\nparticular interest - such as healthcare, finance, or engineering - can be\ncostly or dangerous. Thus, ensuring good probabilities for outliers is\nessential. This paper argues that statistical scaling, as commonly used in the\nliterature, does not produce equally good probabilities for outliers as for\ninliers. Therefore, we propose robust statistical scaling, which uses robust\nestimators to improve the probabilities for outliers. 
We evaluate several\nvariants of our method against other outlier score transformations for\nreal-world datasets and outlier detection algorithms, where it can improve the\nprobabilities for outliers.\n","authors":["Philipp Röchner","Henrique O. Marques","Ricardo J. G. B. Campello","Arthur Zimek","Franz Rothlauf"],"pdf_url":"https://arxiv.org/pdf/2408.15874v1.pdf","comment":"15 pages, 4 figures, accepted for publication in SISAP 2024"},{"id":"http://arxiv.org/abs/2403.05645v3","updated":"2024-08-28T15:39:45Z","published":"2024-03-08T19:36:20Z","title":"Geometric Neural Network based on Phase Space for BCI-EEG decoding","summary":" Objective: The integration of Deep Learning (DL) algorithms on brain signal\nanalysis is still in its nascent stages compared to their success in fields\nlike Computer Vision. This is particularly true for BCI, where the brain\nactivity is decoded to control external devices without requiring muscle\ncontrol. Electroencephalography (EEG) is a widely adopted choice for designing\nBCI systems due to its non-invasive and cost-effective nature and excellent\ntemporal resolution. Still, it comes at the expense of limited training data,\npoor signal-to-noise, and a large variability across and within-subject\nrecordings. Finally, setting up a BCI system with many electrodes takes a long\ntime, hindering the widespread adoption of reliable DL architectures in BCIs\noutside research laboratories. To improve adoption, we need to improve user\ncomfort using, for instance, reliable algorithms that operate with few\nelectrodes. Approach: Our research aims to develop a DL algorithm that delivers\neffective results with a limited number of electrodes. Taking advantage of the\nAugmented Covariance Method and the framework of SPDNet, we propose the\nPhase-SPDNet architecture and analyze its performance and the interpretability\nof the results. The evaluation is conducted on 5-fold cross-validation, using\nonly three electrodes positioned above the Motor Cortex. The methodology was\ntested on nearly 100 subjects from several open-source datasets using the\nMother Of All BCI Benchmark (MOABB) framework. Main results: The results of our\nPhase-SPDNet demonstrate that the augmented approach combined with the SPDNet\nsignificantly outperforms all the current state-of-the-art DL architecture in\nMI decoding. Significance: This new architecture is explainable and with a low\nnumber of trainable parameters.\n","authors":["Igor Carrara","Bruno Aristimunha","Marie-Constance Corsi","Raphael Y. de Camargo","Sylvain Chevallier","Théodore Papadopoulo"],"pdf_url":"https://arxiv.org/pdf/2403.05645v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10734v2","updated":"2024-08-28T15:36:08Z","published":"2024-07-15T14:01:34Z","title":"On-Device Training of Fully Quantized Deep Neural Networks on Cortex-M\n Microcontrollers","summary":" On-device training of DNNs allows models to adapt and fine-tune to newly\ncollected data or changing domains while deployed on microcontroller units\n(MCUs). However, DNN training is a resource-intensive task, making the\nimplementation and execution of DNN training algorithms on MCUs challenging due\nto low processor speeds, constrained throughput, limited floating-point\nsupport, and memory constraints. In this work, we explore on-device training of\nDNNs for Cortex-M MCUs. We present a method that enables efficient training of\nDNNs completely in place on the MCU using fully quantized training (FQT) and\ndynamic partial gradient updates. 
We demonstrate the feasibility of our\napproach on multiple vision and time-series datasets and provide insights into\nthe tradeoff between training accuracy, memory overhead, energy, and latency on\nreal hardware.\n","authors":["Mark Deutel","Frank Hannig","Christopher Mutschler","Jürgen Teich"],"pdf_url":"https://arxiv.org/pdf/2407.10734v2.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15866v1","updated":"2024-08-28T15:33:47Z","published":"2024-08-28T15:33:47Z","title":"Retrieval-Augmented Instruction Tuning for Automated Process Engineering\n Calculations : A Tool-Chaining Problem-Solving Framework with Attributable\n Reflection","summary":" The current technology landscape lacks a foundational AI model for solving\nprocess engineering calculations. In this work, we introduce a novel autonomous\nagent framework leveraging Retrieval-Augmented Instruction-Tuning (RAIT) to\nenhance open, customizable small code language models (SLMs) for these\ncalculations. By combining instruction tuned code SLMs with Retrieval-Augmented\nCode Generation (RACG) using external tools, the agent generates, debugs, and\noptimizes code from natural language specifications. Our approach addresses the\nlimitations of the current lack of a foundational AI model for specialized\nprocess engineering tasks and offers benefits of explainability, knowledge\nediting, and cost-effectiveness. Additionally, we curate custom datasets of\nchemical and process engineering problems and solutions to overcome data\nscarcity. Experimental results show that our framework matches the performance\nof large-scale proprietary models on benchmark datasets, proving its\neffectiveness and usability.\n","authors":["Sagar Srinivas Sakhinana","Geethan Sannidhi","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.15866v1.pdf","comment":"Accepted for publication at ML4CCE workshop at ECML PKDD 2024. Please\n find the link: https://ml4cce-ecml.com/#agenda"},{"id":"http://arxiv.org/abs/2408.15865v1","updated":"2024-08-28T15:29:27Z","published":"2024-08-28T15:29:27Z","title":"microYOLO: Towards Single-Shot Object Detection on Microcontrollers","summary":" This work-in-progress paper presents results on the feasibility of\nsingle-shot object detection on microcontrollers using YOLO. Single-shot object\ndetectors like YOLO are widely used, however due to their complexity mainly on\nlarger GPU-based platforms. We present microYOLO, which can be used on Cortex-M\nbased microcontrollers, such as the OpenMV H7 R2, achieving about 3.5 FPS when\nclassifying 128x128 RGB images while using less than 800 KB Flash and less than\n350 KB RAM. Furthermore, we share experimental results for three different\nobject detection tasks, analyzing the accuracy of microYOLO on them.\n","authors":["Mark Deutel","Christopher Mutschler","Jürgen Teich"],"pdf_url":"https://arxiv.org/pdf/2408.15865v1.pdf","comment":"Published at the ECML PKDD Conference 2023, at the 4th Workshop on\n IoT, Edge, and Mobile for Embedded Machine Learning"},{"id":"http://arxiv.org/abs/2310.10835v3","updated":"2024-08-28T15:29:17Z","published":"2023-10-16T21:17:29Z","title":"Provable Probabilistic Imaging using Score-Based Generative Priors","summary":" Estimating high-quality images while also quantifying their uncertainty are\ntwo desired features in an image reconstruction algorithm for solving ill-posed\ninverse problems. 
In this paper, we propose plug-and-play Monte Carlo (PMC) as\na principled framework for characterizing the space of possible solutions to a\ngeneral inverse problem. PMC is able to incorporate expressive score-based\ngenerative priors for high-quality image reconstruction while also performing\nuncertainty quantification via posterior sampling. In particular, we develop\ntwo PMC algorithms that can be viewed as the sampling analogues of the\ntraditional plug-and-play priors (PnP) and regularization by denoising (RED)\nalgorithms. To improve the sampling efficiency, we introduce weighted annealing\ninto these PMC algorithms, further developing two additional annealed PMC\nalgorithms (APMC). We establish a theoretical analysis for characterizing the\nconvergence behavior of PMC algorithms. Our analysis provides non-asymptotic\nstationarity guarantees in terms of the Fisher information, fully compatible\nwith the joint presence of weighted annealing, potentially non-log-concave\nlikelihoods, and imperfect score networks. We demonstrate the performance of\nthe PMC algorithms on multiple representative inverse problems with both linear\nand nonlinear forward models. Experimental results show that PMC significantly\nimproves reconstruction quality and enables high-fidelity uncertainty\nquantification.\n","authors":["Yu Sun","Zihui Wu","Yifan Chen","Berthy T. Feng","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2310.10835v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15861v1","updated":"2024-08-28T15:21:10Z","published":"2024-08-28T15:21:10Z","title":"Fusing Pruned and Backdoored Models: Optimal Transport-based Data-free\n Backdoor Mitigation","summary":" Backdoor attacks present a serious security threat to deep neuron networks\n(DNNs). Although numerous effective defense techniques have been proposed in\nrecent years, they inevitably rely on the availability of either clean or\npoisoned data. In contrast, data-free defense techniques have evolved slowly\nand still lag significantly in performance. To address this issue, different\nfrom the traditional approach of pruning followed by fine-tuning, we propose a\nnovel data-free defense method named Optimal Transport-based Backdoor Repairing\n(OTBR) in this work. This method, based on our findings on neuron weight\nchanges (NWCs) of random unlearning, uses optimal transport (OT)-based model\nfusion to combine the advantages of both pruned and backdoored models.\nSpecifically, we first demonstrate our findings that the NWCs of random\nunlearning are positively correlated with those of poison unlearning. Based on\nthis observation, we propose a random-unlearning NWC pruning technique to\neliminate the backdoor effect and obtain a backdoor-free pruned model. Then,\nmotivated by the OT-based model fusion, we propose the pruned-to-backdoored\nOT-based fusion technique, which fuses pruned and backdoored models to combine\nthe advantages of both, resulting in a model that demonstrates high clean\naccuracy and a low attack success rate. To our knowledge, this is the first\nwork to apply OT and model fusion techniques to backdoor defense. Extensive\nexperiments show that our method successfully defends against all seven\nbackdoor attacks across three benchmark datasets, outperforming both\nstate-of-the-art (SOTA) data-free and data-dependent methods. 
The code\nimplementation and Appendix are provided in the Supplementary Material.\n","authors":["Weilin Lin","Li Liu","Jianze Li","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.15861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16653v3","updated":"2024-08-28T15:17:44Z","published":"2022-11-30T00:47:03Z","title":"Correlation recurrent units: A novel neural architecture for improving\n the predictive performance of time-series data","summary":" The time-series forecasting (TSF) problem is a traditional problem in the\nfield of artificial intelligence. Models such as Recurrent Neural Network\n(RNN), Long Short Term Memory (LSTM), and GRU (Gate Recurrent Units) have\ncontributed to improving the predictive accuracy of TSF. Furthermore, model\nstructures have been proposed to combine time-series decomposition methods,\nsuch as seasonal-trend decomposition using Loess (STL) to ensure improved\npredictive accuracy. However, because this approach is learned in an\nindependent model for each component, it cannot learn the relationships between\ntime-series components. In this study, we propose a new neural architecture\ncalled a correlation recurrent unit (CRU) that can perform time series\ndecomposition within a neural cell and learn correlations (autocorrelation and\ncorrelation) between each decomposition component. The proposed neural\narchitecture was evaluated through comparative experiments with previous\nstudies using five univariate time-series datasets and four multivariate\ntime-series data. The results showed that long- and short-term predictive\nperformance was improved by more than 10%. The experimental results show that\nthe proposed CRU is an excellent method for TSF problems compared to other\nneural architectures.\n","authors":["Sunghyun Sim","Dohee Kim","Hyerim Bae"],"pdf_url":"https://arxiv.org/pdf/2211.16653v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15852v1","updated":"2024-08-28T15:14:58Z","published":"2024-08-28T15:14:58Z","title":"chemtrain: Learning Deep Potential Models via Automatic Differentiation\n and Statistical Physics","summary":" Neural Networks (NNs) are promising models for refining the accuracy of\nmolecular dynamics, potentially opening up new fields of application. Typically\ntrained bottom-up, atomistic NN potential models can reach first-principle\naccuracy, while coarse-grained implicit solvent NN potentials surpass classical\ncontinuum solvent models. However, overcoming the limitations of costly\ngeneration of accurate reference data and data inefficiency of common bottom-up\ntraining demands efficient incorporation of data from many sources. This paper\nintroduces the framework chemtrain to learn sophisticated NN potential models\nthrough customizable training routines and advanced training algorithms. These\nroutines can combine multiple top-down and bottom-up algorithms, e.g., to\nincorporate both experimental and simulation data or pre-train potentials with\nless costly algorithms. chemtrain provides an object-oriented high-level\ninterface to simplify the creation of custom routines. On the lower level,\nchemtrain relies on JAX to compute gradients and scale the computations to use\navailable resources. 
We demonstrate the simplicity and importance of combining\nmultiple algorithms in the examples of parametrizing an all-atomistic model of\ntitanium and a coarse-grained implicit solvent model of alanine dipeptide.\n","authors":["Paul Fuchs","Stephan Thaler","Sebastien Röcken","Julija Zavadlav"],"pdf_url":"https://arxiv.org/pdf/2408.15852v1.pdf","comment":"Package source code published at http://github.com/tummfm/chemtrain"},{"id":"http://arxiv.org/abs/2404.07839v2","updated":"2024-08-28T15:05:42Z","published":"2024-04-11T15:27:22Z","title":"RecurrentGemma: Moving Past Transformers for Efficient Open Language\n Models","summary":" We introduce RecurrentGemma, a family of open language models which uses\nGoogle's novel Griffin architecture. Griffin combines linear recurrences with\nlocal attention to achieve excellent performance on language. It has a\nfixed-sized state, which reduces memory use and enables efficient inference on\nlong sequences. We provide two sizes of models, containing 2B and 9B\nparameters, and provide pre-trained and instruction tuned variants for both.\nOur models achieve comparable performance to similarly-sized Gemma baselines\ndespite being trained on fewer tokens.\n","authors":["Aleksandar Botev","Soham De","Samuel L Smith","Anushan Fernando","George-Cristian Muraru","Ruba Haroun","Leonard Berrada","Razvan Pascanu","Pier Giuseppe Sessa","Robert Dadashi","Léonard Hussenot","Johan Ferret","Sertan Girgin","Olivier Bachem","Alek Andreev","Kathleen Kenealy","Thomas Mesnard","Cassidy Hardin","Surya Bhupatiraju","Shreya Pathak","Laurent Sifre","Morgane Rivière","Mihir Sanjay Kale","Juliette Love","Pouya Tafti","Armand Joulin","Noah Fiedel","Evan Senter","Yutian Chen","Srivatsan Srinivasan","Guillaume Desjardins","David Budden","Arnaud Doucet","Sharad Vikram","Adam Paszke","Trevor Gale","Sebastian Borgeaud","Charlie Chen","Andy Brock","Antonia Paterson","Jenny Brennan","Meg Risdal","Raj Gundluru","Nesh Devanathan","Paul Mooney","Nilay Chauhan","Phil Culliton","Luiz Gustavo Martins","Elisa Bandy","David Huntsperger","Glenn Cameron","Arthur Zucker","Tris Warkentin","Ludovic Peran","Minh Giang","Zoubin Ghahramani","Clément Farabet","Koray Kavukcuoglu","Demis Hassabis","Raia Hadsell","Yee Whye Teh","Nando de Frietas"],"pdf_url":"https://arxiv.org/pdf/2404.07839v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01245v2","updated":"2024-08-28T15:01:04Z","published":"2024-04-01T17:03:41Z","title":"A Statistical Framework of Watermarks for Large Language Models: Pivot,\n Detection Efficiency and Optimal Rules","summary":" Since ChatGPT was introduced in November 2022, embedding (nearly)\nunnoticeable statistical signals into text generated by large language models\n(LLMs), also known as watermarking, has been used as a principled approach to\nprovable detection of LLM-generated text from its human-written counterpart. In\nthis paper, we introduce a general and flexible framework for reasoning about\nthe statistical efficiency of watermarks and designing powerful detection\nrules. Inspired by the hypothesis testing formulation of watermark detection,\nour framework starts by selecting a pivotal statistic of the text and a secret\nkey -- provided by the LLM to the verifier -- to enable controlling the false\npositive rate (the error of mistakenly detecting human-written text as\nLLM-generated). 
Next, this framework allows one to evaluate the power of\nwatermark detection rules by obtaining a closed-form expression of the\nasymptotic false negative rate (the error of incorrectly classifying\nLLM-generated text as human-written). Our framework further reduces the problem\nof determining the optimal detection rule to solving a minimax optimization\nprogram. We apply this framework to two representative watermarks -- one of\nwhich has been internally implemented at OpenAI -- and obtain several findings\nthat can be instrumental in guiding the practice of implementing watermarks. In\nparticular, we derive optimal detection rules for these watermarks under our\nframework. These theoretically derived detection rules are demonstrated to be\ncompetitive and sometimes enjoy a higher power than existing detection\napproaches through numerical experiments.\n","authors":["Xiang Li","Feng Ruan","Huiyuan Wang","Qi Long","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2404.01245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15641v2","updated":"2024-08-28T15:00:32Z","published":"2023-10-24T08:59:40Z","title":"Guaranteed Coverage Prediction Intervals with Gaussian Process\n Regression","summary":" Gaussian Process Regression (GPR) is a popular regression method, which\nunlike most Machine Learning techniques, provides estimates of uncertainty for\nits predictions. These uncertainty estimates however, are based on the\nassumption that the model is well-specified, an assumption that is violated in\nmost practical applications, since the required knowledge is rarely available.\nAs a result, the produced uncertainty estimates can become very misleading; for\nexample the prediction intervals (PIs) produced for the 95% confidence level\nmay cover much less than 95% of the true labels. To address this issue, this\npaper introduces an extension of GPR based on a Machine Learning framework\ncalled, Conformal Prediction (CP). This extension guarantees the production of\nPIs with the required coverage even when the model is completely misspecified.\nThe proposed approach combines the advantages of GPR with the valid coverage\nguarantee of CP, while the performed experimental results demonstrate its\nsuperiority over existing methods.\n","authors":["Harris Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2310.15641v2.pdf","comment":"12 pages. This article has been accepted for publication in IEEE\n Transactions on Pattern Analysis and Machine Intelligence. This is the\n author's version which has not been fully edited and content may change prior\n to final publication. Citation information: DOI 10.1109/TPAMI.2024.3418214"},{"id":"http://arxiv.org/abs/2307.08220v2","updated":"2024-08-28T14:47:24Z","published":"2023-07-17T03:45:00Z","title":"FRANC: A Lightweight Framework for High-Quality Code Generation","summary":" In recent years, the use of automated source code generation utilizing\ntransformer-based generative models has expanded, and these models can generate\nfunctional code according to the requirements of the developers. However,\nrecent research revealed that these automatically generated source codes can\ncontain vulnerabilities and other quality issues. 
Despite researchers' and\npractitioners' attempts to enhance code generation models, retraining and\nfine-tuning large language models is time-consuming and resource-intensive.\nThus, we describe FRANC, a lightweight framework for recommending more secure\nand high-quality source code derived from transformer-based code generation\nmodels. FRANC includes a static filter to make the generated code compilable\nwith heuristics and a quality-aware ranker to sort the code snippets based on a\nquality score. Moreover, the framework uses prompt engineering to fix\npersistent quality issues. We evaluated the framework with five Python and Java\ncode generation models and six prompt datasets, including a newly created one\nin this work (SOEval). The static filter improves 9% to 46% Java suggestions\nand 10% to 43% Python suggestions regarding compilability. The average\nimprovement over the NDCG@10 score for the ranking system is 0.0763, and the\nrepairing techniques repair the highest 80% of prompts. FRANC takes, on\naverage, 1.98 seconds for Java; for Python, it takes 0.08 seconds.\n","authors":["Mohammed Latif Siddiq","Beatrice Casey","Joanna C. S. Santos"],"pdf_url":"https://arxiv.org/pdf/2307.08220v2.pdf","comment":"Accepted at the 24th IEEE International Conference on Source Code\n Analysis and Manipulation (SCAM 2024)"},{"id":"http://arxiv.org/abs/2408.15827v1","updated":"2024-08-28T14:40:15Z","published":"2024-08-28T14:40:15Z","title":"Automatic Differential Diagnosis using Transformer-Based Multi-Label\n Sequence Classification","summary":" As the field of artificial intelligence progresses, assistive technologies\nare becoming more widely used across all industries. The healthcare industry is\nno different, with numerous studies being done to develop assistive tools for\nhealthcare professionals. Automatic diagnostic systems are one such beneficial\ntool that can assist with a variety of tasks, including collecting patient\ninformation, analyzing test results, and diagnosing patients. However, the idea\nof developing systems that can provide a differential diagnosis has been\nlargely overlooked in most of these research studies. In this study, we propose\na transformer-based approach for providing differential diagnoses based on a\npatient's age, sex, medical history, and symptoms. We use the DDXPlus dataset,\nwhich provides differential diagnosis information for patients based on 49\ndisease types. Firstly, we propose a method to process the tabular patient data\nfrom the dataset and engineer them into patient reports to make them suitable\nfor our research. In addition, we introduce two data modification modules to\ndiversify the training data and consequently improve the robustness of the\nmodels. We approach the task as a multi-label classification problem and\nconduct extensive experiments using four transformer models. All the models\ndisplayed promising results by achieving over 97% F1 score on the held-out test\nset. Moreover, we design additional behavioral tests to get a broader\nunderstanding of the models. In particular, for one of our test cases, we\nprepared a custom test set of 100 samples with the assistance of a doctor. The\nresults on the custom set showed that our proposed data modification modules\nimproved the model's generalization capabilities. 
We hope our findings will\nprovide future researchers with valuable insights and inspire them to develop\nreliable systems for automatic differential diagnosis.\n","authors":["Abu Adnan Sadi","Mohammad Ashrafuzzaman Khan","Lubaba Binte Saber"],"pdf_url":"https://arxiv.org/pdf/2408.15827v1.pdf","comment":"25 pages, 7 figures"},{"id":"http://arxiv.org/abs/2404.10155v2","updated":"2024-08-28T14:38:51Z","published":"2024-04-15T22:02:58Z","title":"The Fault in our Stars: Quality Assessment of Code Generation Benchmarks","summary":" Large Language Models (LLMs) are gaining popularity among software engineers.\nA crucial aspect of developing effective code generation LLMs is to evaluate\nthese models using a robust benchmark. Evaluation benchmarks with quality\nissues can provide a false sense of performance. In this work, we conduct the\nfirst-of-its-kind study of the quality of prompts within benchmarks used to\ncompare the performance of different code generation models. To conduct this\nstudy, we analyzed 3,566 prompts from 9 code generation benchmarks to identify\nquality issues in them. We also investigated whether fixing the identified\nquality issues in the benchmarks' prompts affects a model's performance. We\nalso studied memorization issues of the evaluation dataset, which can put into\nquestion a benchmark's trustworthiness. We found that code generation\nevaluation benchmarks mainly focused on Python and coding exercises and had\nvery limited contextual dependencies to challenge the model. These datasets and\nthe developers' prompts suffer from quality issues like spelling and\ngrammatical errors, unclear sentences to express developers' intent, and not\nusing proper documentation style. Fixing all these issues in the benchmarks can\nlead to a better performance for Python code generation, but not a significant\nimprovement was observed for Java code generation. We also found evidence that\nGPT-3.5-Turbo and CodeGen-2.5 models may have data contamination issues.\n","authors":["Mohammed Latif Siddiq","Simantika Dristi","Joy Saha","Joanna C. S. Santos"],"pdf_url":"https://arxiv.org/pdf/2404.10155v2.pdf","comment":"Accepted at the 24th IEEE International Conference on Source Code\n Analysis and Manipulation(SCAM 2024)"},{"id":"http://arxiv.org/abs/2408.15819v1","updated":"2024-08-28T14:32:24Z","published":"2024-08-28T14:32:24Z","title":"Automated Mixture Analysis via Structural Evaluation","summary":" The determination of chemical mixture components is vital to a multitude of\nscientific fields. Oftentimes spectroscopic methods are employed to decipher\nthe composition of these mixtures. However, the sheer density of spectral\nfeatures present in spectroscopic databases can make unambiguous assignment to\nindividual species challenging. Yet, components of a mixture are commonly\nchemically related due to environmental processes or shared precursor\nmolecules. Therefore, analysis of the chemical relevance of a molecule is\nimportant when determining which species are present in a mixture. In this\npaper, we combine machine-learning molecular embedding methods with a\ngraph-based ranking system to determine the likelihood of a molecule being\npresent in a mixture based on the other known species and/or chemical priors.\nBy incorporating this metric in a rotational spectroscopy mixture analysis\nalgorithm, we demonstrate that the mixture components can be identified with\nextremely high accuracy (>97%) in an efficient manner.\n","authors":["Zachary T. P. Fried","Brett A. 
McGuire"],"pdf_url":"https://arxiv.org/pdf/2408.15819v1.pdf","comment":"Accepted for publication in The Journal of Physical Chemistry A"},{"id":"http://arxiv.org/abs/2408.14511v2","updated":"2024-08-28T14:13:41Z","published":"2024-08-25T04:07:18Z","title":"Unveiling the Statistical Foundations of Chain-of-Thought Prompting\n Methods","summary":" Chain-of-Thought (CoT) prompting and its variants have gained popularity as\neffective methods for solving multi-step reasoning problems using pretrained\nlarge language models (LLMs). In this work, we analyze CoT prompting from a\nstatistical estimation perspective, providing a comprehensive characterization\nof its sample complexity. To this end, we introduce a multi-step latent\nvariable model that encapsulates the reasoning process, where the latent\nvariable encodes the task information. Under this framework, we demonstrate\nthat when the pretraining dataset is sufficiently large, the estimator formed\nby CoT prompting is equivalent to a Bayesian estimator. This estimator\neffectively solves the multi-step reasoning problem by aggregating a posterior\ndistribution inferred from the demonstration examples in the prompt. Moreover,\nwe prove that the statistical error of the CoT estimator can be decomposed into\ntwo main components: (i) a prompting error, which arises from inferring the\ntrue task using CoT prompts, and (ii) the statistical error of the pretrained\nLLM. We establish that, under appropriate assumptions, the prompting error\ndecays exponentially to zero as the number of demonstrations increases.\nAdditionally, we explicitly characterize the approximation and generalization\nerrors of the pretrained LLM. Notably, we construct a transformer model that\napproximates the target distribution of the multi-step reasoning problem with\nan error that decreases exponentially in the number of transformer blocks. Our\nanalysis extends to other variants of CoT, including Self-Consistent CoT,\nTree-of-Thought, and Selection-Inference, offering a broad perspective on the\nefficacy of these methods. We also provide numerical experiments to validate\nthe theoretical findings.\n","authors":["Xinyang Hu","Fengzhuo Zhang","Siyu Chen","Zhuoran Yang"],"pdf_url":"https://arxiv.org/pdf/2408.14511v2.pdf","comment":"150 pages, 18 figures, 3 tables"},{"id":"http://arxiv.org/abs/2404.18531v2","updated":"2024-08-28T14:12:22Z","published":"2024-04-29T09:17:36Z","title":"A Framework to Model ML Engineering Processes","summary":" The development of Machine Learning (ML) based systems is complex and\nrequires multidisciplinary teams with diverse skill sets. This may lead to\ncommunication issues or misapplication of best practices. Process models can\nalleviate these challenges by standardizing task orchestration, providing a\ncommon language to facilitate communication, and nurturing a collaborative\nenvironment. Unfortunately, current process modeling languages are not suitable\nfor describing the development of such systems. In this paper, we introduce a\nframework for modeling ML-based software development processes, built around a\ndomain-specific language and derived from an analysis of scientific and gray\nliterature. A supporting toolkit is also available.\n","authors":["Sergio Morales","Robert Clarisó","Jordi Cabot"],"pdf_url":"https://arxiv.org/pdf/2404.18531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14846v4","updated":"2024-08-28T14:04:05Z","published":"2024-02-19T14:53:01Z","title":"Stick to your Role! 
Stability of Personal Values Expressed in Large\n Language Models","summary":" The standard way to study Large Language Models (LLMs) with benchmarks or\npsychology questionnaires is to provide many different queries from similar\nminimal contexts (e.g. multiple choice questions). However, due to LLMs' highly\ncontext-dependent nature, conclusions from such minimal-context evaluations may\nbe little informative about the model's behavior in deployment (where it will\nbe exposed to many new contexts). We argue that context-dependence\n(specifically, value stability) should be studied as a specific property of\nLLMs and used as another dimension of LLM comparison (alongside others such as\ncognitive abilities, knowledge, or model size). We present a case-study on the\nstability of value expression over different contexts (simulated conversations\non different topics) as measured using a standard psychology questionnaire\n(PVQ) and on behavioral downstream tasks. Reusing methods from psychology, we\nstudy Rank-order stability on the population (interpersonal) level, and\nIpsative stability on the individual (intrapersonal) level. We consider two\nsettings (with and without instructing LLMs to simulate particular personas),\ntwo simulated populations, and three downstream tasks. We observe consistent\ntrends in the stability of models and model families - Mixtral, Mistral,\nGPT-3.5 and Qwen families are more stable than LLaMa-2 and Phi. The consistency\nof these trends implies that some models exhibit higher value stability than\nothers, and that stability can be estimated with the set of introduced\nmethodological tools. When instructed to simulate particular personas, LLMs\nexhibit low Rank-order stability, which further diminishes with conversation\nlength. This highlights the need for future research on LLMs that coherently\nsimulate different personas. This paper provides a foundational step in that\ndirection, and, to our knowledge, it is the first study of value stability in\nLLMs.\n","authors":["Grgur Kovač","Rémy Portelas","Masataka Sawayama","Peter Ford Dominey","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2402.14846v4.pdf","comment":"The project website and code are available at\n https://sites.google.com/view/llmvaluestability Published in PLOS ONE (\n https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0309114 ),\n and a shorter version at CogSci 24 (\n https://escholarship.org/uc/item/7w4823c6 )"},{"id":"http://arxiv.org/abs/2405.12390v2","updated":"2024-08-28T13:48:07Z","published":"2024-05-20T21:50:19Z","title":"A Metric-based Principal Curve Approach for Learning One-dimensional\n Manifold","summary":" Principal curve is a well-known statistical method oriented in manifold\nlearning using concepts from differential geometry. In this paper, we propose a\nnovel metric-based principal curve (MPC) method that learns one-dimensional\nmanifold of spatial data. 
Synthetic datasets and real applications using the MNIST\ndataset show that our method can learn the one-dimensional manifold well in\nterms of the shape.\n","authors":["Elvis Han Cui"],"pdf_url":"https://arxiv.org/pdf/2405.12390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15793v1","updated":"2024-08-28T13:37:07Z","published":"2024-08-28T13:37:07Z","title":"Language Adaptation on a Tight Academic Compute Budget: Tokenizer\n Swapping Works and Pure bfloat16 Is Enough","summary":" We investigate continued pretraining of LLMs for language adaptation on a\ntight academic budget: a setting in which only a few GPUs can be used in\nparallel, for a heavily constrained duration. We focus on adapting Mistral-7B\nto German or Arabic and evaluate several techniques to improve efficiency and\neffectiveness in this setting. Our German models adapted on this tight compute\nbudget underperform compared to the base Mistral-7B, while our Arabic models\noutperform several baselines, showing that for sufficiently well-represented\nlanguages, continued pretraining for specialization is not always helpful. Our\nmain findings focus on training precision and tokenizer swapping. Our results\nshow that pure bfloat16 training is a viable alternative to mixed-precision\ntraining, while being much faster when only using a few GPUs. Swapping the\ntokenizer for a specialized one yields more efficient tokenization and is\ncompetitive with the original tokenizer, which already contains some German\ntokens, but did not significantly increase performance for German. Code and\nmodel weights are available on GitHub.\n","authors":["Konstantin Dobler","Gerard de Melo"],"pdf_url":"https://arxiv.org/pdf/2408.15793v1.pdf","comment":"WANT@ICML 2024"},{"id":"http://arxiv.org/abs/2408.15792v1","updated":"2024-08-28T13:35:54Z","published":"2024-08-28T13:35:54Z","title":"Efficient LLM Scheduling by Learning to Rank","summary":" In Large Language Model (LLM) inference, the output length of an LLM request\nis typically regarded as not known a priori. Consequently, most LLM serving\nsystems employ a simple First-come-first-serve (FCFS) scheduling strategy,\nleading to Head-Of-Line (HOL) blocking and reduced throughput and service\nquality. In this paper, we reexamine this assumption -- we show that, although\npredicting the exact generation length of each request is infeasible, it is\npossible to predict the relative ranks of output lengths in a batch of\nrequests, using learning to rank. The ranking information offers valuable\nguidance for scheduling requests. Building on this insight, we develop a novel\nscheduler for LLM inference and serving that can approximate the\nshortest-job-first (SJF) schedule better than existing approaches. We integrate\nthis scheduler with the state-of-the-art LLM serving system and show\nsignificant performance improvement in several important applications: 2.8x\nlower latency in chatbot serving and 6.5x higher throughput in synthetic data\ngeneration. 
Our code is available at https://github.com/hao-ai-lab/vllm-ltr.git\n","authors":["Yichao Fu","Siqi Zhu","Runlong Su","Aurick Qiao","Ion Stoica","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.03469v2","updated":"2024-08-28T13:34:13Z","published":"2022-06-07T17:40:51Z","title":"Marked Neural Spatio-Temporal Point Process Involving a Dynamic Graph\n Neural Network","summary":" Temporal Point Processes (TPPs) have recently become increasingly interesting\nfor learning dynamics in graph data. A reason for this is that learning on\ndynamic graph data is becoming more relevant, since data from many scientific\nfields, ranging from mathematics, biology, social sciences, and physics to\ncomputer science, is naturally related and inherently dynamic. In addition,\nTPPs provide a meaningful characterization of event streams and a prediction\nmechanism for future events. Therefore, (semi-)parameterized Neural TPPs have\nbeen introduced whose characterization can be (partially) learned and, thus,\nenable the representation of more complex phenomena. However, the research on\nmodeling dynamic graphs with TPPs is relatively young, and only a few models\nfor node attribute changes or evolving edges have been proposed yet. To allow\nfor learning on fully dynamic graph streams, i.e., graphs that can change in\ntheir structure (addition/deletion of nodes/edge) and in their node/edge\nattributes, we propose a Marked Neural Spatio-Temporal Point Process (MNSTPP).\nIt leverages a Dynamic Graph Neural Network to learn a Marked TPP that handles\nattributes and spatial data to model and predict any event in a graph stream.\n","authors":["Alice Moallemy-Oureh","Silvia Beddar-Wiesing","Yannick Nagel","Rüdiger Nather","Josephine M. Thomas"],"pdf_url":"https://arxiv.org/pdf/2206.03469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00645v2","updated":"2024-08-28T13:32:43Z","published":"2023-08-30T13:26:49Z","title":"Analysis of Diagnostics (Part I): Prevalence, Uncertainty\n Quantification, and Machine Learning","summary":" Diagnostic testing provides a unique setting for studying and developing\ntools in classification theory. In such contexts, the concept of prevalence,\ni.e. the number of individuals with a given condition, is fundamental, both as\nan inherent quantity of interest and as a parameter that controls\nclassification accuracy. This manuscript is the first in a two-part series that\nstudies deeper connections between classification theory and prevalence,\nshowing how the latter establishes a more complete theory of uncertainty\nquantification (UQ) for certain types of machine learning (ML). We motivate\nthis analysis via a lemma demonstrating that general classifiers minimizing a\nprevalence-weighted error contain the same probabilistic information as\nBayes-optimal classifiers, which depend on conditional probability densities.\nThis leads us to study relative probability level-sets $B^\\star (q)$, which are\nreinterpreted as both classification boundaries and useful tools for\nquantifying uncertainty in class labels. To realize this in practice, we also\npropose a numerical, homotopy algorithm that estimates the $B^\\star (q)$ by\nminimizing a prevalence-weighted empirical error. 
The successes and\nshortcomings of this method motivate us to revisit properties of the level\nsets, and we deduce the corresponding classifiers obey a useful monotonicity\nproperty that stabilizes the numerics and points to important extensions to UQ\nof ML. Throughout, we validate our methods in the context of synthetic data and\na research-use-only SARS-CoV-2 enzyme-linked immunosorbent (ELISA) assay.\n","authors":["Paul N. Patrone","Raquel A. Binder","Catherine S. Forconi","Ann M. Moormann","Anthony J. Kearsley"],"pdf_url":"https://arxiv.org/pdf/2309.00645v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14382v2","updated":"2024-08-28T13:30:36Z","published":"2023-07-25T20:08:41Z","title":"When Multi-Task Learning Meets Partial Supervision: A Computer Vision\n Review","summary":" Multi-Task Learning (MTL) aims to learn multiple tasks simultaneously while\nexploiting their mutual relationships. By using shared resources to\nsimultaneously calculate multiple outputs, this learning paradigm has the\npotential to have lower memory requirements and inference times compared to the\ntraditional approach of using separate methods for each task. Previous work in\nMTL has mainly focused on fully-supervised methods, as task relationships can\nnot only be leveraged to lower the level of data-dependency of those methods\nbut they can also improve performance. However, MTL introduces a set of\nchallenges due to a complex optimisation scheme and a higher labeling\nrequirement. This review focuses on how MTL could be utilised under different\npartial supervision settings to address these challenges. First, this review\nanalyses how MTL traditionally uses different parameter sharing techniques to\ntransfer knowledge in between tasks. Second, it presents the different\nchallenges arising from such a multi-objective optimisation scheme. Third, it\nintroduces how task groupings can be achieved by analysing task relationships.\nFourth, it focuses on how partially supervised methods applied to MTL can\ntackle the aforementioned challenges. Lastly, this review presents the\navailable datasets, tools and benchmarking results of such methods.\n","authors":["Maxime Fontana","Michael Spratling","Miaojing Shi"],"pdf_url":"https://arxiv.org/pdf/2307.14382v2.pdf","comment":"Accepted by Proceedings of the IEEE"},{"id":"http://arxiv.org/abs/2408.15784v1","updated":"2024-08-28T13:26:36Z","published":"2024-08-28T13:26:36Z","title":"Implicit Regularization Paths of Weighted Neural Representations","summary":" We study the implicit regularization effects induced by (observation)\nweighting of pretrained features. For weight and feature matrices of bounded\noperator norms that are infinitesimally free with respect to (normalized) trace\nfunctionals, we derive equivalence paths connecting different weighting\nmatrices and ridge regularization levels. Specifically, we show that ridge\nestimators trained on weighted features along the same path are asymptotically\nequivalent when evaluated against test vectors of bounded norms. These paths\ncan be interpreted as matching the effective degrees of freedom of ridge\nestimators fitted with weighted features. For the special case of subsampling\nwithout replacement, our results apply to independently sampled random features\nand kernel features and confirm recent conjectures (Conjectures 7 and 8) of the\nauthors on the existence of such paths in Patil et al. 
We also present an\nadditive risk decomposition for ensembles of weighted estimators and show that\nthe risks are equivalent along the paths when the ensemble size goes to\ninfinity. As a practical consequence of the path equivalences, we develop an\nefficient cross-validation method for tuning and apply it to subsampled\npretrained representations across several models (e.g., ResNet-50) and datasets\n(e.g., CIFAR-100).\n","authors":["Jin-Hong Du","Pratik Patil"],"pdf_url":"https://arxiv.org/pdf/2408.15784v1.pdf","comment":"19 pages for main and 19 pages for appendix"},{"id":"http://arxiv.org/abs/2408.09237v2","updated":"2024-08-28T13:10:40Z","published":"2024-08-17T16:06:14Z","title":"QEDCartographer: Automating Formal Verification Using Reward-Free\n Reinforcement Learning","summary":" Formal verification is a promising method for producing reliable software,\nbut the difficulty of manually writing verification proofs severely limits its\nutility in practice. Recent methods have automated some proof synthesis by\nguiding a search through the proof space using a theorem prover. Unfortunately,\nthe theorem prover provides only the crudest estimate of progress, resulting in\neffectively undirected search. To address this problem, we create\nQEDCartographer, an automated proof-synthesis tool that combines supervised and\nreinforcement learning to more effectively explore the proof space.\nQEDCartographer incorporates the proofs' branching structure, enabling\nreward-free search and overcoming the sparse reward problem inherent to formal\nverification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K\ntheorems from 124 open-source Coq projects. QEDCartographer fully automatically\nproves 21.4% of the test-set theorems. Previous search-based proof-synthesis\ntools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on\nsupervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively.\nDiva, which combines 62 tools, proves 19.2%. Comparing to the most effective\nprior tool, Proverbot9001, QEDCartographer produces 26% shorter proofs 27%\nfaster, on average over the theorems both tools prove. Together,\nQEDCartographer and non-learning-based CoqHammer prove 31.8% of the theorems,\nwhile CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement\nlearning is a fruitful research direction for improving proof-synthesis tools'\nsearch mechanisms.\n","authors":["Alex Sanchez-Stern","Abhishek Varghese","Zhanna Kaufman","Dylan Zhang","Talia Ringer","Yuriy Brun"],"pdf_url":"https://arxiv.org/pdf/2408.09237v2.pdf","comment":"Published in the International Conference on Software Engineering\n (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan\n Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal\n Verification Using Reward-Free Reinforcement Learning, in Proceedings of the\n 47th International Conference on Software Engineering (ICSE), 2025"},{"id":"http://arxiv.org/abs/2408.15771v1","updated":"2024-08-28T13:09:20Z","published":"2024-08-28T13:09:20Z","title":"wav2pos: Sound Source Localization using Masked Autoencoders","summary":" We present a novel approach to the 3D sound source localization task for\ndistributed ad-hoc microphone arrays by formulating it as a set-to-set\nregression problem. 
By training a multi-modal masked autoencoder model that\noperates on audio recordings and microphone coordinates, we show that such a\nformulation allows for accurate localization of the sound source, by\nreconstructing coordinates masked in the input. Our approach is flexible in the\nsense that a single model can be used with an arbitrary number of microphones,\neven when a subset of audio recordings and microphone coordinates are missing.\nWe test our method on simulated and real-world recordings of music and speech\nin indoor environments, and demonstrate competitive performance compared to\nboth classical and other learning based localization methods.\n","authors":["Axel Berg","Jens Gulin","Mark O'Connor","Chuteng Zhou","Karl Åström","Magnus Oskarsson"],"pdf_url":"https://arxiv.org/pdf/2408.15771v1.pdf","comment":"IPIN 2024"},{"id":"http://arxiv.org/abs/2405.19730v5","updated":"2024-08-28T13:05:41Z","published":"2024-05-30T06:21:34Z","title":"Research on the Spatial Data Intelligent Foundation Model","summary":" This report focuses on spatial data intelligent large models, delving into\nthe principles, methods, and cutting-edge applications of these models. It\nprovides an in-depth discussion on the definition, development history, current\nstatus, and trends of spatial data intelligent large models, as well as the\nchallenges they face. The report systematically elucidates the key technologies\nof spatial data intelligent large models and their applications in urban\nenvironments, aerospace remote sensing, geography, transportation, and other\nscenarios. Additionally, it summarizes the latest application cases of spatial\ndata intelligent large models in themes such as urban development, multimodal\nsystems, remote sensing, smart transportation, and resource environments.\nFinally, the report concludes with an overview and outlook on the development\nprospects of spatial data intelligent large models.\n","authors":["Shaohua Wang","Xing Xie","Yong Li","Danhuai Guo","Zhi Cai","Yu Liu","Yang Yue","Xiao Pan","Feng Lu","Huayi Wu","Zhipeng Gui","Zhiming Ding","Bolong Zheng","Fuzheng Zhang","Jingyuan Wang","Zhengchao Chen","Hao Lu","Jiayi Li","Peng Yue","Wenhao Yu","Yao Yao","Leilei Sun","Yong Zhang","Longbiao Chen","Xiaoping Du","Xiang Li","Xueying Zhang","Kun Qin","Zhaoya Gong","Weihua Dong","Xiaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2405.19730v5.pdf","comment":"V1 and V2 are in Chinese language, other versions are in English"},{"id":"http://arxiv.org/abs/2408.15766v1","updated":"2024-08-28T12:59:12Z","published":"2024-08-28T12:59:12Z","title":"Harmonized Speculative Sampling","summary":" Speculative sampling has proven to be an effective solution to accelerate\ndecoding from large language models, where the acceptance rate significantly\ndetermines the performance. Most previous works on improving the acceptance\nrate focus on aligned training and efficient decoding, implicitly paying less\nattention to the linkage of training and decoding. In this work, we first\ninvestigate the linkage of training and decoding for speculative sampling and\nthen propose a solution named HArmonized Speculative Sampling (HASS). HASS\nimproves the acceptance rate without extra inference overhead by harmonizing\ntraining and decoding on their objectives and contexts. 
Experiments on three\nLLaMA models demonstrate that HASS achieves 2.81x-3.65x wall-clock time speedup\nratio averaging across three datasets, which is 8%-15% faster than EAGLE-2.\n","authors":["Lefan Zhang","Xiaodan Wang","Yanhua Huang","Ruiwen Xu"],"pdf_url":"https://arxiv.org/pdf/2408.15766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09495v2","updated":"2024-08-28T12:46:28Z","published":"2024-06-13T17:36:05Z","title":"FADE: Towards Fairness-aware Augmentation for Domain Generalization via\n Classifier-Guided Score-based Diffusion Models","summary":" Fairness-aware domain generalization (FairDG) has emerged as a critical\nchallenge for deploying trustworthy AI systems, particularly in scenarios\ninvolving distribution shifts. Traditional methods for addressing fairness have\nfailed in domain generalization due to their lack of consideration for\ndistribution shifts. Although disentanglement has been used to tackle FairDG,\nit is limited by its strong assumptions. To overcome these limitations, we\npropose Fairness-aware Classifier-Guided Score-based Diffusion Models (FADE) as\na novel approach to effectively address the FairDG issue. Specifically, we\nfirst pre-train a score-based diffusion model (SDM) and two classifiers to\nequip the model with strong generalization capabilities across different\ndomains. Then, we guide the SDM using these pre-trained classifiers to\neffectively eliminate sensitive information from the generated data. Finally,\nthe generated fair data is used to train downstream classifiers, ensuring\nrobust performance under new data distributions. Extensive experiments on three\nreal-world datasets demonstrate that FADE not only enhances fairness but also\nimproves accuracy in the presence of distribution shifts. Additionally, FADE\noutperforms existing methods in achieving the best accuracy-fairness\ntrade-offs.\n","authors":["Yujie Lin","Dong Li","Chen Zhao","Minglai Shao"],"pdf_url":"https://arxiv.org/pdf/2406.09495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02255v3","updated":"2024-08-28T12:43:10Z","published":"2023-12-04T18:56:08Z","title":"Re-Nerfing: Improving Novel View Synthesis through Novel View Synthesis","summary":" Recent neural rendering and reconstruction techniques, such as NeRFs or\nGaussian Splatting, have shown remarkable novel view synthesis capabilities but\nrequire hundreds of images of the scene from diverse viewpoints to render\nhigh-quality novel views. With fewer images available, these methods start to\nfail since they can no longer correctly triangulate the underlying 3D geometry\nand converge to a non-optimal solution. These failures can manifest as floaters\nor blurry renderings in sparsely observed areas of the scene. In this paper, we\npropose Re-Nerfing, a simple and general add-on approach that leverages novel\nview synthesis itself to tackle this problem. Using an already trained NVS\nmethod, we render novel views between existing ones and augment the training\ndata to optimize a second model. This introduces additional multi-view\nconstraints and allows the second model to converge to a better solution. With\nRe-Nerfing we achieve significant improvements upon multiple pipelines based on\nNeRF and Gaussian-Splatting in sparse view settings of the mip-NeRF 360 and\nLLFF datasets. 
Notably, Re-Nerfing does not require prior knowledge or extra\nsupervision signals, making it a flexible and practical add-on.\n","authors":["Felix Tristram","Stefano Gasperini","Nassir Navab","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.02255v3.pdf","comment":"Code will be released upon acceptance"},{"id":"http://arxiv.org/abs/2408.14432v2","updated":"2024-08-28T12:39:57Z","published":"2024-08-26T17:20:34Z","title":"Contextual Bandit with Herding Effects: Algorithms and Recommendation\n Applications","summary":" Contextual bandits serve as a fundamental algorithmic framework for\noptimizing recommendation decisions online. Though extensive attention has been\npaid to tailoring contextual bandits for recommendation applications, the\n\"herding effects\" in user feedback have been ignored. These herding effects\nbias user feedback toward historical ratings, breaking down the assumption of\nunbiased feedback inherent in contextual bandits. This paper develops a novel\nvariant of the contextual bandit that is tailored to address the feedback bias\ncaused by the herding effects. A user feedback model is formulated to capture\nthis feedback bias. We design the TS-Conf (Thompson Sampling under Conformity)\nalgorithm, which employs posterior sampling to balance the exploration and\nexploitation tradeoff. We prove an upper bound for the regret of the algorithm,\nrevealing the impact of herding effects on learning speed. Extensive\nexperiments on datasets demonstrate that TS-Conf outperforms four benchmark\nalgorithms. Analysis reveals that TS-Conf effectively mitigates the negative\nimpact of herding effects, resulting in faster learning and improved\nrecommendation accuracy.\n","authors":["Luyue Xu","Liming Wang","Hong Xie","Mingqiang Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.14432v2.pdf","comment":"Published as a conference paper at PRICAI 2024"},{"id":"http://arxiv.org/abs/2408.15753v1","updated":"2024-08-28T12:39:51Z","published":"2024-08-28T12:39:51Z","title":"A Neural Material Point Method for Particle-based Simulations","summary":" Mesh-free Lagrangian methods are widely used for simulating fluids, solids,\nand their complex interactions due to their ability to handle large\ndeformations and topological changes. These physics simulators, however,\nrequire substantial computational resources for accurate simulations. To\naddress these issues, deep learning emulators promise faster and scalable\nsimulations, yet they often remain expensive and difficult to train, limiting\ntheir practical use. Inspired by the Material Point Method (MPM), we present\nNeuralMPM, a neural emulation framework for particle-based simulations.\nNeuralMPM interpolates Lagrangian particles onto a fixed-size grid, computes\nupdates on grid nodes using image-to-image neural networks, and interpolates\nback to the particles. Similarly to MPM, NeuralMPM benefits from the regular\nvoxelized representation to simplify the computation of the state dynamics,\nwhile avoiding the drawbacks of mesh-based Eulerian methods. We demonstrate the\nadvantages of NeuralMPM on several datasets, including fluid dynamics and\nfluid-solid interactions. Compared to existing methods, NeuralMPM reduces\ntraining times from days to hours, while achieving comparable or superior\nlong-term accuracy, making it a promising approach for practical forward and\ninverse problems. 
A project page is available at https://neuralmpm.isach.be\n","authors":["Omer Rochman Sharabi","Sacha Lewin","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2408.15753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11122v6","updated":"2024-08-28T12:33:52Z","published":"2023-10-17T10:14:10Z","title":"Sensitivity-Aware Amortized Bayesian Inference","summary":" Sensitivity analyses reveal the influence of various modeling choices on the\noutcomes of statistical analyses. While theoretically appealing, they are\noverwhelmingly inefficient for complex Bayesian models. In this work, we\npropose sensitivity-aware amortized Bayesian inference (SA-ABI), a multifaceted\napproach to efficiently integrate sensitivity analyses into simulation-based\ninference with neural networks. First, we utilize weight sharing to encode the\nstructural similarities between alternative likelihood and prior specifications\nin the training process with minimal computational overhead. Second, we\nleverage the rapid inference of neural networks to assess sensitivity to data\nperturbations and preprocessing steps. In contrast to most other Bayesian\napproaches, both steps circumvent the costly bottleneck of refitting the model\nfor each choice of likelihood, prior, or data set. Finally, we propose to use\ndeep ensembles to detect sensitivity arising from unreliable approximation\n(e.g., due to model misspecification). We demonstrate the effectiveness of our\nmethod in applied modeling problems, ranging from disease outbreak dynamics and\nglobal warming thresholds to human decision-making. Our results support\nsensitivity-aware inference as a default choice for amortized Bayesian\nworkflows, automatically providing modelers with insights into otherwise hidden\ndimensions.\n","authors":["Lasse Elsemüller","Hans Olischläger","Marvin Schmitt","Paul-Christian Bürkner","Ullrich Köthe","Stefan T. Radev"],"pdf_url":"https://arxiv.org/pdf/2310.11122v6.pdf","comment":"Published in TMLR (2024)"},{"id":"http://arxiv.org/abs/2407.16496v2","updated":"2024-08-28T12:20:42Z","published":"2024-07-23T14:11:12Z","title":"Articulation Work and Tinkering for Fairness in Machine Learning","summary":" The field of fair AI aims to counter biased algorithms through computational\nmodelling. However, it faces increasing criticism for perpetuating the use of\noverly technical and reductionist methods. As a result, novel approaches appear\nin the field to address more socially-oriented and interdisciplinary (SOI)\nperspectives on fair AI. In this paper, we take this dynamic as the starting\npoint to study the tension between computer science (CS) and SOI research. By\ndrawing on STS and CSCW theory, we position fair AI research as a matter of\n'organizational alignment': what makes research 'doable' is the successful\nalignment of three levels of work organization (the social world, the\nlaboratory, and the experiment). Based on qualitative interviews with CS\nresearchers, we analyze the tasks, resources, and actors required for doable\nresearch in the case of fair AI. We find that CS researchers engage with SOI\nresearch to some extent, but organizational conditions, articulation work, and\nambiguities of the social world constrain the doability of SOI research for\nthem. Based on our findings, we identify and discuss problems for aligning CS\nand SOI as fair AI continues to evolve.\n","authors":["Miriam Fahimi","Mayra Russo","Kristen M. 
Scott","Maria-Esther Vidal","Bettina Berendt","Katharina Kinder-Kurlanda"],"pdf_url":"https://arxiv.org/pdf/2407.16496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08459v3","updated":"2024-08-28T12:11:46Z","published":"2023-03-15T09:03:58Z","title":"Forecasting Intraday Power Output by a Set of PV Systems using Recurrent\n Neural Networks and Physical Covariates","summary":" Accurate intraday forecasts of the power output by PhotoVoltaic (PV) systems\nare critical to improve the operation of energy distribution grids. We describe\na neural autoregressive model that aims to perform such intraday forecasts. We\nbuild upon a physical, deterministic PV performance model, the output of which\nis used as covariates in the context of the neural model. In addition, our\napplication data relates to a geographically distributed set of PV systems. We\naddress all PV sites with a single neural model, which embeds the information\nabout the PV site in specific covariates. We use a scale-free approach which\nrelies on the explicit modeling of seasonal effects. Our proposal repurposes a\nmodel initially used in the retail sector and discloses a novel truncated\nGaussian output distribution. An ablation study and a comparison to alternative\narchitectures from the literature shows that the components in the best\nperforming proposed model variant work synergistically to reach a skill score\nof 15.72% with respect to the physical model, used as a baseline.\n","authors":["Pierrick Bruneau","David Fiorelli","Christian Braun","Daniel Koster"],"pdf_url":"https://arxiv.org/pdf/2303.08459v3.pdf","comment":"25 pages, 7 figures, Accepted for publication in Neural Computing and\n Applications on 12/07/2024"},{"id":"http://arxiv.org/abs/2408.14398v2","updated":"2024-08-28T12:03:54Z","published":"2024-08-26T16:29:13Z","title":"Language-specific Calibration for Pruning Multilingual Language Models","summary":" Recent advances in large language model (LLM) pruning have shown\nstate-of-the-art compression results in post-training and retraining-free\nsettings while maintaining high predictive performance. However, such research\nmainly considers calibrating pruning using English text, despite the\nmultilingual nature of modern LLMs and their frequent uses in non-English\nlanguages. In this paper, we set out to explore effective strategies for\ncalibrating the pruning of multilingual language models. We present the first\ncomprehensive empirical study, comparing different calibration languages for\npruning multilingual models across diverse tasks, models, and state-of-the-art\npruning techniques. Our results present practical suggestions, for example,\ncalibrating in the target language can efficiently yield lower perplexity, but\ndoes not necessarily benefit downstream tasks. Our further analysis experiments\nunveil that calibration in the target language mainly contributes to preserving\nlanguage-specific features related to fluency and coherence, but might not\ncontribute to capturing language-agnostic features such as language\nunderstanding and reasoning. 
Last, we provide practical recommendations for\nfuture practitioners.\n","authors":["Simon Kurz","Jian-Jia Chen","Lucie Flek","Zhixue Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.14398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11960v3","updated":"2024-08-28T11:48:43Z","published":"2024-03-18T16:57:16Z","title":"Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal\n Time Series Imputation","summary":" Spatiotemporal time series are usually collected via monitoring sensors\nplaced at different locations, which usually contain missing values due to\nvarious failures, such as mechanical damages and Internet outages. Imputing the\nmissing values is crucial for analyzing time series. When recovering a specific\ndata point, most existing methods consider all the information relevant to that\npoint regardless of the cause-and-effect relationship. During data collection,\nit is inevitable that some unknown confounders are included, e.g., background\nnoise in time series and non-causal shortcut edges in the constructed sensor\nnetwork. These confounders could open backdoor paths and establish non-causal\ncorrelations between the input and output. Over-exploiting these non-causal\ncorrelations could cause overfitting. In this paper, we first revisit\nspatiotemporal time series imputation from a causal perspective and show how to\nblock the confounders via the frontdoor adjustment. Based on the results of\nfrontdoor adjustment, we introduce a novel Causality-Aware Spatiotemporal Graph\nNeural Network (Casper), which contains a novel Prompt Based Decoder (PBD) and\na Spatiotemporal Causal Attention (SCA). PBD could reduce the impact of\nconfounders and SCA could discover the sparse causal relationships among\nembeddings. Theoretical analysis reveals that SCA discovers causal\nrelationships based on the values of gradients. We evaluate Casper on three\nreal-world datasets, and the experimental results show that Casper could\noutperform the baselines and could effectively discover causal relationships.\n","authors":["Baoyu Jing","Dawei Zhou","Kan Ren","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2403.11960v3.pdf","comment":"Accepted by CIKM'2024"},{"id":"http://arxiv.org/abs/2408.15722v1","updated":"2024-08-28T11:39:24Z","published":"2024-08-28T11:39:24Z","title":"Advanced POD-Based Performance Evaluation of Classifiers Applied to\n Human Driver Lane Changing Prediction","summary":" Machine learning (ML) classifiers serve as essential tools facilitating\nclassification and prediction across various domains. The performance of these\nalgorithms should be known to ensure their reliable application. In certain\nfields, receiver operating characteristic and precision-recall curves are\nfrequently employed to assess machine learning algorithms without accounting\nfor the impact of process parameters. However, it may be essential to evaluate\nthe performance of these algorithms in relation to such parameters. As a\nperformance evaluation metric capable of considering the effects of process\nparameters, this paper uses a modified probability of detection (POD) approach\nto assess the reliability of ML-based algorithms. As an example, the POD-based\napproach is employed to assess ML models used for predicting the lane changing\nbehavior of a vehicle driver. The time remaining to the predicted (and\ntherefore unknown) lane changing event is considered as process parameter. 
The\nhit/miss approach to POD is taken here and modified by considering the\nprobability of lane changing derived from ML algorithms at each time step, and\nobtaining the final result of the analysis accordingly. This improves the\nreliability of results compared to the standard hit/miss approach, which\nconsiders the outcome of the classifiers as either 0 or 1, while also\nsimplifying evaluation compared to the â versus a approach. Performance\nevaluation results of the proposed approach are compared with those obtained\nwith the standard hit/miss approach and a pre-developed â versus a approach\nto validate the effectiveness of the proposed method. The comparison shows that\nthis method provides, on average, conservative behavior with the advantage of\nenhancing the reliability of the hit/miss approach to POD while retaining its\nsimplicity.\n","authors":["Zahra Rastin","Dirk Söffker"],"pdf_url":"https://arxiv.org/pdf/2408.15722v1.pdf","comment":"Manuscript: 8 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2305.17479v3","updated":"2024-08-28T11:27:25Z","published":"2023-05-27T13:57:26Z","title":"Inferring Individual Direct Causal Effects Under Heterogeneous Peer\n Influence","summary":" Causal inference in networks should account for interference, which occurs\nwhen a unit's outcome is influenced by treatments or outcomes of peers.\nHeterogeneous peer influence (HPI) occurs when a unit's outcome is influenced\ndifferently by different peers based on their attributes and relationships, or\nwhen each unit has a different susceptibility to peer influence. Existing\nsolutions to estimating direct causal effects under interference consider\neither homogeneous influence from peers or specific heterogeneous influence\nmechanisms (e.g., based on local neighborhood structure). This paper presents a\nmethodology for estimating individual direct causal effects in the presence of\nHPI where the mechanism of influence is not known a priori. We propose a\nstructural causal model for networks that can capture different possible\nassumptions about network structure, interference conditions, and causal\ndependence and enables reasoning about identifiability in the presence of HPI.\nWe find potential heterogeneous contexts using the causal model and propose a\nnovel graph neural network-based estimator to estimate individual direct causal\neffects. We show that state-of-the-art methods for individual direct effect\nestimation produce biased results in the presence of HPI, and that our proposed\nestimator is robust.\n","authors":["Shishir Adhikari","Elena Zheleva"],"pdf_url":"https://arxiv.org/pdf/2305.17479v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15715v1","updated":"2024-08-28T11:21:33Z","published":"2024-08-28T11:21:33Z","title":"Autoregressive model path dependence near Ising criticality","summary":" Autoregressive models are a class of generative models that probabilistically\npredict the next output of a sequence based on previous inputs. The\nautoregressive sequence is by definition one-dimensional (1D), which is natural\nfor language tasks and hence an important component of modern architectures\nlike recurrent neural networks (RNNs) and transformers. However, when language\nmodels are used to predict outputs on physical systems that are not\nintrinsically 1D, the question arises of which choice of autoregressive\nsequence -- if any -- is optimal. 
In this paper, we study the reconstruction of\ncritical correlations in the two-dimensional (2D) Ising model, using RNNs and\ntransformers trained on binary spin data obtained near the thermal phase\ntransition. We compare the training performance for a number of different 1D\nautoregressive sequences imposed on finite-size 2D lattices. We find that paths\nwith long 1D segments are more efficient at training the autoregressive models\ncompared to space-filling curves that better preserve the 2D locality. Our\nresults illustrate the potential importance in choosing the optimal\nautoregressive sequence ordering when training modern language models for tasks\nin physics.\n","authors":["Yi Hong Teoh","Roger G. Melko"],"pdf_url":"https://arxiv.org/pdf/2408.15715v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.15714v1","updated":"2024-08-28T11:21:23Z","published":"2024-08-28T11:21:23Z","title":"Pixels to Prose: Understanding the art of Image Captioning","summary":" In the era of evolving artificial intelligence, machines are increasingly\nemulating human-like capabilities, including visual perception and linguistic\nexpression. Image captioning stands at the intersection of these domains,\nenabling machines to interpret visual content and generate descriptive text.\nThis paper provides a thorough review of image captioning techniques, catering\nto individuals entering the field of machine learning who seek a comprehensive\nunderstanding of available options, from foundational methods to\nstate-of-the-art approaches. Beginning with an exploration of primitive\narchitectures, the review traces the evolution of image captioning models to\nthe latest cutting-edge solutions. By dissecting the components of these\narchitectures, readers gain insights into the underlying mechanisms and can\nselect suitable approaches tailored to specific problem requirements without\nduplicating efforts. The paper also delves into the application of image\ncaptioning in the medical domain, illuminating its significance in various\nreal-world scenarios.\n Furthermore, the review offers guidance on evaluating the performance of\nimage captioning systems, highlighting key metrics for assessment. By\nsynthesizing theoretical concepts with practical application, this paper equips\nreaders with the knowledge needed to navigate the complex landscape of image\ncaptioning and harness its potential for diverse applications in machine\nlearning and beyond.\n","authors":["Hrishikesh Singh","Aarti Sharma","Millie Pant"],"pdf_url":"https://arxiv.org/pdf/2408.15714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08235v3","updated":"2024-08-28T11:18:26Z","published":"2023-04-14T07:55:07Z","title":"A Platform-Agnostic Deep Reinforcement Learning Framework for Effective\n Sim2Real Transfer towards Autonomous Driving","summary":" Deep Reinforcement Learning (DRL) has shown remarkable success in solving\ncomplex tasks across various research fields. However, transferring DRL agents\nto the real world is still challenging due to the significant discrepancies\nbetween simulation and reality. To address this issue, we propose a robust DRL\nframework that leverages platform-dependent perception modules to extract\ntask-relevant information and train a lane-following and overtaking agent in\nsimulation. This framework facilitates the seamless transfer of the DRL agent\nto new simulated environments and the real world with minimal effort. 
We\nevaluate the performance of the agent in various driving scenarios in both\nsimulation and the real world, and compare it to human players and the PID\nbaseline in simulation. Our proposed framework significantly reduces the gaps\nbetween different platforms and the Sim2Real gap, enabling the trained agent to\nachieve similar performance in both simulation and the real world, driving the\nvehicle effectively.\n","authors":["Dianzhao Li","Ostap Okhrin"],"pdf_url":"https://arxiv.org/pdf/2304.08235v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03472v3","updated":"2024-08-28T11:18:00Z","published":"2023-08-07T11:02:44Z","title":"Improving the forecast accuracy of wind power by leveraging multiple\n hierarchical structure","summary":" Renewable energy generation is of utmost importance for global\ndecarbonization. Forecasting renewable energies, particularly wind energy, is\nchallenging due to the inherent uncertainty in wind energy generation, which\ndepends on weather conditions. Recent advances in hierarchical forecasting\nthrough reconciliation have demonstrated a significant increase in the quality\nof wind energy forecasts for short-term periods. We leverage the\ncross-sectional and temporal hierarchical structure of turbines in wind farms\nand build cross-temporal hierarchies to further investigate how integrated\ncross-sectional and temporal dimensions can add value to forecast accuracy in\nwind farms. We found that cross-temporal reconciliation was superior to\nindividual cross-sectional reconciliation at multiple temporal aggregations.\nAdditionally, machine learning based forecasts that were cross-temporally\nreconciled demonstrated high accuracy at coarser temporal granularities, which\nmay encourage adoption for short-term wind forecasts. Empirically, we provide\ninsights for decision-makers on the best methods for forecasting high-frequency\nwind data across different forecasting horizons and levels.\n","authors":["Lucas English","Mahdi Abolghasemi"],"pdf_url":"https://arxiv.org/pdf/2308.03472v3.pdf","comment":"41 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.15702v1","updated":"2024-08-28T11:02:23Z","published":"2024-08-28T11:02:23Z","title":"Evaluating Model Robustness Using Adaptive Sparse L0 Regularization","summary":" Deep Neural Networks have demonstrated remarkable success in various domains\nbut remain susceptible to adversarial examples, which are slightly altered\ninputs designed to induce misclassification. While adversarial attacks\ntypically optimize under Lp norm constraints, attacks based on the L0 norm,\nprioritising input sparsity, are less studied due to their complex and non-convex\nnature. These sparse adversarial examples challenge existing defenses by\naltering a minimal subset of features, potentially uncovering more subtle DNN\nweaknesses. However, the current L0 norm attack methodologies face a trade-off\nbetween accuracy and efficiency: either precise but computationally intensive or\nexpedient but imprecise. 
This paper proposes a novel, scalable, and effective\napproach to generate adversarial examples based on the L0 norm, aimed at\nrefining the robustness evaluation of DNNs against such perturbations.\n","authors":["Weiyou Liu","Zhenyang Li","Weitong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15702v1.pdf","comment":"Accepted by the 20th International Conference on Advanced Data Mining\n and Applications (ADMA 2024)"},{"id":"http://arxiv.org/abs/2403.17550v2","updated":"2024-08-28T10:52:32Z","published":"2024-03-26T09:58:06Z","title":"DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping","summary":" Recently, significant progress has been achieved in sensing real large-scale\noutdoor 3D environments, particularly by using modern acquisition equipment\nsuch as LiDAR sensors. Unfortunately, they are fundamentally limited in their\nability to produce dense, complete 3D scenes. To address this issue, recent\nlearning-based methods integrate neural implicit representations and\noptimizable feature grids to approximate surfaces of 3D scenes. However,\nnaively fitting samples along raw LiDAR rays leads to noisy 3D mapping results\ndue to the nature of sparse, conflicting LiDAR measurements. Instead, in this\nwork we depart from fitting LiDAR data exactly, instead letting the network\noptimize a non-metric monotonic implicit field defined in 3D space. To fit our\nfield, we design a learning system integrating a monotonicity loss that enables\noptimizing neural monotonic fields and leverages recent progress in large-scale\n3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as\ncaptured by multiple quantitative and perceptual measures and visual results\nobtained for Mai City, Newer College, and KITTI benchmarks. The code of our\napproach will be made publicly available.\n","authors":["Kutay Yılmaz","Matthias Nießner","Anastasiia Kornilova","Alexey Artemov"],"pdf_url":"https://arxiv.org/pdf/2403.17550v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.03187v3","updated":"2024-08-28T10:00:01Z","published":"2023-12-05T23:33:49Z","title":"FERGI: Automatic Annotation of User Preferences for Text-to-Image\n Generation from Spontaneous Facial Expression Reaction","summary":" Researchers have proposed to use data of human preference feedback to\nfine-tune text-to-image generative models. However, the scalability of human\nfeedback collection has been limited by its reliance on manual annotation.\nTherefore, we develop and test a method to automatically score user preferences\nfrom their spontaneous facial expression reaction to the generated images. We\ncollect a dataset of Facial Expression Reaction to Generated Images (FERGI) and\nshow that the activations of multiple facial action units (AUs) are highly\ncorrelated with user evaluations of the generated images. We develop an FAU-Net\n(Facial Action Units Neural Network), which receives inputs from an AU\nestimation model, to automatically score user preferences for text-to-image\ngeneration based on their facial expression reactions, which is complementary\nto the pre-trained scoring models based on the input text prompts and generated\nimages. Integrating our FAU-Net valence score with the pre-trained scoring\nmodels improves their consistency with human preferences. This method of\nautomatic annotation with facial expression analysis can be potentially\ngeneralized to other generation tasks. 
The code is available at\nhttps://github.com/ShuangquanFeng/FERGI, and the dataset is also available at\nthe same link for research purposes.\n","authors":["Shuangquan Feng","Junhua Ma","Virginia R. de Sa"],"pdf_url":"https://arxiv.org/pdf/2312.03187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18316v2","updated":"2024-08-28T09:42:58Z","published":"2024-06-26T12:59:37Z","title":"Trade-off between Gradient Measurement Efficiency and Expressivity in\n Deep Quantum Neural Networks","summary":" Quantum neural networks (QNNs) require an efficient training algorithm to\nachieve practical quantum advantages. A promising approach is the use of\ngradient-based optimization algorithms, where gradients are estimated through\nquantum measurements. However, general QNNs lack an efficient gradient\nmeasurement algorithm, which poses a fundamental and practical challenge to\nrealizing scalable QNNs. In this work, we rigorously prove a trade-off between\ngradient measurement efficiency, defined as the mean number of simultaneously\nmeasurable gradient components, and expressivity in a wide class of deep QNNs,\nelucidating the theoretical limits and possibilities of efficient gradient\nestimation. This trade-off implies that a more expressive QNN requires a higher\nmeasurement cost in gradient estimation, whereas we can increase gradient\nmeasurement efficiency by reducing the QNN expressivity to suit a given task.\nWe further propose a general QNN ansatz called the stabilizer-logical product\nansatz (SLPA), which can reach the upper limit of the trade-off inequality by\nleveraging the symmetric structure of the quantum circuit. In learning an\nunknown symmetric function, the SLPA drastically reduces the quantum resources\nrequired for training while maintaining accuracy and trainability compared to a\nwell-designed symmetric circuit based on the parameter-shift method. Our\nresults not only reveal a theoretical understanding of efficient training in\nQNNs but also provide a standard and broadly applicable efficient QNN design.\n","authors":["Koki Chinzei","Shinichiro Yamano","Quoc Hoan Tran","Yasuhiro Endo","Hirotaka Oshima"],"pdf_url":"https://arxiv.org/pdf/2406.18316v2.pdf","comment":"31 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.15667v1","updated":"2024-08-28T09:40:40Z","published":"2024-08-28T09:40:40Z","title":"Towards reliable respiratory disease diagnosis based on cough sounds and\n vision transformers","summary":" Recent advancements in deep learning techniques have sparked performance\nboosts in various real-world applications including disease diagnosis based on\nmulti-modal medical data. Cough sound data-based respiratory disease (e.g.,\nCOVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also\nattracted much attention. However, existing works usually utilise traditional\nmachine learning or deep models of moderate scales. On the other hand, the\ndeveloped approaches are trained and evaluated on small-scale data due to the\ndifficulty of curating and annotating clinical data on scale. To address these\nissues in prior works, we create a unified framework to evaluate various deep\nmodels from lightweight Convolutional Neural Networks (e.g., ResNet18) to\nmodern vision transformers and compare their performance in respiratory disease\nclassification. 
Based on the observations from such an extensive empirical\nstudy, we propose a novel approach to cough-based disease classification based\non both self-supervised and supervised learning on a large-scale cough data\nset. Experimental results demonstrate our proposed approach outperforms prior\narts consistently on two benchmark datasets for COVID-19 diagnosis and a\nproprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%.\n","authors":["Qian Wang","Zhaoyang Bu","Jiaxuan Mao","Wenyu Zhu","Jingya Zhao","Wei Du","Guochao Shi","Min Zhou","Si Chen","Jieming Qu"],"pdf_url":"https://arxiv.org/pdf/2408.15667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15664v1","updated":"2024-08-28T09:31:09Z","published":"2024-08-28T09:31:09Z","title":"Auxiliary-Loss-Free Load Balancing Strategy for Mixture-of-Experts","summary":" For Mixture-of-Experts (MoE) models, an unbalanced expert load will lead to\nrouting collapse or increased computational overhead. Existing methods commonly\nemploy an auxiliary loss to encourage load balance, but a large auxiliary loss\nwill introduce non-negligible interference gradients into training and thus\nimpair the model performance. In order to control load balance while not\nproducing undesired gradients during training, we propose Loss-Free Balancing,\nfeatured by an auxiliary-loss-free load balancing strategy. To be specific,\nbefore the top-K routing decision, Loss-Free Balancing will first apply an\nexpert-wise bias to the routing scores of each expert. By dynamically updating\nthe bias of each expert according to its recent load, Loss-Free Balancing can\nconsistently maintain a balanced distribution of expert load. In addition,\nsince Loss-Free Balancing does not produce any interference gradients, it also\nelevates the upper bound of model performance gained from MoE training. We\nvalidate the performance of Loss-Free Balancing on MoE models with up to 3B\nparameters trained on up to 200B tokens. Experimental results show that\nLoss-Free Balancing achieves both better performance and better load balance\ncompared with traditional auxiliary-loss-controlled load balancing strategies.\n","authors":["Lean Wang","Huazuo Gao","Chenggang Zhao","Xu Sun","Damai Dai"],"pdf_url":"https://arxiv.org/pdf/2408.15664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14951v2","updated":"2024-08-28T09:08:11Z","published":"2024-08-27T10:54:51Z","title":"Domain-decoupled Physics-informed Neural Networks with Closed-form\n Gradients for Fast Model Learning of Dynamical Systems","summary":" Physics-informed neural networks (PINNs) are trained using physical equations\nand can also incorporate unmodeled effects by learning from data. PINNs for\ncontrol (PINCs) of dynamical systems are gaining interest due to their\nprediction speed compared to classical numerical integration methods for\nnonlinear state-space models, making them suitable for real-time control\napplications. We introduce the domain-decoupled physics-informed neural network\n(DD-PINN) to address current limitations of PINC in handling large and complex\nnonlinear dynamical systems. The time domain is decoupled from the feed-forward\nneural network to construct an Ansatz function, allowing for calculation of\ngradients in closed form. This approach significantly reduces training times,\nespecially for large dynamical systems, compared to PINC, which relies on\ngraph-based automatic differentiation. 
Additionally, the DD-PINN inherently\nfulfills the initial condition and supports higher-order excitation inputs,\nsimplifying the training process and enabling improved prediction accuracy.\nValidation on three systems - a nonlinear mass-spring-damper, a\nfive-mass-chain, and a two-link robot - demonstrates that the DD-PINN achieves\nsignificantly shorter training times. In cases where the PINC's prediction\ndiverges, the DD-PINN's prediction remains stable and accurate due to higher\nphysics loss reduction or use of a higher-order excitation input. The DD-PINN\nallows for fast and accurate learning of large dynamical systems previously out\nof reach for the PINC.\n","authors":["Henrik Krauss","Tim-Lukas Habich","Max Bartholdt","Thomas Seel","Moritz Schappler"],"pdf_url":"https://arxiv.org/pdf/2408.14951v2.pdf","comment":"Accepted to International Conference on Informatics in Control,\n Automation and Robotics (ICINCO) 2024"},{"id":"http://arxiv.org/abs/2402.09066v2","updated":"2024-08-28T09:01:37Z","published":"2024-02-14T10:24:04Z","title":"Solid Waste Detection, Monitoring and Mapping in Remote Sensing Images:\n A Survey","summary":" The detection and characterization of illegal solid waste disposal sites are\nessential for environmental protection, particularly for mitigating pollution\nand health hazards. Improperly managed landfills contaminate soil and\ngroundwater via rainwater infiltration, posing threats to both animals and\nhumans. Traditional landfill identification approaches, such as on-site\ninspections, are time-consuming and expensive. Remote sensing is a\ncost-effective solution for the identification and monitoring of solid waste\ndisposal sites that enables broad coverage and repeated acquisitions over time.\nEarth Observation (EO) satellites, equipped with an array of sensors and\nimaging capabilities, have been providing high-resolution data for several\ndecades. Researchers proposed specialized techniques that leverage remote\nsensing imagery to perform a range of tasks such as waste site detection,\ndumping site monitoring, and assessment of suitable locations for new\nlandfills. This review aims to provide a detailed illustration of the most\nrelevant proposals for the detection and monitoring of solid waste sites by\ndescribing and comparing the approaches, the implemented techniques, and the\nemployed data. Furthermore, since the data sources are of the utmost importance\nfor developing an effective solid waste detection model, a comprehensive\noverview of the satellites and publicly available data sets is presented.\nFinally, this paper identifies the open issues in the state-of-the-art and\ndiscusses the relevant research directions for reducing the costs and improving\nthe effectiveness of novel solid waste detection methods.\n","authors":["Piero Fraternali","Luca Morandini","Sergio Luis Herrera González"],"pdf_url":"https://arxiv.org/pdf/2402.09066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15640v1","updated":"2024-08-28T08:52:14Z","published":"2024-08-28T08:52:14Z","title":"GANs Conditioning Methods: A Survey","summary":" In recent years, Generative Adversarial Networks (GANs) have seen significant\nadvancements, leading to their widespread adoption across various fields. 
The\noriginal GAN architecture enables the generation of images without any specific\ncontrol over the content, making it an unconditional generation process.\nHowever, many practical applications require precise control over the generated\noutput, which has led to the development of conditional GANs (cGANs) that\nincorporate explicit conditioning to guide the generation process. cGANs extend\nthe original framework by incorporating additional information (conditions),\nenabling the generation of samples that adhere to that specific criteria.\nVarious conditioning methods have been proposed, each differing in how they\nintegrate the conditioning information into both the generator and the\ndiscriminator networks. In this work, we review the conditioning methods\nproposed for GANs, exploring the characteristics of each method and\nhighlighting their unique mechanisms and theoretical foundations. Furthermore,\nwe conduct a comparative analysis of these methods, evaluating their\nperformance on various image datasets. Through these analyses, we aim to\nprovide insights into the strengths and limitations of various conditioning\ntechniques, guiding future research and application in generative modeling.\n","authors":["Anis Bourou","Auguste Genovesio","Valérie Mezger"],"pdf_url":"https://arxiv.org/pdf/2408.15640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15633v1","updated":"2024-08-28T08:35:34Z","published":"2024-08-28T08:35:34Z","title":"Comparison of Model Predictive Control and Proximal Policy Optimization\n for a 1-DOF Helicopter System","summary":" This study conducts a comparative analysis of Model Predictive Control (MPC)\nand Proximal Policy Optimization (PPO), a Deep Reinforcement Learning (DRL)\nalgorithm, applied to a 1-Degree of Freedom (DOF) Quanser Aero 2 system.\nClassical control techniques such as MPC and Linear Quadratic Regulator (LQR)\nare widely used due to their theoretical foundation and practical\neffectiveness. However, with advancements in computational techniques and\nmachine learning, DRL approaches like PPO have gained traction in solving\noptimal control problems through environment interaction. This paper\nsystematically evaluates the dynamic response characteristics of PPO and MPC,\ncomparing their performance, computational resource consumption, and\nimplementation complexity. Experimental results show that while LQR achieves\nthe best steady-state accuracy, PPO excels in rise-time and adaptability,\nmaking it a promising approach for applications requiring rapid response and\nadaptability. Additionally, we have established a baseline for future\nRL-related research on this specific testbed. We also discuss the strengths and\nlimitations of each control strategy, providing recommendations for selecting\nappropriate controllers for real-world scenarios.\n","authors":["Georg Schäfer","Jakob Rehrl","Stefan Huber","Simon Hirlaender"],"pdf_url":"https://arxiv.org/pdf/2408.15633v1.pdf","comment":"Accepted at INDIN2024"},{"id":"http://arxiv.org/abs/2408.08454v2","updated":"2024-08-28T08:31:28Z","published":"2024-08-15T23:34:04Z","title":"Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention","summary":" The Transformer architecture has revolutionized deep learning through its\nSelf-Attention mechanism, which effectively captures contextual information.\nHowever, the memory footprint of Self-Attention presents significant challenges\nfor long-sequence tasks. 
Grouped Query Attention (GQA) addresses this issue by\ngrouping queries and mean-pooling the corresponding key-value heads - reducing\nthe number of overall parameters and memory requirements in a flexible manner\nwithout adversely compromising model accuracy. In this work, we introduce\nenhancements to GQA, focusing on two novel approaches that deviate from the\nstatic nature of grouping: Key-Distributed GQA (KDGQA) and Dynamic\nKey-Distributed GQA (DGQA), which leverage information from the norms of the\nkey heads to inform query allocation. Specifically, KDGQA looks at the ratios\nof the norms of the key heads during each forward pass, while DGQA examines the\nratios of the norms as they evolve through training. Additionally, we present\nPerturbed GQA (PGQA) as a case-study, which introduces variability in (static)\ngroup formation via subtracting noise from the attention maps. Our experiments\nwith up-trained Vision Transformers, for Image Classification on datasets such\nas CIFAR-10, CIFAR-100, Food101, and Tiny ImageNet, demonstrate the promise of\nthese variants in improving upon the original GQA through more informed and\nadaptive grouping mechanisms: specifically ViT-L experiences accuracy gains of\nup to 8% when utilizing DGQA in comparison to GQA and other variants. We\nfurther analyze the impact of the number of Key-Value Heads on performance,\nunderscoring the importance of utilizing query-key affinities. Code is\navailable on GitHub.\n","authors":["Zohaib Khan","Muhammad Khaquan","Omer Tafveez","Burhanuddin Samiwala","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08454v2.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15621v1","updated":"2024-08-28T08:22:21Z","published":"2024-08-28T08:22:21Z","title":"Convergent Differential Privacy Analysis for General Federated Learning:\n the f-DP Perspective","summary":" Federated learning (FL) is an efficient collaborative training paradigm\nextensively developed with a focus on local privacy protection, and\ndifferential privacy (DP) is a classical approach to capture and ensure the\nreliability of local privacy. The powerful cooperation of FL and DP provides a\npromising learning framework for large-scale private clients, juggling both\nprivacy securing and trustworthy learning. As the predominant algorithm of DP,\nthe noisy perturbation has been widely studied and incorporated into various\nfederated algorithms, theoretically proven to offer significant privacy\nprotections. However, existing analyses in noisy FL-DP mostly rely on the\ncomposition theorem and cannot tightly quantify the privacy leakage challenges,\nwhich is nearly tight for small numbers of communication rounds but yields an\narbitrarily loose and divergent bound under the large communication rounds.\nThis implies a counterintuitive judgment, suggesting that FL may not provide\nadequate privacy protection during long-term training. To further investigate\nthe convergent privacy and reliability of the FL-DP framework, in this paper,\nwe comprehensively evaluate the worst privacy of two classical methods under\nthe non-convex and smooth objectives based on the f-DP analysis, i.e.\nNoisy-FedAvg and Noisy-FedProx methods. With the aid of the\nshifted-interpolation technique, we successfully prove that the worst privacy\nof the Noisy-FedAvg method achieves a tight convergent lower bound. Moreover,\nin the Noisy-FedProx method, with the regularization of the proxy term, the\nworst privacy has a stable constant lower bound. 
Our analysis further provides\na solid theoretical foundation for the reliability of privacy protection in\nFL-DP. Meanwhile, our conclusions can also be losslessly converted to other\nclassical DP analytical frameworks, e.g. $(\\epsilon,\\delta)$-DP and\nR$\\acute{\\text{e}}$nyi-DP (RDP).\n","authors":["Yan Sun","Li Shen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.15621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15620v1","updated":"2024-08-28T08:21:56Z","published":"2024-08-28T08:21:56Z","title":"CAPER: Enhancing Career Trajectory Prediction using Temporal Knowledge\n Graph and Ternary Relationship","summary":" The problem of career trajectory prediction (CTP) aims to predict one's\nfuture employer or job position. While several CTP methods have been developed\nfor this problem, we posit that none of these methods (1) jointly considers the\nmutual ternary dependency between three key units (i.e., user, position, and\ncompany) of a career and (2) captures the characteristic shifts of key units in\ncareer over time, leading to an inaccurate understanding of the job movement\npatterns in the labor market. To address the above challenges, we propose a\nnovel solution, named as CAPER, that solves the challenges via sophisticated\ntemporal knowledge graph (TKG) modeling. It enables the utilization of a\ngraph-structured knowledge base with rich expressiveness, effectively\npreserving the changes in job movement patterns. Furthermore, we devise an\nextrapolated career reasoning task on TKG for a realistic evaluation. The\nexperiments on a real-world career trajectory dataset demonstrate that CAPER\nconsistently and significantly outperforms four baselines, two recent TKG\nreasoning methods, and five state-of-the-art CTP methods in predicting one's\nfuture companies and positions-i.e., on average, yielding 6.80% and 34.58% more\naccurate predictions, respectively.\n","authors":["Yeon-Chang Lee","JaeHyun Lee","Michiharu Yamashita","Dongwon Lee","Sang-Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2408.15620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15619v1","updated":"2024-08-28T08:20:05Z","published":"2024-08-28T08:20:05Z","title":"Large-Scale Demand Prediction in Urban Rail using Multi-Graph Inductive\n Representation Learning","summary":" With the expansion of cities over time, URT (Urban Rail Transit) networks\nhave also grown significantly. Demand prediction plays an important role in\nsupporting planning, scheduling, fleet management, and other operational\ndecisions. In this study, we propose an Origin-Destination (OD) demand\nprediction model called Multi-Graph Inductive Representation Learning\n(mGraphSAGE) for large-scale URT networks under operational uncertainties. Our\nmain contributions are twofold: we enhance prediction results while ensuring\nscalability for large networks by relying simultaneously on multiple graphs,\nwhere each OD pair is a node on a graph and distinct OD relationships, such as\ntemporal and spatial correlations; we show the importance of including\noperational uncertainties such as train delays and cancellations as inputs in\ndemand prediction for daily operations. The model is validated on three\ndifferent scales of the URT network in Copenhagen, Denmark. 
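Before the experimental summary of the mGraphSAGE entry, a minimal sketch of the sample-and-aggregate step that GraphSAGE-style models rely on, with each node standing for an origin-destination pair. The graph, features, and weights below are toy assumptions, not the authors' code.

```python
# Toy sketch of one GraphSAGE-style mean-aggregation layer over an OD-pair graph.
# Each node is an origin-destination pair; the neighbour lists encode one kind of
# OD relationship (e.g. spatial correlation). Purely illustrative.
import numpy as np

rng = np.random.default_rng(1)
feat_dim, hid_dim = 4, 8
features = {od: rng.normal(size=feat_dim) for od in ["A-B", "A-C", "B-C", "C-D"]}
neighbours = {"A-B": ["A-C", "B-C"], "A-C": ["A-B", "C-D"],
              "B-C": ["A-B"], "C-D": ["A-C"]}
W_self = rng.normal(size=(feat_dim, hid_dim))
W_neigh = rng.normal(size=(feat_dim, hid_dim))

def sage_layer(node, sample_size=2):
    # sample a fixed-size neighbourhood, then mean-aggregate its features
    nbrs = neighbours[node]
    sampled = rng.choice(nbrs, size=min(sample_size, len(nbrs)), replace=False)
    neigh_mean = np.mean([features[n] for n in sampled], axis=0)
    h = features[node] @ W_self + neigh_mean @ W_neigh   # combine self and neighbourhood
    return np.maximum(h, 0.0)                             # ReLU

print(sage_layer("A-B"))
```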
Experimental\nresults show that by leveraging information from neighboring ODs and learning\nnode representations via sampling and aggregation, mGraphSAGE is particularly\nsuitable for OD demand prediction in large-scale URT networks, outperforming\nreference machine learning methods. Furthermore, during periods with train\ncancellations and delays, the performance gap between mGraphSAGE and other\nmethods improves compared to normal operating conditions, demonstrating its\nability to leverage system reliability information for predicting OD demand\nunder uncertainty.\n","authors":["Dang Viet Anh Nguyen","J. Victor Flensburg","Fabrizio Cerreto","Bianca Pascariu","Paola Pellegrini","Carlos Lima Azevedo","Filipe Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2408.15619v1.pdf","comment":"18 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.12961v3","updated":"2024-08-28T08:15:18Z","published":"2024-08-23T10:12:08Z","title":"Symplectic Bregman divergences","summary":" We present a generalization of Bregman divergences in symplectic vector\nspaces that we term symplectic Bregman divergences. Symplectic Bregman\ndivergences are derived from a symplectic generalization of the Fenchel-Young\ninequality which relies on the notion of symplectic subdifferentials. The\nsymplectic Fenchel-Young inequality is obtained using the symplectic Fenchel\ntransform which is defined with respect to the symplectic form. Since\nsymplectic forms can be generically built from pairings of dual systems, we get\na generalization of Bregman divergences in dual systems obtained by equivalent\nsymplectic Bregman divergences. In particular, when the symplectic form is\nderived from an inner product, we show that the corresponding symplectic\nBregman divergences amount to ordinary Bregman divergences with respect to\ncomposite inner products. Some potential applications of symplectic divergences\nin geometric mechanics, information geometry, and learning dynamics in machine\nlearning are touched upon.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2408.12961v3.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.15609v1","updated":"2024-08-28T08:03:04Z","published":"2024-08-28T08:03:04Z","title":"Statistical QoS Provision in Business-Centric Networks","summary":" More refined resource management and Quality of Service (QoS) provisioning is\na critical goal of wireless communication technologies. In this paper, we\npropose a novel Business-Centric Network (BCN) aimed at enabling scalable QoS\nprovisioning, based on a cross-layer framework that captures the relationship\nbetween application, transport parameters, and channels. We investigate both\ncontinuous flow and event-driven flow models, presenting key QoS metrics such\nas throughput, delay, and reliability. By jointly considering power and\nbandwidth allocation, transmission parameters, and AP network topology across\nlayers, we optimize weighted resource efficiency with statistical QoS\nprovisioning. To address the coupling among parameters, we propose a novel deep\nreinforcement learning (DRL) framework, which is Collaborative Optimization\namong Heterogeneous Actors with Experience Sharing (COHA-ES). Power and\nsub-channel (SC) Actors representing multiple APs are jointly optimized under\nthe unified guidance of a common critic. Additionally, we introduce a novel\nmultithreaded experience-sharing mechanism to accelerate training and enhance\nrewards. 
Extensive comparative experiments validate the effectiveness of our\nDRL framework in terms of convergence and efficiency. Moreover, comparative\nanalyses demonstrate the comprehensive advantages of the BCN structure in\nenhancing both spectral and energy efficiency.\n","authors":["Chang Wu","Yuang Chen","Hancheng Lu"],"pdf_url":"https://arxiv.org/pdf/2408.15609v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2408.15601v1","updated":"2024-08-28T07:49:29Z","published":"2024-08-28T07:49:29Z","title":"Grand canonical generative diffusion model for crystalline phases and\n grain boundaries","summary":" The diffusion model has emerged as a powerful tool for generating atomic\nstructures for materials science. This work calls attention to the deficiency\nof current particle-based diffusion models, which represent atoms as a point\ncloud, in generating even the simplest ordered crystalline structures. The\nproblem is attributed to particles being trapped in local minima during the\nscore-driven simulated annealing of the diffusion process, similar to the\nphysical process of force-driven simulated annealing. We develop a solution,\nthe grand canonical diffusion model, which adopts an alternative voxel-based\nrepresentation with continuous rather than fixed number of particles. The\nmethod is applied towards generation of several common crystalline phases as\nwell as the technologically important and challenging problem of grain boundary\nstructures.\n","authors":["Bo Lei","Enze Chen","Hyuna Kwon","Tim Hsu","Babak Sadigh","Vincenzo Lordi","Timofey Frolov","Fei Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.15601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15600v1","updated":"2024-08-28T07:48:39Z","published":"2024-08-28T07:48:39Z","title":"Exploring Selective Layer Fine-Tuning in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for fine-tuning\nfoundation models using distributed data in a privacy-preserving manner. Under\nlimited computational resources, clients often find it more practical to\nfine-tune a selected subset of layers, rather than the entire model, based on\ntheir task-specific data. In this study, we provide a thorough theoretical\nexploration of selective layer fine-tuning in FL, emphasizing a flexible\napproach that allows the clients to adjust their selected layers according to\ntheir local data and resources. We theoretically demonstrate that the layer\nselection strategy has a significant impact on model convergence in two\ncritical aspects: the importance of selected layers and the heterogeneous\nchoices across clients. Drawing from these insights, we further propose a\nstrategic layer selection method that utilizes local gradients and regulates\nlayer selections across clients. 
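A minimal sketch of the layer-selection idea just described for selective fine-tuning in federated learning: a client ranks layers by the norm of its local gradients and updates only the top-k. The exact selection rule in the paper may differ; all names and sizes here are illustrative.

```python
# Toy sketch: rank layers by local gradient norm and fine-tune only the top-k.
# The concrete strategy in the paper may differ; everything here is illustrative.
import numpy as np

rng = np.random.default_rng(2)
params = {f"layer{i}": rng.normal(size=(16, 16)) for i in range(6)}
grads = {name: rng.normal(size=p.shape) * (i + 1)      # pretend later layers have larger grads
         for i, (name, p) in enumerate(params.items())}

k, lr = 2, 0.1
selected = sorted(grads, key=lambda n: np.linalg.norm(grads[n]), reverse=True)[:k]
for name in selected:                                    # update only the selected layers
    params[name] -= lr * grads[name]

print("locally fine-tuned layers:", selected)
```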
The extensive experiments on both image and\ntext datasets demonstrate the effectiveness of the proposed strategy compared\nwith several baselines, highlighting its advances in identifying critical\nlayers that adapt to the client heterogeneity and training dynamics in FL.\n","authors":["Yuchang Sun","Yuexiang Xie","Bolin Ding","Yaliang Li","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15593v1","updated":"2024-08-28T07:36:20Z","published":"2024-08-28T07:36:20Z","title":"Skills Regularized Task Decomposition for Multi-task Offline\n Reinforcement Learning","summary":" Reinforcement learning (RL) with diverse offline datasets can have the\nadvantage of leveraging the relation of multiple tasks and the common skills\nlearned across those tasks, hence allowing us to deal with real-world complex\nproblems efficiently in a data-driven way. In offline RL where only offline\ndata is used and online interaction with the environment is restricted, it is\nyet difficult to achieve the optimal policy for multiple tasks, especially when\nthe data quality varies for the tasks. In this paper, we present a skill-based\nmulti-task RL technique on heterogeneous datasets that are generated by\nbehavior policies of different quality. To learn the shareable knowledge across\nthose datasets effectively, we employ a task decomposition method for which\ncommon skills are jointly learned and used as guidance to reformulate a task in\nshared and achievable subtasks. In this joint learning, we use Wasserstein\nauto-encoder (WAE) to represent both skills and tasks on the same latent space\nand use the quality-weighted loss as a regularization term to induce tasks to\nbe decomposed into subtasks that are more consistent with high-quality skills\nthan others. To improve the performance of offline RL agents learned on the\nlatent space, we also augment datasets with imaginary trajectories relevant to\nhigh-quality skills for each task. Through experiments, we show that our\nmulti-task offline RL approach is robust to the mixed configurations of\ndifferent-quality datasets and it outperforms other state-of-the-art algorithms\nfor several robotic manipulation tasks and drone navigation tasks.\n","authors":["Minjong Yoo","Sangwoo Cho","Honguk Woo"],"pdf_url":"https://arxiv.org/pdf/2408.15593v1.pdf","comment":"12 pages, 5 figures, acceepted in NeurIPS 2022"},{"id":"http://arxiv.org/abs/2408.15591v1","updated":"2024-08-28T07:31:32Z","published":"2024-08-28T07:31:32Z","title":"VFLIP: A Backdoor Defense for Vertical Federated Learning via\n Identification and Purification","summary":" Vertical Federated Learning (VFL) focuses on handling vertically partitioned\ndata over FL participants. Recent studies have discovered a significant\nvulnerability in VFL to backdoor attacks which specifically target the distinct\ncharacteristics of VFL. Therefore, these attacks may neutralize existing\ndefense mechanisms designed primarily for Horizontal Federated Learning (HFL)\nand deep neural networks. In this paper, we present the first backdoor defense,\ncalled VFLIP, specialized for VFL. VFLIP employs the identification and\npurification techniques that operate at the inference stage, consequently\nimproving the robustness against backdoor attacks to a great extent. VFLIP\nfirst identifies backdoor-triggered embeddings by adopting a participant-wise\nanomaly detection approach. 
Subsequently, VFLIP conducts purification which\nremoves the embeddings identified as malicious and reconstructs all the\nembeddings based on the remaining embeddings. We conduct extensive experiments\non CIFAR10, CINIC10, Imagenette, NUS-WIDE, and BankMarketing to demonstrate\nthat VFLIP can effectively mitigate backdoor attacks in VFL.\nhttps://github.com/blingcho/VFLIP-esorics24\n","authors":["Yungi Cho","Woorim Han","Miseon Yu","Ho Bae","Yunheung Paek"],"pdf_url":"https://arxiv.org/pdf/2408.15591v1.pdf","comment":"Accepted by 29th European Symposium on Research in Computer Security\n (ESORICS 2024)"},{"id":"http://arxiv.org/abs/2408.15590v1","updated":"2024-08-28T07:26:30Z","published":"2024-08-28T07:26:30Z","title":"Bayesian optimization of atomic structures with prior probabilities from\n universal interatomic potentials","summary":" The optimization of atomic structures plays a pivotal role in understanding\nand designing materials with desired properties. However, conventional methods\noften struggle with the formidable task of navigating the vast potential energy\nsurface, especially in high-dimensional spaces with numerous local minima.\nRecent advancements in machine learning-driven surrogate models offer a\npromising avenue for alleviating this computational burden. In this study, we\npropose a novel approach that combines the strengths of universal machine\nlearning potentials with a Bayesian approach of the GOFEE/BEACON framework. By\nleveraging the comprehensive chemical knowledge encoded in pretrained universal\nmachine learning potentials as a prior estimate of energy and forces, we enable\nthe Gaussian process to focus solely on capturing the intricate nuances of the\npotential energy surface. We demonstrate the efficacy of our approach through\ncomparative analyses across diverse systems, including periodic bulk materials,\nsurface structures, and a cluster.\n","authors":["Peder Lyngby","Casper Larsen","Karsten Wedel Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2408.15590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15562v1","updated":"2024-08-28T06:28:01Z","published":"2024-08-28T06:28:01Z","title":"Boosting Lossless Speculative Decoding via Feature Sampling and Partial\n Alignment Distillation","summary":" Lossless speculative decoding accelerates target large language model (LLM)\ninference by employing a lightweight draft model for generating tree-structured\ncandidates, which are subsequently verified in parallel by the target LLM.\nCurrently, effective approaches leverage feature-level rather than token-level\nautoregression within the draft model to facilitate more straightforward\npredictions and enhanced knowledge distillation. In this paper, we reassess\nthese approaches and propose FSPAD (Feature Sampling and Partial Alignment\nDistillation for Lossless Speculative Decoding), which introduces two\nstraightforward and effective components within the existing framework to boost\nlossless speculative decoding. Firstly, FSPAD utilizes token embeddings to\nsample features of the target LLM in high-dimensional space before feeding them\ninto the draft model, due to the inherent uncertainty of the features\npreventing the draft model from obtaining the specific token output by the\ntarget LLM. Secondly, FSPAD introduces partial alignment distillation to weaken\nthe draft model's connection between features and logits, aiming to reduce the\nconflict between feature alignment and logit confidence during training. 
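For background on the FSPAD entry, a toy greedy draft-and-verify loop of the kind lossless speculative decoding builds on; `draft` and `target` are dummy stand-ins for a small draft model and the target LLM, and this is not FSPAD itself.

```python
# Toy greedy draft-and-verify loop underlying lossless speculative decoding.
# `draft` and `target` are dummy stand-ins, not real models.
def draft(prefix, n):                       # cheap model: propose n next tokens
    return [(prefix[-1] + 1 + i) % 50 for i in range(n)]

def target(prefix):                         # expensive model: one greedy next token
    return (sum(prefix) * 7 + 3) % 50

def speculative_step(prefix, n=4):
    proposal = draft(prefix, n)
    accepted = []
    for tok in proposal:
        if target(prefix + accepted) == tok:    # keep draft tokens the target agrees with
            accepted.append(tok)
        else:
            break
    accepted.append(target(prefix + accepted))  # always emit one token from the target
    return accepted

seq = [1]
for _ in range(5):
    seq += speculative_step(seq)
print(seq)
```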
Our\nexperiments include both greedy and non-greedy decoding on the largest and\nsmallest models from the Vicuna and LLaMA3-Instruct series, as well as tasks in\nmulti-turn conversation, translation, summarization, question answering,\nmathematical reasoning, and retrieval-augmented generation. The results show\nthat FSPAD outperforms the state-of-the-art method across all the\naforementioned tasks and target LLMs.\n","authors":["Lujun Gui","Bin Xiao","Lei Su","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15562v1.pdf","comment":"The work was not submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2405.07626v2","updated":"2024-08-28T06:18:28Z","published":"2024-05-13T10:37:50Z","title":"AnomalyLLM: Few-shot Anomaly Edge Detection for Dynamic Graphs using\n Large Language Models","summary":" Detecting anomaly edges for dynamic graphs aims to identify edges\nsignificantly deviating from the normal pattern and can be applied in various\ndomains, such as cybersecurity, financial transactions and AIOps. With the\nevolving of time, the types of anomaly edges are emerging and the labeled\nanomaly samples are few for each type. Current methods are either designed to\ndetect randomly inserted edges or require sufficient labeled data for model\ntraining, which harms their applicability for real-world applications. In this\npaper, we study this problem by cooperating with the rich knowledge encoded in\nlarge language models(LLMs) and propose a method, namely AnomalyLLM. To align\nthe dynamic graph with LLMs, AnomalyLLM pre-trains a dynamic-aware encoder to\ngenerate the representations of edges and reprograms the edges using the\nprototypes of word embeddings. Along with the encoder, we design an in-context\nlearning framework that integrates the information of a few labeled samples to\nachieve few-shot anomaly detection. Experiments on four datasets reveal that\nAnomalyLLM can not only significantly improve the performance of few-shot\nanomaly detection, but also achieve superior results on new anomalies without\nany update of model parameters.\n","authors":["Shuo Liu","Di Yao","Lanting Fang","Zhetao Li","Wenbin Li","Kaiyu Feng","XiaoWen Ji","Jingping Bi"],"pdf_url":"https://arxiv.org/pdf/2405.07626v2.pdf","comment":"13pages"},{"id":"http://arxiv.org/abs/2408.15555v1","updated":"2024-08-28T06:08:46Z","published":"2024-08-28T06:08:46Z","title":"Latent Relationship Mining of Glaucoma Biomarkers: a TRI-LSTM based Deep\n Learning","summary":" In recently years, a significant amount of research has been conducted on\napplying deep learning methods for glaucoma classification and detection.\nHowever, the explainability of those established machine learning models\nremains a big concern. In this research, in contrast, we learn from cognitive\nscience concept and study how ophthalmologists judge glaucoma detection.\nSimulating experts' efforts, we propose a hierarchical decision making system,\ncentered around a holistic set of carefully designed biomarker-oriented machine\nlearning models. While biomarkers represent the key indicators of how\nophthalmologists identify glaucoma, they usually exhibit latent\ninter-relations. We thus construct a time series model, named TRI-LSTM, capable\nof calculating and uncovering potential and latent relationships among various\nbiomarkers of glaucoma. Our model is among the first efforts to explore the\nintrinsic connections among glaucoma biomarkers. 
We monitor temporal\nrelationships in patients' disease states over time and to capture and retain\nthe progression of disease-relevant clinical information from prior visits,\nthereby enriching biomarker's potential relationships. Extensive experiments\nover real-world dataset have demonstrated the effectiveness of the proposed\nmodel.\n","authors":["Cheng Huang","Junhao Shen","Qiuyu Luo","Karanjit Kooner","Tsengdar Lee","Yishen Liu","Jia Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15555v1.pdf","comment":"9 pages, 4 images"},{"id":"http://arxiv.org/abs/2408.15554v1","updated":"2024-08-28T06:07:58Z","published":"2024-08-28T06:07:58Z","title":"A Novel Denoising Technique and Deep Learning Based Hybrid Wind Speed\n Forecasting Model for Variable Terrain Conditions","summary":" Wind flow can be highly unpredictable and can suffer substantial fluctuations\nin speed and direction due to the shape and height of hills, mountains, and\nvalleys, making accurate wind speed (WS) forecasting essential in complex\nterrain. This paper presents a novel and adaptive model for short-term\nforecasting of WS. The paper's key contributions are as follows: (a) The\nPartial Auto Correlation Function (PACF) is utilised to minimise the dimension\nof the set of Intrinsic Mode Functions (IMF), hence reducing training time; (b)\nThe sample entropy (SampEn) was used to calculate the complexity of the reduced\nset of IMFs. The proposed technique is adaptive since a specific Deep Learning\n(DL) model-feature combination was chosen based on complexity; (c) A novel\nbidirectional feature-LSTM framework for complicated IMFs has been suggested,\nresulting in improved forecasting accuracy; (d) The proposed model shows\nsuperior forecasting performance compared to the persistence, hybrid, Ensemble\nempirical mode decomposition (EEMD), and Variational Mode Decomposition\n(VMD)-based deep learning models. It has achieved the lowest variance in terms\nof forecasting accuracy between simple and complex terrain conditions 0.70%.\nDimension reduction of IMF's and complexity-based model-feature selection helps\nreduce the training time by 68.77% and improve forecasting quality by 58.58% on\naverage.\n","authors":["Sourav Malakar","Saptarsi Goswami","Amlan Chakrabarti","Bhaswati Ganguli"],"pdf_url":"https://arxiv.org/pdf/2408.15554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15545v1","updated":"2024-08-28T05:41:52Z","published":"2024-08-28T05:41:52Z","title":"SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding","summary":" Scientific literature understanding is crucial for extracting targeted\ninformation and garnering insights, thereby significantly advancing scientific\ndiscovery. Despite the remarkable success of Large Language Models (LLMs), they\nface challenges in scientific literature understanding, primarily due to (1) a\nlack of scientific knowledge and (2) unfamiliarity with specialized scientific\ntasks.\n To develop an LLM specialized in scientific literature understanding, we\npropose a hybrid strategy that integrates continual pre-training (CPT) and\nsupervised fine-tuning (SFT), to simultaneously infuse scientific domain\nknowledge and enhance instruction-following capabilities for domain-specific\ntasks.cIn this process, we identify two key challenges: (1) constructing\nhigh-quality CPT corpora, and (2) generating diverse SFT instructions. 
We\naddress these challenges through a meticulous pipeline, including PDF text\nextraction, parsing content error correction, quality filtering, and synthetic\ninstruction creation. Applying this strategy, we present a suite of LLMs:\nSciLitLLM, specialized in scientific literature understanding. These models\ndemonstrate promising performance on scientific literature understanding\nbenchmarks.\n Our contributions are threefold: (1) We present an effective framework that\nintegrates CPT and SFT to adapt LLMs to scientific literature understanding,\nwhich can also be easily adapted to other domains. (2) We propose an LLM-based\nsynthesis method to generate diverse and high-quality scientific instructions,\nresulting in a new instruction set -- SciLitIns -- for supervised fine-tuning\nin less-represented scientific domains. (3) SciLitLLM achieves promising\nperformance improvements on scientific literature understanding benchmarks.\n","authors":["Sihang Li","Jian Huang","Jiaxi Zhuang","Yaorui Shi","Xiaochen Cai","Mingjun Xu","Xiang Wang","Linfeng Zhang","Guolin Ke","Hengxing Cai"],"pdf_url":"https://arxiv.org/pdf/2408.15545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15535v1","updated":"2024-08-28T04:56:06Z","published":"2024-08-28T04:56:06Z","title":"Improving Thompson Sampling via Information Relaxation for Budgeted\n Multi-armed Bandits","summary":" We consider a Bayesian budgeted multi-armed bandit problem, in which each arm\nconsumes a different amount of resources when selected and there is a budget\nconstraint on the total amount of resources that can be used. Budgeted Thompson\nSampling (BTS) offers a very effective heuristic to this problem, but its\narm-selection rule does not take into account the remaining budget information.\nWe adopt \\textit{Information Relaxation Sampling} framework that generalizes\nThompson Sampling for classical $K$-armed bandit problems, and propose a series\nof algorithms that are randomized like BTS but more carefully optimize their\ndecisions with respect to the budget constraint. In a one-to-one correspondence\nwith these algorithms, a series of performance benchmarks that improve the\nconventional benchmark are also suggested. Our theoretical analysis and\nsimulation results show that our algorithms (and our benchmarks) make\nincremental improvements over BTS (respectively, the conventional benchmark)\nacross various settings including a real-world example.\n","authors":["Woojin Jeong","Seungki Min"],"pdf_url":"https://arxiv.org/pdf/2408.15535v1.pdf","comment":"accepted"},{"id":"http://arxiv.org/abs/2408.11293v2","updated":"2024-08-28T04:09:33Z","published":"2024-08-21T02:48:42Z","title":"ViIK: Flow-based Vision Inverse Kinematics Solver with Fusing Collision\n Checking","summary":" Inverse Kinematics (IK) is to find the robot's configurations that satisfy\nthe target pose of the end effector. In motion planning, diverse configurations\nwere required in case a feasible trajectory was not found. Meanwhile, collision\nchecking (CC), e.g. Oriented bounding box (OBB), Discrete Oriented Polytope\n(DOP), and Quickhull \\cite{quickhull}, needs to be done for each configuration\nprovided by the IK solver to ensure every goal configuration for motion\nplanning is available. This means the classical IK solver and CC algorithm\nshould be executed repeatedly for every configuration. Thus, the preparation\ntime is long when the required number of goal configurations is large, e.g.\nmotion planning in cluster environments. 
Moreover, structured maps, which might\nbe difficult to obtain, were required by classical collision-checking\nalgorithms. To sidestep such two issues, we propose a flow-based vision method\nthat can output diverse available configurations by fusing inverse kinematics\nand collision checking, named Vision Inverse Kinematics solver (ViIK).\nMoreover, ViIK uses RGB images as the perception of environments. ViIK can\noutput 1000 configurations within 40 ms, and the accuracy is about 3\nmillimeters and 1.5 degrees. The higher accuracy can be obtained by being\nrefined by the classical IK solver within a few iterations. The self-collision\nrates can be lower than 2%. The collision-with-env rates can be lower than 10%\nin most scenes. The code is available at: https://github.com/AdamQLMeng/ViIK.\n","authors":["Qinglong Meng","Chongkun Xia","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11293v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15510v1","updated":"2024-08-28T03:45:49Z","published":"2024-08-28T03:45:49Z","title":"Measuring the Reliability of Causal Probing Methods: Tradeoffs,\n Limitations, and the Plight of Nullifying Interventions","summary":" Causal probing is an approach to interpreting foundation models, such as\nlarge language models, by training probes to recognize latent properties of\ninterest from embeddings, intervening on probes to modify this representation,\nand analyzing the resulting changes in the model's behavior. While some recent\nworks have cast doubt on the theoretical basis of several leading causal\nprobing intervention methods, it has been unclear how to systematically and\nempirically evaluate their effectiveness in practice. To address this problem,\nwe propose a general empirical analysis framework to evaluate the reliability\nof causal probing interventions, formally defining and quantifying two key\ncausal probing desiderata: completeness (fully transforming the representation\nof the target property) and selectivity (minimally impacting other properties).\nOur formalism allows us to make the first direct comparisons between different\nfamilies of causal probing methods (e.g., linear vs. nonlinear or\ncounterfactual vs. nullifying interventions). We conduct extensive experiments\nacross several leading methods, finding that (1) there is an inherent tradeoff\nbetween these criteria, and no method is able to consistently satisfy both at\nonce; and (2) across the board, nullifying interventions are always far less\ncomplete than counterfactual interventions, indicating that nullifying methods\nmay not be an effective approach to causal probing.\n","authors":["Marc Canby","Adam Davies","Chirag Rastogi","Julia Hockenmaier"],"pdf_url":"https://arxiv.org/pdf/2408.15510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14502v2","updated":"2024-08-28T03:22:53Z","published":"2024-08-24T02:04:12Z","title":"Physics-Informed Neural Network for Concrete Manufacturing Process\n Optimization","summary":" Concrete manufacturing projects are one of the most common ones for\nconsulting agencies. Because of the highly non-linear dependency of input\nmaterials like ash, water, cement, superplastic, etc; with the resultant\nstrength of concrete, it gets difficult for machine learning models to\nsuccessfully capture this relation and perform cost optimizations. This paper\nhighlights how PINNs (Physics Informed Neural Networks) can be useful in the\ngiven situation. 
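To make the PINN idea in the concrete-strength entry concrete, a minimal sketch of a physics-informed loss (data term plus physics-residual term) for the toy ODE du/dt = -u with u(0) = 1; the paper's model maps mix quantities to strength instead, so this is background only.

```python
# Minimal physics-informed loss sketch for the toy ODE du/dt = -u, u(0) = 1.
# Loss = physics residual at collocation points + initial-condition "data" term.
import torch

net = torch.nn.Sequential(torch.nn.Linear(1, 32), torch.nn.Tanh(), torch.nn.Linear(32, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-2)
t_col = torch.linspace(0, 2, 50).reshape(-1, 1).requires_grad_(True)   # collocation points

for step in range(500):
    u = net(t_col)
    du_dt = torch.autograd.grad(u, t_col, torch.ones_like(u), create_graph=True)[0]
    physics = ((du_dt + u) ** 2).mean()                    # residual of du/dt = -u
    data = (net(torch.zeros(1, 1)) - 1.0).pow(2).mean()    # initial condition u(0) = 1
    loss = physics + data
    opt.zero_grad(); loss.backward(); opt.step()

print(float(net(torch.tensor([[1.0]]))))   # should approach exp(-1) ~ 0.37
```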
This state-of-the-art model shall also get compared with\ntraditional models like Linear Regression, Random Forest, Gradient Boosting,\nand Deep Neural Network. Results of the research highlights how well PINNs\nperformed even with reduced dataset, thus resolving one of the biggest issues\nof limited data availability for ML models. On an average, PINN got the loss\nvalue reduced by 26.3% even with 40% lesser data compared to the Deep Neural\nNetwork. In addition to predicting strength of the concrete given the quantity\nof raw materials, the paper also highlights the use of heuristic optimization\nmethod like Particle Swarm Optimization (PSO) in predicting quantity of raw\nmaterials required to manufacture concrete of given strength with least cost.\n","authors":["Sam Varghese","Rahul Anand","Dr. Gaurav Paliwal"],"pdf_url":"https://arxiv.org/pdf/2408.14502v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15501v1","updated":"2024-08-28T03:10:45Z","published":"2024-08-28T03:10:45Z","title":"MODULI: Unlocking Preference Generalization via Diffusion Models for\n Offline Multi-Objective Reinforcement Learning","summary":" Multi-objective Reinforcement Learning (MORL) seeks to develop policies that\nsimultaneously optimize multiple conflicting objectives, but it requires\nextensive online interactions. Offline MORL provides a promising solution by\ntraining on pre-collected datasets to generalize to any preference upon\ndeployment. However, real-world offline datasets are often conservatively and\nnarrowly distributed, failing to comprehensively cover preferences, leading to\nthe emergence of out-of-distribution (OOD) preference areas. Existing offline\nMORL algorithms exhibit poor generalization to OOD preferences, resulting in\npolicies that do not align with preferences. Leveraging the excellent\nexpressive and generalization capabilities of diffusion models, we propose\nMODULI (Multi-objective Diffusion Planner with Sliding Guidance), which employs\na preference-conditioned diffusion model as a planner to generate trajectories\nthat align with various preferences and derive action for decision-making. To\nachieve accurate generation, MODULI introduces two return normalization methods\nunder diverse preferences for refining guidance. To further enhance\ngeneralization to OOD preferences, MODULI proposes a novel sliding guidance\nmechanism, which involves training an additional slider adapter to capture the\ndirection of preference changes. Incorporating the slider, it transitions from\nin-distribution (ID) preferences to generating OOD preferences, patching, and\nextending the incomplete Pareto front. Extensive experiments on the D4MORL\nbenchmark demonstrate that our algorithm outperforms state-of-the-art Offline\nMORL baselines, exhibiting excellent generalization to OOD preferences.\n","authors":["Yifu Yuan","Zhenrui Zheng","Zibin Dong","Jianye Hao"],"pdf_url":"https://arxiv.org/pdf/2408.15501v1.pdf","comment":"23 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.15498v1","updated":"2024-08-28T03:00:43Z","published":"2024-08-28T03:00:43Z","title":"Deep Learning to Predict Late-Onset Breast Cancer Metastasis: the Single\n Hyperparameter Grid Search (SHGS) Strategy for Meta Tuning Concerning Deep\n Feed-forward Neural Network","summary":" While machine learning has advanced in medicine, its widespread use in\nclinical applications, especially in predicting breast cancer metastasis, is\nstill limited. 
We have been dedicated to constructing a DFNN model to predict\nbreast cancer metastasis n years in advance. However, the challenge lies in\nefficiently identifying optimal hyperparameter values through grid search,\ngiven the constraints of time and resources. Issues such as the infinite\npossibilities for continuous hyperparameters like l1 and l2, as well as the\ntime-consuming and costly process, further complicate the task. To address\nthese challenges, we developed Single Hyperparameter Grid Search (SHGS)\nstrategy, serving as a preselection method before grid search. Our experiments\nwith SHGS applied to DFNN models for breast cancer metastasis prediction focus\non analyzing eight target hyperparameters: epochs, batch size, dropout, L1, L2,\nlearning rate, decay, and momentum. We created three figures, each depicting\nthe experiment results obtained from three LSM-I-10-Plus-year datasets. These\nfigures illustrate the relationship between model performance and the target\nhyperparameter values. For each hyperparameter, we analyzed whether changes in\nthis hyperparameter would affect model performance, examined if there were\nspecific patterns, and explored how to choose values for the particular\nhyperparameter. Our experimental findings reveal that the optimal value of a\nhyperparameter is not only dependent on the dataset but is also significantly\ninfluenced by the settings of other hyperparameters. Additionally, our\nexperiments suggested some reduced range of values for a target hyperparameter,\nwhich may be helpful for low-budget grid search. This approach serves as a\nprior experience and foundation for subsequent use of grid search to enhance\nmodel performance.\n","authors":["Yijun Zhou","Om Arora-Jain","Xia Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.15498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15495v1","updated":"2024-08-28T02:45:41Z","published":"2024-08-28T02:45:41Z","title":"Remove Symmetries to Control Model Expressivity","summary":" When symmetry is present in the loss function, the model is likely to be\ntrapped in a low-capacity state that is sometimes known as a \"collapse.\" Being\ntrapped in these low-capacity states can be a major obstacle to training across\nmany scenarios where deep learning technology is applied. We first prove two\nconcrete mechanisms through which symmetries lead to reduced capacities and\nignored features during training. We then propose a simple and theoretically\njustified algorithm, syre, to remove almost all symmetry-induced low-capacity\nstates in neural networks. The proposed method is shown to improve the training\nof neural networks in scenarios when this type of entrapment is especially a\nconcern. A remarkable merit of the proposed method is that it is model-agnostic\nand does not require any knowledge of the symmetry.\n","authors":["Liu Ziyin","Yizhou Xu","Isaac Chuang"],"pdf_url":"https://arxiv.org/pdf/2408.15495v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2402.16905v2","updated":"2024-08-28T02:37:08Z","published":"2024-02-24T21:36:26Z","title":"Procedural Adherence and Interpretability Through Neuro-Symbolic\n Generative Agents","summary":" The surge in popularity of large language models (LLMs) has opened doors for\nnew approaches to the creation of interactive agents. However, managing and\ninterpreting the temporal behavior of such agents over the course of a\npotentially infinite interaction remain challenging. 
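Returning briefly to the SHGS entry above, a compact sketch of the single-hyperparameter preselection pass: sweep one hyperparameter at a time around a fixed baseline configuration and keep only the promising values before running the full grid search. The objective and candidate values are hypothetical.

```python
# Sketch of a single-hyperparameter preselection pass in the spirit of SHGS.
# `evaluate` is a hypothetical stand-in for training and validating the DFNN.
import itertools

baseline = {"lr": 0.01, "dropout": 0.3, "l2": 1e-4}
candidates = {"lr": [1e-3, 1e-2, 1e-1], "dropout": [0.1, 0.3, 0.5], "l2": [1e-5, 1e-4, 1e-3]}

def evaluate(cfg):                      # hypothetical validation score (higher is better)
    return -((cfg["lr"] - 0.01) ** 2 + (cfg["dropout"] - 0.2) ** 2 + cfg["l2"])

reduced = {}
for name, values in candidates.items():
    scores = {v: evaluate({**baseline, name: v}) for v in values}   # vary one knob at a time
    reduced[name] = sorted(values, key=lambda v: scores[v], reverse=True)[:2]

grid = list(itertools.product(*reduced.values()))   # much smaller grid for the real search
print(reduced, len(grid))
```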
The stateful, long-term\nhorizon reasoning required for coherent agent behavior does not fit well into\nthe LLM paradigm. We propose a combination of formal logic-based program\nsynthesis and LLM content generation to bring guarantees of procedural\nadherence and interpretability to generative agent behavior. To illustrate the\nbenefit of procedural adherence and interpretability, we use Temporal Stream\nLogic (TSL) to generate an automaton that enforces an interpretable, high-level\ntemporal structure on an agent. With the automaton tracking the context of the\ninteraction and making decisions to guide the conversation accordingly, we can\ndrive content generation in a way that allows the LLM to focus on a shorter\ncontext window. We evaluated our approach on different tasks involved in\ncreating an interactive agent specialized for generating\nchoose-your-own-adventure games. We found that over all of the tasks, an\nautomaton-enhanced agent with procedural guarantees achieves at least 96%\nadherence to its temporal constraints, whereas a purely LLM-based agent\ndemonstrates as low as 14.67% adherence.\n","authors":["Raven Rothkopf","Hannah Tongxin Zeng","Mark Santolucito"],"pdf_url":"https://arxiv.org/pdf/2402.16905v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.06452v2","updated":"2024-08-28T01:46:48Z","published":"2024-08-12T19:01:49Z","title":"Wireless Channel Aware Data Augmentation Methods for Deep Learning-Based\n Indoor Localization","summary":" Indoor localization is a challenging problem that - unlike outdoor\nlocalization - lacks a universal and robust solution. Machine Learning (ML),\nparticularly Deep Learning (DL), methods have been investigated as a promising\napproach. Although such methods bring remarkable localization accuracy, they\nheavily depend on the training data collected from the environment. The data\ncollection is usually a laborious and time-consuming task, but Data\nAugmentation (DA) can be used to alleviate this issue. In this paper, different\nfrom previously used DA, we propose methods that utilize the domain knowledge\nabout wireless propagation channels and devices. The methods exploit the\ntypical hardware component drift in the transceivers and/or the statistical\nbehavior of the channel, in combination with the measured Power Delay Profile\n(PDP). We comprehensively evaluate the proposed methods to demonstrate their\neffectiveness. This investigation mainly focuses on the impact of factors such\nas the number of measurements, augmentation proportion, and the environment of\ninterest impact the effectiveness of the different DA methods. We show that in\nthe low-data regime (few actual measurements available), localization accuracy\nincreases up to 50%, matching non-augmented results in the high-data regime. In\naddition, the proposed methods may outperform the measurement-only high-data\nperformance by up to 33% using only 1/4 of the amount of measured data. We also\nexhibit the effect of different training data distribution and quality on the\neffectiveness of DA. Finally, we demonstrate the power of the proposed methods\nwhen employed along with Transfer Learning (TL) to address the data scarcity in\ntarget and/or source environments.\n","authors":["Omer Gokalp Serbetci","Daoud Burghal","Andreas F. 
Molisch"],"pdf_url":"https://arxiv.org/pdf/2408.06452v2.pdf","comment":"13 pages, 14 figures"},{"id":"http://arxiv.org/abs/2210.17230v4","updated":"2024-08-28T01:37:40Z","published":"2022-10-31T11:15:48Z","title":"Lipschitz-regularized gradient flows and generative particle algorithms\n for high-dimensional scarce data","summary":" We build a new class of generative algorithms capable of efficiently learning\nan arbitrary target distribution from possibly scarce, high-dimensional data\nand subsequently generate new samples. These generative algorithms are\nparticle-based and are constructed as gradient flows of Lipschitz-regularized\nKullback-Leibler or other $f$-divergences, where data from a source\ndistribution can be stably transported as particles, towards the vicinity of\nthe target distribution. As a highlighted result in data integration, we\ndemonstrate that the proposed algorithms correctly transport gene expression\ndata points with dimension exceeding 54K, while the sample size is typically\nonly in the hundreds.\n","authors":["Hyemin Gu","Panagiota Birmpa","Yannis Pantazis","Luc Rey-Bellet","Markos A. Katsoulakis"],"pdf_url":"https://arxiv.org/pdf/2210.17230v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15462v1","updated":"2024-08-28T00:56:03Z","published":"2024-08-28T00:56:03Z","title":"CTRQNets & LQNets: Continuous Time Recurrent and Liquid Quantum Neural\n Networks","summary":" Neural networks have continued to gain prevalence in the modern era for their\nability to model complex data through pattern recognition and behavior\nremodeling. However, the static construction of traditional neural networks\ninhibits dynamic intelligence. This makes them inflexible to temporal changes\nin data and unfit to capture complex dependencies. With the advent of quantum\ntechnology, there has been significant progress in creating quantum algorithms.\nIn recent years, researchers have developed quantum neural networks that\nleverage the capabilities of qubits to outperform classical networks. However,\ntheir current formulation exhibits a static construction limiting the system's\ndynamic intelligence. To address these weaknesses, we develop a Liquid Quantum\nNeural Network (LQNet) and a Continuous Time Recurrent Quantum Neural Network\n(CTRQNet). Both models demonstrate a significant improvement in accuracy\ncompared to existing quantum neural networks (QNNs), achieving accuracy\nincreases as high as 40\\% on CIFAR 10 through binary classification. We propose\nLQNets and CTRQNets might shine a light on quantum machine learning's black\nbox.\n","authors":["Alejandro Mayorga","Alexander Yuan","Andrew Yuan","Tyler Wooldridge","Xiaodi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15458v1","updated":"2024-08-28T00:47:55Z","published":"2024-08-28T00:47:55Z","title":"PersonalizedUS: Interpretable Breast Cancer Risk Assessment with Local\n Coverage Uncertainty Quantification","summary":" Correctly assessing the malignancy of breast lesions identified during\nultrasound examinations is crucial for effective clinical decision-making.\nHowever, the current \"golden standard\" relies on manual BI-RADS scoring by\nclinicians, often leading to unnecessary biopsies and a significant mental\nhealth burden on patients and their families. 
In this paper, we introduce\nPersonalizedUS, an interpretable machine learning system that leverages recent\nadvances in conformal prediction to provide precise and personalized risk\nestimates with local coverage guarantees and sensitivity, specificity, and\npredictive values above 0.9 across various threshold levels. In particular, we\nidentify meaningful lesion subgroups where distribution-free, model-agnostic\nconditional coverage holds, with approximately 90% of our prediction sets\ncontaining only the ground truth in most lesion subgroups, thus explicitly\ncharacterizing for which patients the model is most suitably applied. Moreover,\nwe make available a curated tabular dataset of 1936 biopsied breast lesions\nfrom a recent observational multicenter study and benchmark the performance of\nseveral state-of-the-art learning algorithms. We also report a successful case\nstudy of the deployed system in the same multicenter context. Concrete clinical\nbenefits include up to a 65% reduction in requested biopsies among BI-RADS 4a\nand 4b lesions, with minimal to no missed cancer cases.\n","authors":["Alek Fröhlich","Thiago Ramos","Gustavo Cabello","Isabela Buzatto","Rafael Izbicki","Daniel Tiezzi"],"pdf_url":"https://arxiv.org/pdf/2408.15458v1.pdf","comment":"9 pages, 5 figure, 2 tables"},{"id":"http://arxiv.org/abs/2303.11789v8","updated":"2024-08-28T00:28:46Z","published":"2023-03-20T08:37:08Z","title":"Decentralized Online Learning for Random Inverse Problems Over Graphs","summary":" We propose a decentralized online learning algorithm for distributed random\ninverse problems over network graphs with online measurements, and unifies the\ndistributed parameter estimation in Hilbert spaces and the least mean square\nproblem in reproducing kernel Hilbert spaces (RKHS-LMS). We transform the\nconvergence of the algorithm into the asymptotic stability of a class of\ninhomogeneous random difference equations in Hilbert spaces with\n$L_{2}$-bounded martingale difference terms and develop the $L_2$-asymptotic\nstability theory in Hilbert spaces. We show that if the network graph is\nconnected and the sequence of forward operators satisfies the\ninfinite-dimensional spatio-temporal persistence of excitation condition, then\nthe estimates of all nodes are mean square and almost surely strongly\nconsistent. Moreover, we propose a decentralized online learning algorithm in\nRKHS based on non-stationary online data streams, and prove that the algorithm\nis mean square and almost surely strongly consistent if the operators induced\nby the random input data satisfy the infinite-dimensional spatio-temporal\npersistence of excitation condition.\n","authors":["Tao Li","Xiwei Zhang","Yan Chen"],"pdf_url":"https://arxiv.org/pdf/2303.11789v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13479v2","updated":"2024-08-28T00:19:50Z","published":"2024-08-24T05:38:31Z","title":"Quantum-machine-assisted Drug Discovery: Survey and Perspective","summary":" Drug discovery and development is a highly complex and costly endeavor,\ntypically requiring over a decade and substantial financial investment to bring\na new drug to market. Traditional computer-aided drug design (CADD) has made\nsignificant progress in accelerating this process, but the development of\nquantum computing offers potential due to its unique capabilities. 
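As background for the conformal prediction used in the PersonalizedUS entry above, a bare-bones split-conformal sketch that calibrates a score threshold for roughly 90% marginal coverage; the paper targets the stronger, subgroup-local version of this guarantee, and all data here is synthetic.

```python
# Bare-bones split-conformal sketch: calibrate a nonconformity threshold so that
# prediction sets cover the true label ~90% of the time (marginal coverage only).
import numpy as np

rng = np.random.default_rng(3)
n_cal = 500
p_malignant = rng.uniform(0, 1, n_cal)                       # stand-in model probabilities
y = (rng.uniform(0, 1, n_cal) < p_malignant).astype(int)     # synthetic labels

scores = np.where(y == 1, 1 - p_malignant, p_malignant)      # score = 1 - prob of true class
alpha = 0.1
q = np.quantile(scores, np.ceil((n_cal + 1) * (1 - alpha)) / n_cal)   # conformal quantile

def prediction_set(p):                        # include every class whose score is below q
    return [c for c, s in [(1, 1 - p), (0, p)] if s <= q]

print(prediction_set(0.85), prediction_set(0.55))
```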
This paper\ndiscusses the integration of quantum computing into drug discovery and\ndevelopment, focusing on how quantum technologies might accelerate and enhance\nvarious stages of the drug development cycle. Specifically, we explore the\napplication of quantum computing in addressing challenges related to drug\ndiscovery, such as molecular simulation and the prediction of drug-target\ninteractions, as well as the optimization of clinical trial outcomes. By\nleveraging the inherent capabilities of quantum computing, we might be able to\nreduce the time and cost associated with bringing new drugs to market,\nultimately benefiting public health.\n","authors":["Yidong Zhou","Jintai Chen","Jinglei Cheng","Gopal Karemore","Marinka Zitnik","Frederic T. Chong","Junyu Liu","Tianfan Fu","Zhiding Liang"],"pdf_url":"https://arxiv.org/pdf/2408.13479v2.pdf","comment":"27 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.15451v1","updated":"2024-08-28T00:14:09Z","published":"2024-08-28T00:14:09Z","title":"Certified Causal Defense with Generalizable Robustness","summary":" While machine learning models have proven effective across various scenarios,\nit is widely acknowledged that many models are vulnerable to adversarial\nattacks. Recently, there have emerged numerous efforts in adversarial defense.\nAmong them, certified defense is well known for its theoretical guarantees\nagainst arbitrary adversarial perturbations on input within a certain range\n(e.g., $l_2$ ball). However, most existing works in this line struggle to\ngeneralize their certified robustness in other data domains with distribution\nshifts. This issue is rooted in the difficulty of eliminating the negative\nimpact of spurious correlations on robustness in different domains. To address\nthis problem, in this work, we propose a novel certified defense framework\nGLEAN, which incorporates a causal perspective into the generalization problem\nin certified defense. More specifically, our framework integrates a certifiable\ncausal factor learning component to disentangle the causal relations and\nspurious correlations between input and label, and thereby exclude the negative\neffect of spurious correlations on defense. On top of that, we design a\ncausally certified defense strategy to handle adversarial attacks on latent\ncausal factors. In this way, our framework is not only robust against malicious\nnoises on data in the training distribution but also can generalize its\nrobustness across domains with distribution shifts. Extensive experiments on\nbenchmark datasets validate the superiority of our framework in certified\nrobustness generalization in different data domains. Code is available in the\nsupplementary materials.\n","authors":["Yiran Qiao","Yu Yin","Chen Chen","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2408.15451v1.pdf","comment":"Submitted to AAAI"},{"id":"http://arxiv.org/abs/2408.15450v1","updated":"2024-08-28T00:07:51Z","published":"2024-08-28T00:07:51Z","title":"Avoiding Generative Model Writer's Block With Embedding Nudging","summary":" Generative image models, since their introduction, have become a global phenomenon.\nFrom new arts becoming possible to new vectors of abuse, many new capabilities\nhave become available. One of the challenging issues with generative models is\ncontrolling the generation process, especially to prevent specific\ngeneration classes or instances. 
There are several reasons why one may want to control\nthe output of generative models, ranging from privacy and safety concerns to\napplication limitations or user preferences.\n To address memorization and privacy challenges, there has been considerable\nresearch dedicated to filtering prompts or filtering the outputs of these\nmodels. What all these solutions have in common is that at the end of the day\nthey stop the model from producing anything, hence limiting the usability of\nthe model. In this paper, we propose a method for addressing this usability\nissue by making it possible to steer away from unwanted concepts (when detected\nin the model's output) and still generate outputs. In particular, we focus on\nlatent diffusion image generative models and how one can prevent them from\ngenerating particular images while generating similar images with limited\noverhead.\n We focus on mitigating issues like image memorization, demonstrating our\ntechnique's effectiveness through qualitative and quantitative evaluations. Our\nmethod successfully prevents the generation of memorized training images while\nmaintaining comparable image quality and relevance to the unmodified model.\n","authors":["Ali Zand","Milad Nasr"],"pdf_url":"https://arxiv.org/pdf/2408.15450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16170v1","updated":"2024-08-28T23:25:25Z","published":"2024-08-28T23:25:25Z","title":"CardBench: A Benchmark for Learned Cardinality Estimation in Relational\n Databases","summary":" Cardinality estimation is crucial for enabling high query performance in\nrelational databases. Recently, learned cardinality estimation models have been\nproposed to improve accuracy, but there is no systematic benchmark or set of\ndatasets that allows researchers to evaluate the progress made by new learned\napproaches and even systematically develop new learned approaches. In this\npaper, we are releasing a benchmark, containing thousands of queries over 20\ndistinct real-world databases for learned cardinality estimation. In contrast\nto other initial benchmarks, our benchmark is much more diverse and can be used\nfor training and testing learned models systematically. Using this benchmark,\nwe explored whether learned cardinality estimation can be transferred to an\nunseen dataset in a zero-shot manner. We trained GNN-based and\ntransformer-based models to study the problem in three setups: 1-)\ninstance-based, 2-) zero-shot, and 3-) fine-tuned. Our results show that while\nwe get promising results for zero-shot cardinality estimation on simple\nsingle-table queries, the accuracy drops as soon as we add joins. However, we show\nthat with fine-tuning, we can still utilize pre-trained models for cardinality\nestimation, significantly reducing training overheads compared to\ninstance-specific models. 
We are open sourcing our scripts to collect statistics,\ngenerate queries and training datasets to foster more extensive research, also\nfrom the ML community on the important problem of cardinality estimation and in\nparticular improve on recent directions such as pre-trained cardinality\nestimation.\n","authors":["Yannis Chronis","Yawen Wang","Yu Gan","Sami Abu-El-Haija","Chelsea Lin","Carsten Binnig","Fatma Özcan"],"pdf_url":"https://arxiv.org/pdf/2408.16170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16169v1","updated":"2024-08-28T23:20:17Z","published":"2024-08-28T23:20:17Z","title":"Simulating realistic short tandem repeat capillary electrophoretic\n signal using a generative adversarial network","summary":" DNA profiles are made up from multiple series of electrophoretic signal\nmeasuring fluorescence over time. Typically, human DNA analysts 'read' DNA\nprofiles using their experience to distinguish instrument noise, artefactual\nsignal, and signal corresponding to DNA fragments of interest. Recent work has\ndeveloped an artificial neural network, ANN, to carry out the task of\nclassifying fluorescence types into categories in DNA profile electrophoretic\nsignal. But the creation of the necessarily large amount of labelled training\ndata for the ANN is time consuming and expensive, and a limiting factor in the\nability to robustly train the ANN. If realistic, prelabelled, training data\ncould be simulated then this would remove the barrier to training an ANN with\nhigh efficacy. Here we develop a generative adversarial network, GAN, modified\nfrom the pix2pix GAN to achieve this task. With 1078 DNA profiles we train the\nGAN and achieve the ability to simulate DNA profile information, and then use\nthe generator from the GAN as a 'realism filter' that applies the noise and\nartefact elements exhibited in typical electrophoretic signal.\n","authors":["Duncan Taylor","Melissa Humphries"],"pdf_url":"https://arxiv.org/pdf/2408.16169v1.pdf","comment":"29 pages, 9 Figures"},{"id":"http://arxiv.org/abs/2408.16168v1","updated":"2024-08-28T23:20:03Z","published":"2024-08-28T23:20:03Z","title":"LeMON: Learning to Learn Multi-Operator Networks","summary":" Single-operator learning involves training a deep neural network to learn a\nspecific operator, whereas recent work in multi-operator learning uses an\noperator embedding structure to train a single neural network on data from\nmultiple operators. Thus, multi-operator learning is capable of predicting a\nrange of operators within one model. In this work, we propose pretraining and\nfine-tuning strategies for solving PDEs using multi-operator learning. One key\naspect is that by increasing the number of families of operators used in\npretraining, a PDE foundation model can be fine-tuned to downstream tasks\ninvolving new PDEs with a limited number of samples, thus outperforming single\noperator neural networks. Specifically, a multi-operator learning model\npre-trained with data from diverse PDE families can predict unseen operators\nafter fine-tuning with only a limited number of operators from the new family,\nenabling them to serve as a data-free PDE solver. We also show that the\nproposed training and fine-tuning method is able to predict new operators in\nzero-shot prediction without samples. Additionally, we introduce a PDE-agnostic\nmeta-learning algorithm to improve the adaptability of the model to various\nPDEs by providing a better parameter initialization process. 
To address the\nneeds of applications with limited computing resources, we explore low-rank\nadaptation methods that reduce computational costs while enhancing solver\naccuracy. Lastly, by examining the scaling law with respect to the number of\noperator families, we establish and highlight its potential for broad\nadaptation in PDE-solving tasks.\n","authors":["Jingmin Sun","Zecheng Zhang","Hayden Schaeffer"],"pdf_url":"https://arxiv.org/pdf/2408.16168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16167v1","updated":"2024-08-28T23:15:46Z","published":"2024-08-28T23:15:46Z","title":"Free Lunch in the Forest: Functionally-Identical Pruning of Boosted Tree\n Ensembles","summary":" Tree ensembles, including boosting methods, are highly effective and widely\nused for tabular data. However, large ensembles lack interpretability and\nrequire longer inference times. We introduce a method to prune a tree ensemble\ninto a reduced version that is \"functionally identical\" to the original model.\nIn other words, our method guarantees that the prediction function stays\nunchanged for any possible input. As a consequence, this pruning algorithm is\nlossless for any aggregated metric. We formalize the problem of functionally\nidentical pruning on ensembles, introduce an exact optimization model, and\nprovide a fast yet highly effective method to prune large ensembles. Our\nalgorithm iteratively prunes considering a finite set of points, which is\nincrementally augmented using an adversarial model. In multiple computational\nexperiments, we show that our approach is a \"free lunch\", significantly\nreducing the ensemble size without altering the model's behavior. Thus, we can\npreserve state-of-the-art performance at a fraction of the original model's\nsize.\n","authors":["Youssouf Emine","Alexandre Forel","Idriss Malek","Thibaut Vidal"],"pdf_url":"https://arxiv.org/pdf/2408.16167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02325v2","updated":"2024-08-28T22:54:15Z","published":"2024-04-02T21:51:39Z","title":"Heat Death of Generative Models in Closed-Loop Learning","summary":" Improvement and adoption of generative machine learning models is rapidly\naccelerating, as exemplified by the popularity of LLMs (Large Language Models)\nfor text, and diffusion models for image generation. As generative models\nbecome widespread, data they generate is incorporated into shared content\nthrough the public web. This opens the question of what happens when data\ngenerated by a model is fed back to the model in subsequent training campaigns.\nThis is a question about the stability of the training process, whether the\ndistribution of publicly accessible content, which we refer to as \"knowledge\",\nremains stable or collapses.\n Small scale empirical experiments reported in the literature show that this\nclosed-loop training process is prone to degenerating. Models may start\nproducing gibberish data, or sample from only a small subset of the desired\ndata distribution (a phenomenon referred to as mode collapse). So far there has\nbeen only limited theoretical understanding of this process, in part due to the\ncomplexity of the deep networks underlying these generative models.\n The aim of this paper is to provide insights into this process (that we refer\nto as \"generative closed-loop learning\") by studying the learning dynamics of\ngenerative models that are fed back their own produced content in addition to\ntheir original training dataset. 
The sampling of many of these models can be\ncontrolled via a \"temperature\" parameter. Using dynamical systems tools, we\nshow that, unless a sufficient amount of external data is introduced at each\niteration, any non-trivial temperature leads the model to asymptotically\ndegenerate. In fact, either the generative distribution collapses to a small\nset of outputs or becomes uniform over a large set of outputs.\n","authors":["Matteo Marchi","Stefano Soatto","Pratik Chaudhari","Paulo Tabuada"],"pdf_url":"https://arxiv.org/pdf/2404.02325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16160v1","updated":"2024-08-28T22:45:15Z","published":"2024-08-28T22:45:15Z","title":"CLPNets: Coupled Lie-Poisson Neural Networks for Multi-Part Hamiltonian\n Systems with Symmetries","summary":" To accurately compute data-based prediction of Hamiltonian systems,\nespecially the long-term evolution of such systems, it is essential to utilize\nmethods that preserve the structure of the equations over time. We consider a\ncase that is particularly challenging for data-based methods: systems with\ninteracting parts that do not reduce to pure momentum evolution. Such systems\nare essential in scientific computations. For example, any discretization of a\ncontinuum elastic rod can be viewed as interacting elements that can move and\nrotate in space, with each discrete element moving on the group of rotations\nand translations $SE(3)$.\n We develop a novel method of data-based computation and complete phase space\nlearning of such systems. We follow the original framework of \\emph{SympNets}\n(Jin et al, 2020) building the neural network from canonical phase space\nmappings, and transformations that preserve the Lie-Poisson structure\n(\\emph{LPNets}) as in (Eldred et al, 2024). We derive a novel system of\nmappings that are built into neural networks for coupled systems. We call such\nnetworks Coupled Lie-Poisson Neural Networks, or \\emph{CLPNets}. We consider\nincreasingly complex examples for the applications of CLPNets: rotation of two\nrigid bodies about a common axis, the free rotation of two rigid bodies, and\nfinally the evolution of two connected and interacting $SE(3)$ components. Our\nmethod preserves all Casimir invariants of each system to machine precision,\nirrespective of the quality of the training data, and preserves energy to high\naccuracy. Our method also shows good resistance to the curse of dimensionality,\nrequiring only a few thousand data points for all cases studied, with the\neffective dimension varying from three to eighteen. Additionally, the method is\nhighly economical in memory requirements, requiring only about 200 parameters\nfor the most complex case considered.\n","authors":["Christopher Eldred","François Gay-Balmaz","Vakhtang Putkaradze"],"pdf_url":"https://arxiv.org/pdf/2408.16160v1.pdf","comment":"52 pages, 9 figures"},{"id":"http://arxiv.org/abs/2305.10994v2","updated":"2024-08-28T22:22:29Z","published":"2023-05-18T14:14:42Z","title":"Graphical vs. Deep Generative Models: Measuring the Impact of\n Differentially Private Mechanisms and Budgets on Utility","summary":" Generative models trained with Differential Privacy (DP) can produce\nsynthetic data while reducing privacy risks. However, navigating their\nprivacy-utility tradeoffs makes finding the best models for specific\nsettings/tasks challenging. 
This paper bridges this gap by profiling how DP\ngenerative models for tabular data distribute privacy budgets across rows and\ncolumns, which is one of the primary sources of utility degradation. We compare\ngraphical and deep generative models, focusing on the key factors contributing\nto how privacy budgets are spent, i.e., underlying modeling techniques, DP\nmechanisms, and data dimensionality.\n Through our measurement study, we shed light on the characteristics that make\ndifferent models suitable for various settings and tasks. For instance, we find\nthat graphical models distribute privacy budgets horizontally and thus cannot\nhandle relatively wide datasets for a fixed training time; also, the\nperformance on the task they were optimized for monotonically increases with\nmore data but could also overfit. Deep generative models spend their budgets\nper iteration, so their behavior is less predictable with varying dataset\ndimensions, but are more flexible as they could perform better if trained on\nmore features. Moreover, low levels of privacy ($\\epsilon\\geq100$) could help\nsome models generalize, achieving better results than without applying DP. We\nbelieve our work will aid the deployment of DP synthetic data techniques by\nnavigating through the best candidate models vis-a-vis the dataset features,\ndesired privacy levels, and downstream tasks.\n","authors":["Georgi Ganev","Kai Xu","Emiliano De Cristofaro"],"pdf_url":"https://arxiv.org/pdf/2305.10994v2.pdf","comment":"A shorter version of this paper appears in the Proceedings of the\n 31st ACM Conference on Computer and Communications Security (ACM CCS 2024).\n This is the full version"},{"id":"http://arxiv.org/abs/2408.16154v1","updated":"2024-08-28T22:14:44Z","published":"2024-08-28T22:14:44Z","title":"Does Data-Efficient Generalization Exacerbate Bias in Foundation Models?","summary":" Foundation models have emerged as robust models with label efficiency in\ndiverse domains. In medical imaging, these models contribute to the advancement\nof medical diagnoses due to the difficulty in obtaining labeled data. However,\nit is unclear whether using a large amount of unlabeled data, biased by the\npresence of sensitive attributes during pre-training, influences the fairness\nof the model. This research examines the bias in the Foundation model\n(RetFound) when it is applied to fine-tune the Brazilian Multilabel\nOphthalmological Dataset (BRSET), which has a different population than the\npre-training dataset. The model evaluation, in comparison with supervised\nlearning, shows that the Foundation Model has the potential to reduce the gap\nbetween the maximum AUC and minimum AUC evaluations across gender and age\ngroups. However, in a data-efficient generalization, the model increases the\nbias when the data amount decreases. 
These findings suggest that when deploying\na Foundation Model in real-life scenarios with limited data, the possibility of\nfairness issues should be considered.\n","authors":["Dilermando Queiroz","Anderson Carlos","Maíra Fatoretto","André Anjos","Lilian Berton","Luis Filipe Nakayama"],"pdf_url":"https://arxiv.org/pdf/2408.16154v1.pdf","comment":"Preprint of paper to be presented at Fairness and Ethics Towards\n Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during\n ECCV 2024"},{"id":"http://arxiv.org/abs/2408.16147v1","updated":"2024-08-28T21:28:45Z","published":"2024-08-28T21:28:45Z","title":"Improving the Prediction of Individual Engagement in Recommendations\n Using Cognitive Models","summary":" For public health programs with limited resources, the ability to predict how\nbehaviors change over time and in response to interventions is crucial for\ndeciding when and to whom interventions should be allocated. Using data from a\nreal-world maternal health program, we demonstrate how a cognitive model based\non Instance-Based Learning (IBL) Theory can augment existing purely\ncomputational approaches. Our findings show that, compared to general\ntime-series forecasters (e.g., LSTMs), IBL models, which reflect human\ndecision-making processes, better predict the dynamics of individuals' states.\nAdditionally, IBL provides estimates of the volatility in individuals' states\nand their sensitivity to interventions, which can improve the efficiency of\ntraining of other time series models.\n","authors":["Roderick Seow","Yunfan Zhao","Duncan Wood","Milind Tambe","Cleotilde Gonzalez"],"pdf_url":"https://arxiv.org/pdf/2408.16147v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.14485v6","updated":"2024-08-28T17:08:55Z","published":"2024-06-20T16:48:14Z","title":"Proceedings of The second international workshop on eXplainable AI for\n the Arts (XAIxArts)","summary":" This second international workshop on explainable AI for the Arts (XAIxArts)\nbrought together a community of researchers in HCI, Interaction Design, AI,\nexplainable AI (XAI), and digital arts to explore the role of XAI for the Arts.\nWorkshop held at the 16th ACM Conference on Creativity and Cognition (C&C\n2024), Chicago, USA.\n","authors":["Nick Bryan-Kinns","Corey Ford","Shuoyang Zheng","Helen Kennedy","Alan Chamberlain","Makayla Lewis","Drew Hemment","Zijin Li","Qiong Wu","Lanxi Xiao","Gus Xia","Jeba Rezwana","Michael Clemens","Gabriel Vigliensoni"],"pdf_url":"https://arxiv.org/pdf/2406.14485v6.pdf","comment":"Proceedings of The second international workshop on eXplainable AI\n for the Arts (XAIxArts)"},{"id":"http://arxiv.org/abs/2407.19976v2","updated":"2024-08-28T13:01:06Z","published":"2024-07-29T13:09:26Z","title":"MambaGesture: Enhancing Co-Speech Gesture Generation with Mamba and\n Disentangled Multi-Modality Fusion","summary":" Co-speech gesture generation is crucial for producing synchronized and\nrealistic human gestures that accompany speech, enhancing the animation of\nlifelike avatars in virtual environments. While diffusion models have shown\nimpressive capabilities, current approaches often overlook a wide range of\nmodalities and their interactions, resulting in less dynamic and contextually\nvaried gestures. To address these challenges, we present MambaGesture, a novel\nframework integrating a Mamba-based attention block, MambaAttn, with a\nmulti-modality feature fusion module, SEAD. 
The MambaAttn block combines the\nsequential data processing strengths of the Mamba model with the contextual\nrichness of attention mechanisms, enhancing the temporal coherence of generated\ngestures. SEAD adeptly fuses audio, text, style, and emotion modalities,\nemploying disentanglement to deepen the fusion process and yield gestures with\ngreater realism and diversity. Our approach, rigorously evaluated on the\nmulti-modal BEAT dataset, demonstrates significant improvements in Fr\\'echet\nGesture Distance (FGD), diversity scores, and beat alignment, achieving\nstate-of-the-art performance in co-speech gesture generation. Project website:\n$\\href{https://fcchit.github.io/mambagesture/}{\\textit{https://fcchit.github.io/mambagesture/}}$.\n","authors":["Chencan Fu","Yabiao Wang","Jiangning Zhang","Zhengkai Jiang","Xiaofeng Mao","Jiafu Wu","Weijian Cao","Chengjie Wang","Yanhao Ge","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19976v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.11982v2","updated":"2024-08-28T11:01:16Z","published":"2024-08-21T20:32:45Z","title":"AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and\n Results","summary":" Video quality assessment (VQA) is a crucial task in the development of video\ncompression standards, as it directly impacts the viewer experience. This paper\npresents the results of the Compressed Video Quality Assessment challenge, held\nin conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV\n2024. The challenge aimed to evaluate the performance of VQA methods on a\ndiverse dataset of 459 videos, encoded with 14 codecs of various compression\nstandards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a\ncomprehensive collection of compression artifacts. To measure the methods\nperformance, we employed traditional correlation coefficients between their\npredictions and subjective scores, which were collected via large-scale\ncrowdsourced pairwise human comparisons. For training purposes, participants\nwere provided with the Compressed Video Quality Assessment Dataset (CVQAD), a\npreviously developed dataset of 1022 videos. Up to 30 participating teams\nregistered for the challenge, while we report the results of 6 teams, which\nsubmitted valid final solutions and code for reproducing the results. 
Moreover,\nwe calculated and present the performance of state-of-the-art VQA methods on\nthe developed dataset, providing a comprehensive benchmark for future research.\nThe dataset, results, and online leaderboard are publicly available at\nhttps://challenges.videoprocessing.ai/challenges/compressedvideo-quality-assessment.html.\n","authors":["Maksim Smirnov","Aleksandr Gushchin","Anastasia Antsiferova","Dmitry Vatolin","Radu Timofte","Ziheng Jia","Zicheng Zhang","Wei Sun","Jiaying Qian","Yuqin Cao","Yinan Sun","Yuxin Zhu","Xiongkuo Min","Guangtao Zhai","Kanjar De","Qing Luo","Ao-Xiang Zhang","Peng Zhang","Haibo Lei","Linyan Jiang","Yaqing Li","Wenhui Meng","Xiaoheng Tan","Haiqiang Wang","Xiaozhong Xu","Shan Liu","Zhenzhong Chen","Zhengxue Cheng","Jiahao Xiao","Jun Xu","Chenlong He","Qi Zheng","Ruoxi Zhu","Min Li","Yibo Fan","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2408.11982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15542v1","updated":"2024-08-28T05:34:14Z","published":"2024-08-28T05:34:14Z","title":"Kangaroo: A Powerful Video-Language Model Supporting Long-context Video\n Input","summary":" Rapid advancements have been made in extending Large Language Models (LLMs)\nto Large Multi-modal Models (LMMs). However, extending input modality of LLMs\nto video data remains a challenging endeavor, especially for long videos. Due\nto insufficient access to large-scale high-quality video data and the excessive\ncompression of visual features, current methods exhibit limitations in\neffectively processing long videos. In this paper, we introduce Kangaroo, a\npowerful Video LMM aimed at addressing these challenges. Confronted with issue\nof inadequate training data, we develop a data curation system to build a\nlarge-scale dataset with high-quality annotations for vision-language\npre-training and instruction tuning. In addition, we design a curriculum\ntraining pipeline with gradually increasing resolution and number of input\nframes to accommodate long videos. Evaluation results demonstrate that, with 8B\nparameters, Kangaroo achieves state-of-the-art performance across a variety of\nvideo understanding benchmarks while exhibiting competitive results on others.\nParticularly, on benchmarks specialized for long videos, Kangaroo excels some\nlarger models with over 10B parameters and proprietary models.\n","authors":["Jiajun Liu","Yibing Wang","Hanghang Ma","Xiaoping Wu","Xiaoqi Ma","Xiaoming Wei","Jianbin Jiao","Enhua Wu","Jie Hu"],"pdf_url":"https://arxiv.org/pdf/2408.15542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15521v1","updated":"2024-08-28T04:14:01Z","published":"2024-08-28T04:14:01Z","title":"A Simple Baseline with Single-encoder for Referring Image Segmentation","summary":" Referring image segmentation (RIS) requires dense vision-language\ninteractions between visual pixels and textual words to segment objects based\non a given description. However, commonly adapted dual-encoders in RIS, e.g.,\nSwin transformer and BERT (uni-modal encoders) or CLIP (a multi-modal\ndual-encoder), lack dense multi-modal interactions during pre-training, leading\nto a gap with a pixel-level RIS task. To bridge this gap, existing RIS methods\noften rely on multi-modal fusion modules that interact two encoders, but this\napproach leads to high computational costs. In this paper, we present a novel\nRIS method with a single-encoder, i.e., BEiT-3, maximizing the potential of\nshared self-attention across all framework components. 
This enables seamless\ninteractions of two modalities from input to final prediction, producing\ngranularly aligned multi-modal features. Furthermore, we propose lightweight\nyet effective decoder modules, a Shared FPN and a Shared Mask Decoder, which\ncontribute to the high efficiency of our model. Our simple baseline with a\nsingle encoder achieves outstanding performances on the RIS benchmark datasets\nwhile maintaining computational efficiency, compared to the most recent SoTA\nmethods based on dual-encoders.\n","authors":["Seonghoon Yu","Ilchae Jung","Byeongju Han","Taeoh Kim","Yunho Kim","Dongyoon Wee","Jeany Son"],"pdf_url":"https://arxiv.org/pdf/2408.15521v1.pdf","comment":"ArXiv pre-print"},{"id":"http://arxiv.org/abs/2408.15461v1","updated":"2024-08-28T00:54:51Z","published":"2024-08-28T00:54:51Z","title":"Hand1000: Generating Realistic Hands from Text with Only 1,000 Images","summary":" Text-to-image generation models have achieved remarkable advancements in\nrecent years, aiming to produce realistic images from textual descriptions.\nHowever, these models often struggle with generating anatomically accurate\nrepresentations of human hands. The resulting images frequently exhibit issues\nsuch as incorrect numbers of fingers, unnatural twisting or interlacing of\nfingers, or blurred and indistinct hands. These issues stem from the inherent\ncomplexity of hand structures and the difficulty in aligning textual\ndescriptions with precise visual depictions of hands. To address these\nchallenges, we propose a novel approach named Hand1000 that enables the\ngeneration of realistic hand images with target gesture using only 1,000\ntraining samples. The training of Hand1000 is divided into three stages with\nthe first stage aiming to enhance the model's understanding of hand anatomy by\nusing a pre-trained hand gesture recognition model to extract gesture\nrepresentation. The second stage further optimizes text embedding by\nincorporating the extracted hand gesture representation, to improve alignment\nbetween the textual descriptions and the generated hand images. The third stage\nutilizes the optimized embedding to fine-tune the Stable Diffusion model to\ngenerate realistic hand images. In addition, we construct the first publicly\navailable dataset specifically designed for text-to-hand image generation.\nBased on the existing hand gesture recognition dataset, we adopt advanced image\ncaptioning models and LLaMA3 to generate high-quality textual descriptions\nenriched with detailed gesture information. Extensive experiments demonstrate\nthat Hand1000 significantly outperforms existing models in producing\nanatomically correct hand images while faithfully representing other details in\nthe text, such as faces, clothing, and colors.\n","authors":["Haozhuo Zhang","Bin Zhu","Yu Cao","Yanbin Hao"],"pdf_url":"https://arxiv.org/pdf/2408.15461v1.pdf","comment":"Project page https://haozhuo-zhang.github.io/Hand1000-project-page/"},{"id":"http://arxiv.org/abs/2408.16132v1","updated":"2024-08-28T20:48:04Z","published":"2024-08-28T20:48:04Z","title":"SVDD 2024: The Inaugural Singing Voice Deepfake Detection Challenge","summary":" With the advancements in singing voice generation and the growing presence of\nAI singers on media platforms, the inaugural Singing Voice Deepfake Detection\n(SVDD) Challenge aims to advance research in identifying AI-generated singing\nvoices from authentic singers. 
This challenge features two tracks: a controlled\nsetting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). The\nCtrSVDD track utilizes publicly available singing vocal data to generate\ndeepfakes using state-of-the-art singing voice synthesis and conversion\nsystems. Meanwhile, the WildSVDD track expands upon the existing SingFake\ndataset, which includes data sourced from popular user-generated content\nwebsites. For the CtrSVDD track, we received submissions from 47 teams, with 37\nsurpassing our baselines and the top team achieving a 1.65% equal error rate.\nFor the WildSVDD track, we benchmarked the baselines. This paper reviews these\nresults, discusses key findings, and outlines future directions for SVDD\nresearch.\n","authors":["You Zhang","Yongyi Zang","Jiatong Shi","Ryuichi Yamamoto","Tomoki Toda","Zhiyao Duan"],"pdf_url":"https://arxiv.org/pdf/2408.16132v1.pdf","comment":null}]},"2024-08-29T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.16768v1","updated":"2024-08-29T17:59:45Z","published":"2024-08-29T17:59:45Z","title":"SAM2Point: Segment Any 3D as Videos in Zero-shot and Promptable Manners","summary":" We introduce SAM2Point, a preliminary exploration adapting Segment Anything\nModel 2 (SAM 2) for zero-shot and promptable 3D segmentation. SAM2Point\ninterprets any 3D data as a series of multi-directional videos, and leverages\nSAM 2 for 3D-space segmentation, without further training or 2D-3D projection.\nOur framework supports various prompt types, including 3D points, boxes, and\nmasks, and can generalize across diverse scenarios, such as 3D objects, indoor\nscenes, outdoor environments, and raw sparse LiDAR. Demonstrations on multiple\n3D datasets, e.g., Objaverse, S3DIS, ScanNet, Semantic3D, and KITTI, highlight\nthe robust generalization capabilities of SAM2Point. To our best knowledge, we\npresent the most faithful implementation of SAM in 3D, which may serve as a\nstarting point for future research in promptable 3D segmentation. Online Demo:\nhttps://huggingface.co/spaces/ZiyuG/SAM2Point . Code:\nhttps://github.com/ZiyuGuo99/SAM2Point .\n","authors":["Ziyu Guo","Renrui Zhang","Xiangyang Zhu","Chengzhuo Tong","Peng Gao","Chunyuan Li","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2408.16768v1.pdf","comment":"Work in progress. Online Demo:\n https://huggingface.co/spaces/ZiyuG/SAM2Point . Code:\n https://github.com/ZiyuGuo99/SAM2Point"},{"id":"http://arxiv.org/abs/2408.16756v1","updated":"2024-08-29T17:54:14Z","published":"2024-08-29T17:54:14Z","title":"How Far Can Cantonese NLP Go? Benchmarking Cantonese Capabilities of\n Large Language Models","summary":" The rapid evolution of large language models (LLMs) has transformed the\ncompetitive landscape in natural language processing (NLP), particularly for\nEnglish and other data-rich languages. However, underrepresented languages like\nCantonese, spoken by over 85 million people, face significant development gaps,\nwhich is particularly concerning given the economic significance of the\nGuangdong-Hong Kong-Macau Greater Bay Area, and in substantial\nCantonese-speaking populations in places like Singapore and North America.\nDespite its wide use, Cantonese has scant representation in NLP research,\nespecially compared to other languages from similarly developed regions. 
To\nbridge these gaps, we outline current Cantonese NLP methods and introduce new\nbenchmarks designed to evaluate LLM performance in factual generation,\nmathematical logic, complex reasoning, and general knowledge in Cantonese,\nwhich aim to advance open-source Cantonese LLM technology. We also propose\nfuture research directions and recommended models to enhance Cantonese LLM\ndevelopment.\n","authors":["Jiyue Jiang","Liheng Chen","Pengan Chen","Sheng Wang","Qinghang Bao","Lingpeng Kong","Yu Li","Chuan Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16753v1","updated":"2024-08-29T17:49:18Z","published":"2024-08-29T17:49:18Z","title":"Reinforcement Learning without Human Feedback for Last Mile Fine-Tuning\n of Large Language Models","summary":" Reinforcement learning is used to align language models with human preference\nsignals after first pre-training the model to predict the next token of text\nwithin a large corpus using likelihood maximization. Before being deployed in a\nspecific domain, models are often further fine-tuned on task specific data.\nSince human preferences are often unavailable for the last step, it is\nperformed using likelihood maximization as that is the typical default method.\nHowever, reinforcement learning has other advantages besides facilitating\nalignment to a human derived reward function. For one, whereas likelihood\nmaximization is a form of imitation learning in which the model is trained on\nwhat to do under ideal conditions, reinforcement learning is not limited to\ndemonstrating actions just for optimally reached states and trains a model what\nto do under a range of scenarios as it explores the policy space. In addition,\nit also trains a model what not to do, suppressing competitive but poor\nactions. This work develops a framework for last-mile fine-tuning using\nreinforcement learning and tests whether it garners performance gains. The\nexperiments center on abstractive summarization, but the framework is general\nand broadly applicable. Use of the procedure produced significantly better\nresults than likelihood maximization when comparing raw predictions. For the\nspecific data tested, the gap could be bridged by employing post-processing of\nthe maximum likelihood outputs. Nonetheless, the framework offers a new avenue\nfor model optimization in situations where post-processing may be less\nstraightforward or effective, and it can be extended to include more complex\nclasses of undesirable outputs to penalize and train against, such as\nhallucinations.\n","authors":["Alec Solway"],"pdf_url":"https://arxiv.org/pdf/2408.16753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16751v1","updated":"2024-08-29T17:46:18Z","published":"2024-08-29T17:46:18Z","title":"A Gradient Analysis Framework for Rewarding Good and Penalizing Bad\n Examples in Language Models","summary":" Beyond maximum likelihood estimation (MLE), the standard objective of a\nlanguage model (LM) that optimizes good examples probabilities, many studies\nhave explored ways that also penalize bad examples for enhancing the quality of\noutput distribution, including unlikelihood training, exponential maximizing\naverage treatment effect (ExMATE), and direct preference optimization (DPO). 
To\nsystematically compare these methods and further provide a unified recipe for\nLM optimization, in this paper, we present a unique angle of gradient analysis\nof loss functions that simultaneously reward good examples and penalize bad\nones in LMs. Through both mathematical results and experiments on\nCausalDialogue and Anthropic HH-RLHF datasets, we identify distinct functional\ncharacteristics among these methods. We find that ExMATE serves as a superior\nsurrogate for MLE, and that combining DPO with ExMATE instead of MLE further\nenhances both the statistical (5-7%) and generative (+18% win rate)\nperformance.\n","authors":["Yi-Lin Tuan","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16749v1","updated":"2024-08-29T17:43:03Z","published":"2024-08-29T17:43:03Z","title":"Assessing Large Language Models for Online Extremism Research:\n Identification, Explanation, and New Knowledge","summary":" The United States has experienced a significant increase in violent\nextremism, prompting the need for automated tools to detect and limit the\nspread of extremist ideology online. This study evaluates the performance of\nBidirectional Encoder Representations from Transformers (BERT) and Generative\nPre-Trained Transformers (GPT) in detecting and classifying online domestic\nextremist posts. We collected social media posts containing \"far-right\" and\n\"far-left\" ideological keywords and manually labeled them as extremist or\nnon-extremist. Extremist posts were further classified into one or more of five\ncontributing elements of extremism based on a working definitional framework.\nThe BERT model's performance was evaluated based on training data size and\nknowledge transfer between categories. We also compared the performance of GPT\n3.5 and GPT 4 models using different prompts: na\\\"ive, layperson-definition,\nrole-playing, and professional-definition. Results showed that the best\nperforming GPT models outperformed the best performing BERT models, with more\ndetailed prompts generally yielding better results. However, overly complex\nprompts may impair performance. Different versions of GPT have unique\nsensitives to what they consider extremist. GPT 3.5 performed better at\nclassifying far-left extremist posts, while GPT 4 performed better at\nclassifying far-right extremist posts. Large language models, represented by\nGPT models, hold significant potential for online extremism classification\ntasks, surpassing traditional BERT models in a zero-shot setting. Future\nresearch should explore human-computer interactions in optimizing GPT models\nfor extremist detection and classification tasks to develop more efficient\n(e.g., quicker, less effort) and effective (e.g., fewer errors or mistakes)\nmethods for identifying extremist content.\n","authors":["Beidi Dong","Jin R. Lee","Ziwei Zhu","Balassubramanian Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2408.16749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16740v1","updated":"2024-08-29T17:34:10Z","published":"2024-08-29T17:34:10Z","title":"Theoretical and Methodological Framework for Studying Texts Produced by\n Large Language Models","summary":" This paper addresses the conceptual, methodological and technical challenges\nin studying large language models (LLMs) and the texts they produce from a\nquantitative linguistics perspective. 
It builds on a theoretical framework that\ndistinguishes between the LLM as a substrate and the entities the model\nsimulates. The paper advocates for a strictly non-anthropomorphic approach to\nmodels while cautiously applying methodologies used in studying human\nlinguistic behavior to the simulated entities. While natural language\nprocessing researchers focus on the models themselves, their architecture,\nevaluation, and methods for improving performance, we as quantitative linguists\nshould strive to build a robust theory concerning the characteristics of texts\nproduced by LLMs, how they differ from human-produced texts, and the properties\nof simulated entities. Additionally, we should explore the potential of LLMs as\nan instrument for studying human culture, of which language is an integral\npart.\n","authors":["Jiří Milička"],"pdf_url":"https://arxiv.org/pdf/2408.16740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16737v1","updated":"2024-08-29T17:32:35Z","published":"2024-08-29T17:32:35Z","title":"Smaller, Weaker, Yet Better: Training LLM Reasoners via Compute-Optimal\n Sampling","summary":" Training on high-quality synthetic data from strong language models (LMs) is\na common strategy to improve the reasoning performance of LMs. In this work, we\nrevisit whether this strategy is compute-optimal under a fixed inference budget\n(e.g., FLOPs). To do so, we investigate the trade-offs between generating\nsynthetic data using a stronger but more expensive (SE) model versus a weaker\nbut cheaper (WC) model. We evaluate the generated data across three key\nmetrics: coverage, diversity, and false positive rate, and show that the data\nfrom WC models may have higher coverage and diversity, but also exhibit higher\nfalse positive rates. We then finetune LMs on data from SE and WC models in\ndifferent settings: knowledge distillation, self-improvement, and a novel\nweak-to-strong improvement setup where a weaker LM teaches reasoning to a\nstronger LM. Our findings reveal that models finetuned on WC-generated data\nconsistently outperform those trained on SE-generated data across multiple\nbenchmarks and multiple choices of WC and SE models. These results challenge\nthe prevailing practice of relying on SE models for synthetic data generation,\nsuggesting that WC may be the compute-optimal approach for training advanced LM\nreasoners.\n","authors":["Hritik Bansal","Arian Hosseini","Rishabh Agarwal","Vinh Q. Tran","Mehran Kazemi"],"pdf_url":"https://arxiv.org/pdf/2408.16737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16725v1","updated":"2024-08-29T17:18:53Z","published":"2024-08-29T17:18:53Z","title":"Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming","summary":" Recent advances in language models have achieved significant progress.\nGPT-4o, as a new milestone, has enabled real-time conversations with humans,\ndemonstrating near-human natural fluency. Such human-computer interaction\nnecessitates models with the capability to perform reasoning directly with the\naudio modality and generate output in streaming. However, this remains beyond\nthe reach of current academic models, as they typically depend on extra TTS\nsystems for speech synthesis, resulting in undesirable latency. This paper\nintroduces the Mini-Omni, an audio-based end-to-end conversational model,\ncapable of real-time speech interaction. 
To achieve this capability, we propose\na text-instructed speech generation method, along with batch-parallel\nstrategies during inference to further boost the performance. Our method also\nhelps to retain the original model's language capabilities with minimal\ndegradation, enabling other works to establish real-time interaction\ncapabilities. We call this training method \"Any Model Can Talk\". We also\nintroduce the VoiceAssistant-400K dataset to fine-tune models optimized for\nspeech output. To our best knowledge, Mini-Omni is the first fully end-to-end,\nopen-source model for real-time speech interaction, offering valuable potential\nfor future research.\n","authors":["Zhifei Xie","Changqiao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16725v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2408.15409v2","updated":"2024-08-29T17:00:24Z","published":"2024-08-27T21:19:37Z","title":"Awes, Laws, and Flaws From Today's LLM Research","summary":" We perform a critical examination of the scientific methodology behind\ncontemporary large language model (LLM) research. For this we assess over 2,000\nresearch works based on criteria typical of what is considered good research\n(e.g. presence of statistical tests and reproducibility) and cross-validate it\nwith arguments that are at the centre of controversy (e.g., claims of emergent\nbehaviour, the use of LLMs as evaluators). We find multiple trends, such as\ndeclines in claims of emergent behaviour and ethics disclaimers; the rise of\nLLMs as evaluators in spite of a lack of consensus from the community about\ntheir useability; and an increase of claims of LLM reasoning abilities,\ntypically without leveraging human evaluation. This paper underscores the need\nfor more scrutiny and rigour by and from this field to live up to the\nfundamentals of a responsible scientific method that is ethical, reproducible,\nsystematic, and open to criticism.\n","authors":["Adrian de Wynter"],"pdf_url":"https://arxiv.org/pdf/2408.15409v2.pdf","comment":"Under review -- v1 was an old draft with an unrevised abstract (oops)"},{"id":"http://arxiv.org/abs/2405.11039v3","updated":"2024-08-29T16:57:38Z","published":"2024-05-17T18:31:26Z","title":"CC-GPX: Extracting High-Quality Annotated Geospatial Data from Common\n Crawl","summary":" The Common Crawl (CC) corpus is the largest open web crawl dataset containing\n9.5+ petabytes of data captured since 2008. The dataset is instrumental in\ntraining large language models, and as such it has been studied for\n(un)desirable content, and distilled for smaller, domain-specific datasets.\nHowever, to our knowledge, no research has been dedicated to using CC as a\nsource of annotated geospatial data. In this paper, we introduce an efficient\npipeline to extract annotated user-generated tracks from GPX files found in CC,\nand the resulting multimodal dataset with 1,416 pairings of human-written\ndescriptions and MultiLineString vector data from the 6 most recent CC\nreleases. The dataset can be used to study people's outdoor activity patterns,\nthe way people talk about their outdoor experiences, as well as for developing\ntrajectory generation or track annotation models, or for various other problems\nin place of synthetically generated routes. 
Our reproducible code is available\non GitHub: https://github.com/ilyankou/cc-gpx\n","authors":["Ilya Ilyankou","Meihui Wang","Stefano Cavazzi","James Haworth"],"pdf_url":"https://arxiv.org/pdf/2405.11039v3.pdf","comment":"Accepted as a poster to ACM SIGSPATIAL 2024"},{"id":"http://arxiv.org/abs/2406.04952v2","updated":"2024-08-29T16:49:29Z","published":"2024-06-07T14:16:37Z","title":"Quantifying Geospatial in the Common Crawl Corpus","summary":" Large language models (LLMs) exhibit emerging geospatial capabilities,\nstemming from their pre-training on vast unlabelled text datasets that are\noften derived from the Common Crawl (CC) corpus. However, the geospatial\ncontent within CC remains largely unexplored, impacting our understanding of\nLLMs' spatial reasoning. This paper investigates the prevalence of geospatial\ndata in recent Common Crawl releases using Gemini 1.5, a powerful language\nmodel. By analyzing a sample of documents and manually revising the results, we\nestimate that 18.7% of web documents in CC contain geospatial information such\nas coordinates and addresses. We find little difference in prevalence between\nEnglish- and non-English-language documents. Our findings provide quantitative\ninsights into the nature and extent of geospatial data in CC, and lay the\ngroundwork for future studies of geospatial biases of LLMs.\n","authors":["Ilya Ilyankou","Meihui Wang","Stefano Cavazzi","James Haworth"],"pdf_url":"https://arxiv.org/pdf/2406.04952v2.pdf","comment":"Accepted as a poster to ACM SIGSPATIAL 2024"},{"id":"http://arxiv.org/abs/2403.05527v3","updated":"2024-08-29T16:48:58Z","published":"2024-03-08T18:48:30Z","title":"GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless\n Generative Inference of LLM","summary":" Key-value (KV) caching has become the de-facto standard to accelerate generation speed\nfor large language models (LLMs) inference. However, the growing cache demand\nwith increasing sequence length has transformed LLM inference to be a memory\nbound problem, significantly constraining the system throughput. Existing\nmethods rely on dropping unimportant tokens or quantizing all entries\nuniformly. Such methods, however, often incur high approximation errors to\nrepresent the compressed matrices. The autoregressive decoding process further\ncompounds the error of each step, resulting in critical deviation in model\ngeneration and deterioration of performance. To tackle this challenge, we\npropose GEAR, an efficient KV cache compression framework that achieves\nnear-lossless high-ratio compression. GEAR first applies quantization to the\nmajority of entries of similar magnitudes to ultra-low precision. It then\nemploys a low rank matrix to approximate the quantization error, and a sparse\nmatrix to remedy individual errors from outlier entries. By adeptly integrating\nthree techniques, GEAR is able to fully exploit their synergistic potentials.\nOur experiments demonstrate that compared to alternatives, GEAR achieves\nnear-lossless 4-bit KV cache compression with up to 2.38x throughput\nimprovement, while reducing peak-memory size up to 2.29x. 
Our code is publicly\navailable at https://github.com/HaoKang-Timmy/GEAR.\n","authors":["Hao Kang","Qingru Zhang","Souvik Kundu","Geonhwa Jeong","Zaoxing Liu","Tushar Krishna","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.05527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16672v1","updated":"2024-08-29T16:21:00Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. In\nthis paper, we introduce several improvements to the ColBERT model architecture\nand training pipeline, leveraging techniques successful in the more established\nsingle-vector embedding model paradigm, particularly those suited for\nheterogeneous multilingual data. Our new model, Jina-ColBERT-v2, demonstrates\nstrong performance across a range of English and multilingual retrieval tasks,\nwhile also cutting storage requirements by up to 50% compared to previous\nmodels.\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Saba Sturua","Mohammad Kalim Akram","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16667v1","updated":"2024-08-29T16:15:01Z","published":"2024-08-29T16:15:01Z","title":"Iterative Graph Alignment","summary":" By compressing diverse narratives, LLMs go beyond memorization, achieving\nintelligence by capturing generalizable causal relationships. However, they\nsuffer from local 'representation gaps' due to insufficient training data\ndiversity, limiting their real-world utility, especially in tasks requiring\nstrict alignment to rules. Traditional alignment methods relying on heavy human\nannotations are inefficient and unscalable. Recent self-alignment techniques\nalso fall short, as they often depend on self-selection based prompting and\nmemorization-based learning. To address these issues, we introduce Iterative\nGraph Alignment (IGA), an annotation-free rule-based alignment algorithm. A\nteacher model (VLM) employs Iterative Graph Prompting (IGP) to create logical\ngraphs and reference answers. The student model (LLM) identifies local\nknowledge gaps by attempting to align its responses with these references,\ncollaborating with helper models to generate diverse answers. These aligned\nresponses are then used for iterative supervised fine-tuning (SFT). Our\nevaluations across five rule-based scenarios demonstrate IGP's effectiveness,\nwith a 73.12\\% alignment improvement in Claude Sonnet 3.5, and\nLlama3-8B-Instruct achieving an 86.20\\% improvement, outperforming Claude\nSonnet 3.5 in rule-based alignment.\n","authors":["Fangyuan Yu","Hardeep Singh Arora","Matt Johnson"],"pdf_url":"https://arxiv.org/pdf/2408.16667v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.04559v2","updated":"2024-08-29T15:58:09Z","published":"2024-07-05T14:48:15Z","title":"Not (yet) the whole story: Evaluating Visual Storytelling Requires More\n than Measuring Coherence, Grounding, and Repetition","summary":" Visual storytelling consists in generating a natural language story given a\ntemporally ordered sequence of images. 
This task is not only challenging for\nmodels, but also very difficult to evaluate with automatic metrics since there\nis no consensus about what makes a story 'good'. In this paper, we introduce a\nnovel method that measures story quality in terms of human likeness regarding\nthree key aspects highlighted in previous work: visual grounding, coherence,\nand repetitiveness. We then use this method to evaluate the stories generated\nby several models, showing that the foundation model LLaVA obtains the best\nresult, but only slightly so compared to TAPM, a 50-times smaller visual\nstorytelling model. Upgrading the visual and language components of TAPM\nresults in a model that yields competitive performance with a relatively low\nnumber of parameters. Finally, we carry out a human evaluation study, whose\nresults suggest that a 'good' story may require more than a human-like level of\nvisual grounding, coherence, and repetition.\n","authors":["Aditya K Surikuchi","Raquel Fernández","Sandro Pezzelle"],"pdf_url":"https://arxiv.org/pdf/2407.04559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14698v2","updated":"2024-08-29T15:14:48Z","published":"2024-08-26T23:52:27Z","title":"Smart Multi-Modal Search: Contextual Sparse and Dense Embedding\n Integration in Adobe Express","summary":" As user content and queries become increasingly multi-modal, the need for\neffective multi-modal search systems has grown. Traditional search systems\noften rely on textual and metadata annotations for indexed images, while\nmulti-modal embeddings like CLIP enable direct search using text and image\nembeddings. However, embedding-based approaches face challenges in integrating\ncontextual features such as user locale and recency. Building a scalable\nmulti-modal search system requires fine-tuning several components. This paper\npresents a multi-modal search architecture and a series of AB tests that\noptimize embeddings and multi-modal technologies in Adobe Express template\nsearch. We address considerations such as embedding model selection, the roles\nof embeddings in matching and ranking, and the balance between dense and sparse\nembeddings. Our iterative approach demonstrates how utilizing sparse, dense,\nand contextual features enhances short and long query search, significantly\nreduces null rates (over 70\\%), and increases click-through rates (CTR). Our\nfindings provide insights into developing robust multi-modal search systems,\nthereby enhancing relevance for complex queries.\n","authors":["Cherag Aroraa","Tracy Holloway King","Jayant Kumar","Yi Lu","Sanat Sharma","Arvind Srikantan","David Uvalle","Josep Valls-Vargas","Harsha Vardhan"],"pdf_url":"https://arxiv.org/pdf/2408.14698v2.pdf","comment":"CIKM 2024 (International Conference on Information and Knowledge\n Management), Multimodal Search and Recommendations Workshop"},{"id":"http://arxiv.org/abs/2405.05418v2","updated":"2024-08-29T14:50:10Z","published":"2024-05-08T20:39:54Z","title":"Mitigating Exaggerated Safety in Large Language Models","summary":" As the popularity of Large Language Models (LLMs) grow, combining model\nsafety with utility becomes increasingly important. The challenge is making\nsure that LLMs can recognize and decline dangerous prompts without sacrificing\ntheir ability to be helpful. The problem of \"exaggerated safety\" demonstrates\nhow difficult this can be. 
To reduce excessive safety behaviours -- where 26.1% of safe\nprompts were found to be misclassified as dangerous and\nrefused -- we use a combination of XSTest dataset prompts as well as\ninteractive, contextual, and few-shot prompting to examine the decision bounds\nof LLMs such as Llama2, Gemma, Command R+, and Phi-3. We find that few-shot\nprompting works best for Llama2, interactive prompting works best for Gemma, and\ncontextual prompting works best for Command R+ and Phi-3. Using a combination\nof these prompting strategies, we are able to mitigate exaggerated safety\nbehaviors by an overall 92.9% across all LLMs. Our work presents multiple\nprompting strategies to jailbreak LLMs' decision-making processes, allowing\nthem to navigate the tight line between refusing unsafe prompts and remaining\nhelpful.\n","authors":["Ruchira Ray","Ruchi Bhalani"],"pdf_url":"https://arxiv.org/pdf/2405.05418v2.pdf","comment":"17 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.16586v1","updated":"2024-08-29T14:49:13Z","published":"2024-08-29T14:49:13Z","title":"Enhancing Dialogue Generation in Werewolf Game Through Situation\n Analysis and Persuasion Strategies","summary":" Recent advancements in natural language processing, particularly with large\nlanguage models (LLMs) like GPT-4, have significantly enhanced dialogue\nsystems, enabling them to generate more natural and fluent conversations.\nDespite these improvements, challenges persist, such as managing continuous\ndialogues, memory retention, and minimizing hallucinations. The AIWolfDial2024\naddresses these challenges by employing the Werewolf Game, an incomplete\ninformation game, to test the capabilities of LLMs in complex interactive\nenvironments. This paper introduces an LLM-based Werewolf Game AI, where each\nrole is supported by situation analysis to aid response generation.\nAdditionally, for the werewolf role, various persuasion strategies, including\nlogical appeal, credibility appeal, and emotional appeal, are employed to\neffectively persuade other players to align with its actions.\n","authors":["Zhiyang Qi","Michimasa Inaba"],"pdf_url":"https://arxiv.org/pdf/2408.16586v1.pdf","comment":"Accepted to the AIWolfDial2024 workshop at INLG 2024"},{"id":"http://arxiv.org/abs/2406.11455v2","updated":"2024-08-29T14:48:10Z","published":"2024-06-17T12:11:01Z","title":"Adaptive Reinforcement Learning Planning: Harnessing Large Language\n Models for Complex Information Extraction","summary":" Existing research on large language models (LLMs) shows that they can solve\ninformation extraction tasks through multi-step planning. However, their\nextraction behavior on complex sentences and tasks is unstable, with emerging\nissues such as false positives and missing elements. We observe that decomposing\ncomplex extraction tasks and extracting them step by step can effectively\nimprove LLMs' performance, and the extraction orders of entities significantly\naffect the final results of LLMs. This paper proposes a two-stage multi-step\nmethod for LLM-based information extraction and adopts the RL framework to\nexecute the multi-step planning. We regard sequential extraction as a Markov\ndecision process, build an LLM-based extraction environment, design a decision\nmodule to adaptively provide the optimal order for sequential entity extraction\non different sentences, and utilize the DDQN algorithm to train the decision\nmodel. We also design the rewards and evaluation metrics suitable for the\nextraction results of LLMs. 
We conduct extensive experiments on multiple public\ndatasets to demonstrate the effectiveness of our method in improving the\ninformation extraction capabilities of LLMs.\n","authors":["Zepeng Ding","Ruiyang Ke","Wenhao Huang","Guochao Jiang","Yanda Li","Deqing Yang","Jiaqing Liang"],"pdf_url":"https://arxiv.org/pdf/2406.11455v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15710v2","updated":"2024-08-29T14:47:37Z","published":"2024-08-28T11:18:06Z","title":"Conan-embedding: General Text Embedding with More and Better Negative\n Samples","summary":" With the growing popularity of RAG, the capabilities of embedding models are\ngaining increasing attention. Embedding models are primarily trained through\ncontrastive loss learning, with negative examples being a key component.\nPrevious work has proposed various hard negative mining strategies, but these\nstrategies are typically employed as preprocessing steps. In this paper, we\npropose the conan-embedding model, which maximizes the utilization of more and\nhigher-quality negative examples. Specifically, since the model's ability to\nhandle preprocessed negative examples evolves during training, we propose\ndynamic hard negative mining method to expose the model to more challenging\nnegative examples throughout the training process. Secondly, contrastive\nlearning requires as many negative examples as possible but is limited by GPU\nmemory constraints. Therefore, we use a Cross-GPU balancing Loss to provide\nmore negative examples for embedding training and balance the batch size across\nmultiple tasks. Moreover, we also discovered that the prompt-response pairs\nfrom LLMs can be used for embedding training. Our approach effectively enhances\nthe capabilities of embedding models, currently ranking first on the Chinese\nleaderboard of Massive text embedding benchmark\n","authors":["Shiyu Li","Yang Tang","Shizhe Chen","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16570v1","updated":"2024-08-29T14:37:05Z","published":"2024-08-29T14:37:05Z","title":"Predictability maximization and the origins of word order harmony","summary":" We address the linguistic problem of the sequential arrangement of a head and\nits dependents from an information theoretic perspective. In particular, we\nconsider the optimal placement of a head that maximizes the predictability of\nthe sequence. We assume that dependents are statistically independent given a\nhead, in line with the open-choice principle and the core assumptions of\ndependency grammar. We demonstrate the optimality of harmonic order, i.e.,\nplacing the head last maximizes the predictability of the head whereas placing\nthe head first maximizes the predictability of dependents. We also show that\npostponing the head is the optimal strategy to maximize its predictability\nwhile bringing it forward is the optimal strategy to maximize the\npredictability of dependents. We unravel the advantages of the strategy of\nmaximizing the predictability of the head over maximizing the predictability of\ndependents. 
Our findings shed light on the placements of the head adopted by\nreal languages or emerging in different kinds of experiments.\n","authors":["Ramon Ferrer-i-Cancho"],"pdf_url":"https://arxiv.org/pdf/2408.16570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17844v2","updated":"2024-08-29T14:06:57Z","published":"2024-07-25T07:58:19Z","title":"Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease\n Classification: A Systematic Review","summary":" Parkinson's disease (PD), the second most prevalent neurodegenerative\ndisorder worldwide, frequently presents with early-stage speech impairments.\nRecent advancements in Artificial Intelligence (AI), particularly deep learning\n(DL), have significantly enhanced PD diagnosis through the analysis of speech\ndata. Nevertheless, the progress of research is restricted by the limited\navailability of publicly accessible speech-based PD datasets, primarily due to\nprivacy concerns. The goal of this systematic review is to explore the current\nlandscape of speech-based DL approaches for PD classification, based on 33\nscientific works published between 2020 and March 2024. We discuss their\navailable resources, capabilities, potential limitations, and issues related to\nbias, explainability, and privacy. Furthermore, this review provides an\noverview of publicly accessible speech-based datasets and open-source material\nfor PD. The DL approaches are categorized into end-to-end (E2E) learning,\ntransfer learning (TL) and deep acoustic features extraction (DAFE) approaches.\nAmong E2E approaches, Convolutional Neural Networks (CNNs) are prevalent,\nthough Transformers are increasingly popular. E2E approaches face challenges\nsuch as limited data and computational resources, especially with Transformers.\nTL addresses these issues by providing more robust PD diagnosis and better\ngeneralizability across languages. DAFE aims to improve the explainability and\ninterpretability of results by examining the specific effects of deep features\non both other DL approaches and more traditional machine learning (ML) methods.\nHowever, it often underperforms compared to E2E and TL approaches.\n","authors":["Lisanne van Gelderen","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2407.17844v2.pdf","comment":"Submitted in Applied Sciences - peer reviewed Open Access journal.\n This research was funded by the NWO research programme AiNed Fellowship\n Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant\n number NGF.1607.22.013"},{"id":"http://arxiv.org/abs/2402.01805v4","updated":"2024-08-29T14:05:44Z","published":"2024-02-02T09:45:33Z","title":"Can LLMs perform structured graph reasoning?","summary":" Pretrained Large Language Models (LLMs) have demonstrated various reasoning\ncapabilities through language-based prompts alone, particularly in unstructured\ntask settings (tasks purely based on language semantics). However, LLMs often\nstruggle with structured tasks, because of the inherent incompatibility of\ninput representation. Reducing structured tasks to uni-dimensional language\nsemantics often renders the problem trivial. Keeping the trade-off between LLM\ncompatibility and structure complexity in mind, we design various graph\nreasoning tasks as a proxy to semi-structured tasks in this paper, in order to\ntest the ability to navigate through representations beyond plain text in\nvarious LLMs. 
Particularly, we design 10 distinct problems of graph traversal,\neach representing increasing levels of complexity, and benchmark 5 different\ninstruct-finetuned LLMs (GPT-4, GPT-3.5, Claude-2, Llama-2 and Palm-2) on the\naforementioned tasks. Further, we analyse the performance of models across\nvarious settings such as varying sizes of graphs as well as different forms of\nk-shot prompting. We highlight various limitations, biases and properties of\nLLMs through this benchmarking process, such as an inverse relation to the\naverage degrees of freedom of traversal per node in graphs, the overall\nnegative impact of k-shot prompting on graph reasoning tasks, and a positive\nresponse bias which prevents LLMs from identifying the absence of a valid\nsolution. Finally, we introduce a new prompting technique specially designed\nfor graph traversal tasks (PathCompare), which demonstrates a notable increase\nin the performance of LLMs in comparison to standard prompting techniques such\nas Chain-of-Thought (CoT).\n","authors":["Palaash Agrawal","Shavak Vasania","Cheston Tan"],"pdf_url":"https://arxiv.org/pdf/2402.01805v4.pdf","comment":"International Conference on Pattern Recognition (ICPR), 2024"},{"id":"http://arxiv.org/abs/2408.16542v1","updated":"2024-08-29T14:00:57Z","published":"2024-08-29T14:00:57Z","title":"SALSA: Speedy ASR-LLM Synchronous Aggregation","summary":" Harnessing pre-trained LLMs to improve ASR systems, particularly for\nlow-resource languages, is now an emerging area of research. Existing methods\nrange from using LLMs for ASR error correction to tightly coupled systems that\nreplace the ASR decoder with the LLM. These approaches either increase decoding\ntime or require expensive training of the cross-attention layers. We propose\nSALSA, which couples the decoder layers of the ASR to the LLM decoder, while\nsynchronously advancing both decoders. Such coupling is performed with a simple\nprojection of the last decoder state, and is thus significantly more training\nefficient than earlier approaches. A challenge of our proposed coupling is\nhandling the mismatch between the tokenizers of the LLM and ASR systems. We\nhandle this mismatch using cascading tokenization with respect to the LLM and\nASR vocabularies. We evaluate SALSA on 8 low-resource languages in the FLEURS\nbenchmark, yielding substantial WER reductions of up to 38%.\n","authors":["Ashish Mittal","Darshan Prabhu","Sunita Sarawagi","Preethi Jyothi"],"pdf_url":"https://arxiv.org/pdf/2408.16542v1.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2406.19307v2","updated":"2024-08-29T13:51:34Z","published":"2024-06-27T16:30:50Z","title":"The Odyssey of Commonsense Causality: From Foundational Benchmarks to\n Cutting-Edge Reasoning","summary":" Understanding commonsense causality is a unique mark of intelligence for\nhumans. It helps people understand the principles of the real world better and\nbenefits the decision-making process related to causation. For instance,\ncommonsense causality is crucial in judging whether a defendant's action causes\nthe plaintiff's loss in determining legal liability. Despite its significance,\na systematic exploration of this topic is notably lacking. Our comprehensive\nsurvey bridges this gap by focusing on taxonomies, benchmarks, acquisition\nmethods, qualitative reasoning, and quantitative measurements in commonsense\ncausality, synthesizing insights from over 200 representative articles. 
Our\nwork aims to provide a systematic overview, update scholars on recent\nadvancements, provide a pragmatic guide for beginners, and highlight promising\nfuture research directions in this vital field.\n","authors":["Shaobo Cui","Zhijing Jin","Bernhard Schölkopf","Boi Faltings"],"pdf_url":"https://arxiv.org/pdf/2406.19307v2.pdf","comment":"42 pages"},{"id":"http://arxiv.org/abs/2408.14874v2","updated":"2024-08-29T13:49:40Z","published":"2024-08-27T08:43:32Z","title":"Inverse-Q*: Token Level Reinforcement Learning for Aligning Large\n Language Models Without Preference Data","summary":" Reinforcement Learning from Human Feedback (RLHF) has proven effective in\naligning large language models with human intentions, yet it often relies on\ncomplex methodologies like Proximal Policy Optimization (PPO) that require\nextensive hyper-parameter tuning and present challenges in sample efficiency\nand stability. In this paper, we introduce Inverse-Q*, an innovative framework\nthat transcends traditional RL methods by optimizing token-level reinforcement\nlearning without the need for additional reward or value models. Inverse-Q*\nleverages direct preference optimization techniques but extends them by\nestimating the conditionally optimal policy directly from the model's\nresponses, facilitating more granular and flexible policy shaping. Our approach\nreduces reliance on human annotation and external supervision, making it\nespecially suitable for low-resource settings. We present extensive\nexperimental results demonstrating that Inverse-Q* not only matches but\npotentially exceeds the effectiveness of PPO in terms of convergence speed and\nthe alignment of model responses with human preferences. Our findings suggest\nthat Inverse-Q* offers a practical and robust alternative to conventional RLHF\napproaches, paving the way for more efficient and adaptable model training\napproaches.\n","authors":["Han Xia","Songyang Gao","Qiming Ge","Zhiheng Xi","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2408.14874v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16518v1","updated":"2024-08-29T13:28:52Z","published":"2024-08-29T13:28:52Z","title":"CNIMA: A Universal Evaluation Framework and Automated Approach for\n Assessing Second Language Dialogues","summary":" We develop CNIMA (Chinese Non-Native Interactivity Measurement and\nAutomation), a Chinese-as-a-second-language labelled dataset with 10K\ndialogues. We annotate CNIMA using an evaluation framework -- originally\nintroduced for English-as-a-second-language dialogues -- that assesses\nmicro-level features (e.g.\\ backchannels) and macro-level interactivity labels\n(e.g.\\ topic management) and test the framework's transferability from English\nto Chinese. We found the framework robust across languages and revealed\nuniversal and language-specific relationships between micro-level and\nmacro-level features. Next, we propose an approach to automate the evaluation\nand find strong performance, creating a new tool for automated second language\nassessment. 
Our system can be adapted to other languages easily as it uses\nlarge language models and as such does not require large-scale annotated\ntraining data.\n","authors":["Rena Gao","Jingxuan Wu","Carsten Roever","Xuetong Wu","Jing Wu","Long Lv","Jey Han Lau"],"pdf_url":"https://arxiv.org/pdf/2408.16518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16502v1","updated":"2024-08-29T13:01:42Z","published":"2024-08-29T13:01:42Z","title":"LLMs vs Established Text Augmentation Techniques for Classification:\n When do the Benefits Outweight the Costs?","summary":" The generative large language models (LLMs) are increasingly being used for\ndata augmentation tasks, where text samples are LLM-paraphrased and then used\nfor classifier fine-tuning. However, a research that would confirm a clear\ncost-benefit advantage of LLMs over more established augmentation methods is\nlargely missing. To study if (and when) is the LLM-based augmentation\nadvantageous, we compared the effects of recent LLM augmentation methods with\nestablished ones on 6 datasets, 3 classifiers and 2 fine-tuning methods. We\nalso varied the number of seeds and collected samples to better explore the\ndownstream model accuracy space. Finally, we performed a cost-benefit analysis\nand show that LLM-based methods are worthy of deployment only when very small\nnumber of seeds is used. Moreover, in many cases, established methods lead to\nsimilar or better model accuracies.\n","authors":["Jan Cegin","Jakub Simko","Peter Brusilovsky"],"pdf_url":"https://arxiv.org/pdf/2408.16502v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2408.16493v1","updated":"2024-08-29T12:44:01Z","published":"2024-08-29T12:44:01Z","title":"Learning from Negative Samples in Generative Biomedical Entity Linking","summary":" Generative models have become widely used in biomedical entity linking\n(BioEL) due to their excellent performance and efficient memory usage. However,\nthese models are usually trained only with positive samples--entities that\nmatch the input mention's identifier--and do not explicitly learn from hard\nnegative samples, which are entities that look similar but have different\nmeanings. To address this limitation, we introduce ANGEL (Learning from\nNegative Samples in Generative Biomedical Entity Linking), the first framework\nthat trains generative BioEL models using negative samples. Specifically, a\ngenerative model is initially trained to generate positive samples from the\nknowledge base for given input entities. Subsequently, both correct and\nincorrect outputs are gathered from the model's top-k predictions. The model is\nthen updated to prioritize the correct predictions through direct preference\noptimization. Our models fine-tuned with ANGEL outperform the previous best\nbaseline models by up to an average top-1 accuracy of 1.4% on five benchmarks.\nWhen incorporating our framework into pre-training, the performance improvement\nfurther increases to 1.7%, demonstrating its effectiveness in both the\npre-training and fine-tuning stages. 
Our code is available at\nhttps://github.com/dmis-lab/ANGEL.\n","authors":["Chanhwi Kim","Hyunjae Kim","Sihyeon Park","Jiwoo Lee","Mujeen Sung","Jaewoo Kang"],"pdf_url":"https://arxiv.org/pdf/2408.16493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11512v2","updated":"2024-08-29T12:25:14Z","published":"2024-08-21T10:44:10Z","title":"IKUN for WMT24 General MT Task: LLMs Are here for Multilingual Machine\n Translation","summary":" This paper introduces two multilingual systems, IKUN and IKUN-C, developed\nfor the general machine translation task in WMT24. IKUN and IKUN-C represent an\nopen system and a constrained system, respectively, built on Llama-3-8b and\nMistral-7B-v0.3. Both systems are designed to handle all 11 language directions\nusing a single model. According to automatic evaluation metrics, IKUN-C\nachieved 6 first-place and 3 second-place finishes among all constrained\nsystems, while IKUN secured 1 first-place and 2 second-place finishes across\nboth open and constrained systems. These encouraging results suggest that large\nlanguage models (LLMs) are nearing the level of proficiency required for\neffective multilingual machine translation. The systems are based on a\ntwo-stage approach: first, continuous pre-training on monolingual data in 10\nlanguages, followed by fine-tuning on high-quality parallel data for 11\nlanguage directions. The primary difference between IKUN and IKUN-C lies in\ntheir monolingual pre-training strategy. IKUN-C is pre-trained using\nconstrained monolingual data, whereas IKUN leverages monolingual data from the\nOSCAR dataset. In the second phase, both systems are fine-tuned on parallel\ndata sourced from NTREX, Flores, and WMT16-23 for all 11 language pairs.\n","authors":["Baohao Liao","Christian Herold","Shahram Khadivi","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2408.11512v2.pdf","comment":"typo: 120K -> 12K vocabulary size"},{"id":"http://arxiv.org/abs/2408.16482v1","updated":"2024-08-29T12:18:04Z","published":"2024-08-29T12:18:04Z","title":"Self-Alignment: Improving Alignment of Cultural Values in LLMs via\n In-Context Learning","summary":" Improving the alignment of Large Language Models (LLMs) with respect to the\ncultural values that they encode has become an increasingly important topic. In\nthis work, we study whether we can exploit existing knowledge about cultural\nvalues at inference time to adjust model responses to cultural value probes. We\npresent a simple and inexpensive method that uses a combination of in-context\nlearning (ICL) and human survey data, and show that we can improve the\nalignment to cultural values across 5 models that include both English-centric\nand multilingual LLMs. Importantly, we show that our method could prove useful\nin test languages other than English and can improve alignment to the cultural\nvalues that correspond to a range of culturally diverse countries.\n","authors":["Rochelle Choenni","Ekaterina Shutova"],"pdf_url":"https://arxiv.org/pdf/2408.16482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16446v1","updated":"2024-08-29T11:19:57Z","published":"2024-08-29T11:19:57Z","title":"Is text normalization relevant for classifying medieval charters?","summary":" This study examines the impact of historical text normalization on the\nclassification of medieval charters, specifically focusing on document dating\nand locating. 
Using a data set of Middle High German charters from a digital\narchive, we evaluate various classifiers, including traditional and\ntransformer-based models, with and without normalization. Our results indicate\nthat the given normalization minimally improves locating tasks but reduces\naccuracy for dating, implying that original texts contain crucial features that\nnormalization may obscure. We find that support vector machines and gradient\nboosting outperform other models, questioning the efficiency of transformers\nfor this use case. Results suggest a selective approach to historical text\nnormalization, emphasizing the significance of preserving some textual\ncharacteristics that are critical for classification tasks in document\nanalysis.\n","authors":["Florian Atzenhofer-Baumgartner","Tamás Kovács"],"pdf_url":"https://arxiv.org/pdf/2408.16446v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections"},{"id":"http://arxiv.org/abs/2408.16444v1","updated":"2024-08-29T11:13:23Z","published":"2024-08-29T11:13:23Z","title":"SurveySum: A Dataset for Summarizing Multiple Scientific Articles into a\n Survey Section","summary":" Document summarization is a task to shorten texts into concise and\ninformative summaries. This paper introduces a novel dataset designed for\nsummarizing multiple scientific articles into a section of a survey. Our\ncontributions are: (1) SurveySum, a new dataset addressing the gap in\ndomain-specific summarization tools; (2) two specific pipelines to summarize\nscientific articles into a section of a survey; and (3) the evaluation of these\npipelines using multiple metrics to compare their performance. Our results\nhighlight the importance of high-quality retrieval stages and the impact of\ndifferent configurations on the quality of generated summaries.\n","authors":["Leandro Carísio Fernandes","Gustavo Bartz Guedes","Thiago Soares Laitz","Thales Sales Almeida","Rodrigo Nogueira","Roberto Lotufo","Jayr Pereira"],"pdf_url":"https://arxiv.org/pdf/2408.16444v1.pdf","comment":"15 pages, 6 figures, 1 table. Submitted to BRACIS 2024"},{"id":"http://arxiv.org/abs/2408.16440v1","updated":"2024-08-29T11:05:54Z","published":"2024-08-29T11:05:54Z","title":"Instruction-tuned Large Language Models for Machine Translation in the\n Medical Domain","summary":" Large Language Models (LLMs) have shown promising results on machine\ntranslation for high resource language pairs and domains. However, in\nspecialised domains (e.g. medical) LLMs have shown lower performance compared\nto standard neural machine translation models. The consistency in the machine\ntranslation of terminology is crucial for users, researchers, and translators\nin specialised domains. In this study, we compare the performance between\nbaseline LLMs and instruction-tuned LLMs in the medical domain. In addition, we\nintroduce terminology from specialised medical dictionaries into the\ninstruction formatted datasets for fine-tuning LLMs. 
The instruction-tuned LLMs\nsignificantly outperform the baseline models with automatic metrics.\n","authors":["Miguel Rios"],"pdf_url":"https://arxiv.org/pdf/2408.16440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15496v2","updated":"2024-08-29T10:35:52Z","published":"2024-08-28T02:47:27Z","title":"ReMamba: Equip Mamba with Effective Long-Sequence Modeling","summary":" While the Mamba architecture demonstrates superior inference efficiency and\ncompetitive performance on short-context natural language processing (NLP)\ntasks, empirical evidence suggests its capacity to comprehend long contexts is\nlimited compared to transformer-based models. In this study, we investigate the\nlong-context efficiency issues of the Mamba models and propose ReMamba, which\nenhances Mamba's ability to comprehend long contexts. ReMamba incorporates\nselective compression and adaptation techniques within a two-stage re-forward\nprocess, incurring minimal additional inference costs overhead. Experimental\nresults on the LongBench and L-Eval benchmarks demonstrate ReMamba's efficacy,\nimproving over the baselines by 3.2 and 1.6 points, respectively, and attaining\nperformance almost on par with same-size transformer models.\n","authors":["Danlong Yuan","Jiahao Liu","Bei Li","Huishuai Zhang","Jingang Wang","Xunliang Cai","Dongyan Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.15496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11288v2","updated":"2024-08-29T10:10:55Z","published":"2024-04-17T11:52:47Z","title":"A Preference-driven Paradigm for Enhanced Translation with Large\n Language Models","summary":" Recent research has shown that large language models (LLMs) can achieve\nremarkable translation performance through supervised fine-tuning (SFT) using\nonly a small amount of parallel data. However, SFT simply instructs the model\nto imitate the reference translations at the token level, making it vulnerable\nto the noise present in the references. Hence, the assistance from SFT often\nreaches a plateau once the LLMs have achieved a certain level of translation\ncapability, and further increasing the size of parallel data does not provide\nadditional benefits. To overcome this plateau associated with imitation-based\nSFT, we propose a preference-based approach built upon the Plackett-Luce model.\nThe objective is to steer LLMs towards a more nuanced understanding of\ntranslation preferences from a holistic view, while also being more resilient\nin the absence of gold translations. We further build a dataset named MAPLE to\nverify the effectiveness of our approach, which includes multiple translations\nof varying quality for each source sentence. Extensive experiments demonstrate\nthe superiority of our approach in \"breaking the plateau\" across diverse LLMs\nand test settings. Our in-depth analysis underscores the pivotal role of\ndiverse translations and accurate preference scores in the success of our\napproach.\n","authors":["Dawei Zhu","Sony Trenous","Xiaoyu Shen","Dietrich Klakow","Bill Byrne","Eva Hasler"],"pdf_url":"https://arxiv.org/pdf/2404.11288v2.pdf","comment":"Accepted to NAACL 2024 (long, main)"},{"id":"http://arxiv.org/abs/2408.16390v1","updated":"2024-08-29T09:52:01Z","published":"2024-08-29T09:52:01Z","title":"MQM-Chat: Multidimensional Quality Metrics for Chat Translation","summary":" The complexities of chats pose significant challenges for machine translation\nmodels. 
Recognizing the need for a precise evaluation metric to address the\nissues of chat translation, this study introduces Multidimensional Quality\nMetrics for Chat Translation (MQM-Chat). Through the experiments of five models\nusing MQM-Chat, we observed that all models generated certain fundamental\nerrors, while each of them has different shortcomings, such as omission, overly\ncorrecting ambiguous source content, and buzzword issues, resulting in the loss\nof stylized information. Our findings underscore the effectiveness of MQM-Chat\nin evaluating chat translation, emphasizing the importance of stylized content\nand dialogue consistency for future studies.\n","authors":["Yunmeng Li","Jun Suzuki","Makoto Morishita","Kaori Abe","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2408.16390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19097v2","updated":"2024-08-29T09:35:24Z","published":"2024-02-29T12:25:45Z","title":"TEncDM: Understanding the Properties of Diffusion Model in the Space of\n Language Model Encodings","summary":" This paper presents the Text Encoding Diffusion Model (TEncDM), a novel\napproach to diffusion modeling that operates in the space of pre-trained\nlanguage model encodings. In contrast to traditionally used embeddings,\nencodings integrate contextual information. In our approach, we also employ a\ntransformer-based decoder, specifically designed to incorporate context in the\ntoken prediction process. We conduct a comprehensive examination of the\ninfluence of the encoder, decoder, noise scheduler, and self-conditioning on\nzero-shot generation. Furthermore, we compare TEncDM with previous approaches\non three conditional text generation tasks: QQP, XSum, and Wiki-Auto. The\nresults show that TEncDM exhibits superior performance compared to existing\nnon-autoregressive diffusion models.\n","authors":["Alexander Shabalin","Viacheslav Meshchaninov","Egor Chimbulatov","Vladislav Lapikov","Roman Kim","Grigory Bartosh","Dmitry Molchanov","Sergey Markov","Dmitry Vetrov"],"pdf_url":"https://arxiv.org/pdf/2402.19097v2.pdf","comment":"14 pages, 13 figures"},{"id":"http://arxiv.org/abs/2404.01602v2","updated":"2024-08-29T08:49:14Z","published":"2024-04-02T02:46:18Z","title":"Helmsman of the Masses? Evaluate the Opinion Leadership of Large\n Language Models in the Werewolf Game","summary":" Large language models (LLMs) have exhibited memorable strategic behaviors in\nsocial deductive games. However, the significance of opinion leadership\nexhibited by LLM-based agents has been largely overlooked, which is crucial for\npractical applications in multi-agent and human-AI interaction settings.\nOpinion leaders are individuals who have a noticeable impact on the beliefs and\nbehaviors of others within a social group. In this work, we employ the Werewolf\ngame as a simulation platform to assess the opinion leadership of LLMs. The\ngame includes the role of the Sheriff, tasked with summarizing arguments and\nrecommending decision options, and therefore serves as a credible proxy for an\nopinion leader. We develop a framework integrating the Sheriff role and devise\ntwo novel metrics based on the critical characteristics of opinion leaders. The\nfirst metric measures the reliability of the opinion leader, and the second\nassesses the influence of the opinion leader on other players' decisions. We\nconduct extensive experiments to evaluate LLMs of different scales. 
In\naddition, we collect a Werewolf question-answering dataset (WWQA) to assess and\nenhance LLM's grasp of the game rules, and we also incorporate human\nparticipants for further analysis. The results suggest that the Werewolf game\nis a suitable test bed to evaluate the opinion leadership of LLMs, and few LLMs\npossess the capacity for opinion leadership.\n","authors":["Silin Du","Xiaowei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01602v2.pdf","comment":"Published as a conference paper at COLM 2024. 37 pages, 6 figures, 27\n tables"},{"id":"http://arxiv.org/abs/2407.09816v4","updated":"2024-08-29T08:45:58Z","published":"2024-07-13T09:22:33Z","title":"MaskMoE: Boosting Token-Level Learning via Routing Mask in\n Mixture-of-Experts","summary":" Scaling the size of a model enhances its capabilities but significantly\nincreases computation complexity. Mixture-of-Experts models (MoE) address the\nissue by allowing model size to scale up without substantially increasing\ntraining or inference costs. In MoE, there is an important module called the\nrouter, which is used to distribute each token to the experts. Currently, the\nmainstream routing methods include dynamic routing and fixed routing. Despite\ntheir promising results, MoE models encounter several challenges. Primarily,\nfor dynamic routing methods, the dispersion of training tokens across multiple\nexperts can lead to underfitting, particularly for infrequent tokens.\nAdditionally, though fixed routing methods can mitigate that issue, they\ncompromise on the diversity of representations. In this paper, we propose\n\\textbf{MaskMoE}, a method designed to enhance token-level learning by\nemploying a routing \\textbf{mask}ing technique within the\n\\textbf{M}ixture-\\textbf{o}f-\\textbf{E}xperts model. MaskMoE is capable of\nmaintaining representation diversity while achieving more comprehensive\ntraining. Experimental results demonstrate that our method outperforms previous\ndominant Mixture-of-Experts models in terms of both perplexity (PPL) and\ndownstream task performance.\n","authors":["Zhenpeng Su","Zijia Lin","Xue Bai","Xing Wu","Yizhe Xiong","Haoran Lian","Guangyuan Ma","Hui Chen","Guiguang Ding","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2407.09816v4.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.15533v2","updated":"2024-08-29T08:45:30Z","published":"2024-08-28T04:44:43Z","title":"LRP4RAG: Detecting Hallucinations in Retrieval-Augmented Generation via\n Layer-wise Relevance Propagation","summary":" Retrieval-Augmented Generation (RAG) has become a primary technique for\nmitigating hallucinations in large language models (LLMs). However, incomplete\nknowledge extraction and insufficient understanding can still mislead LLMs to\nproduce irrelevant or even contradictory responses, which means hallucinations\npersist in RAG. In this paper, we propose LRP4RAG, a method based on the\nLayer-wise Relevance Propagation (LRP) algorithm for detecting hallucinations\nin RAG. Specifically, we first utilize LRP to compute the relevance between the\ninput and output of the RAG generator. We then apply further extraction and\nresampling to the relevance matrix. 
The processed relevance data are input into\nmultiple classifiers to determine whether the output contains hallucinations.\nTo the best of our knowledge, this is the first time that LRP has been used for\ndetecting RAG hallucinations, and extensive experiments demonstrate that\nLRP4RAG outperforms existing baselines.\n","authors":["Haichuan Hu","Yuhan Sun","Quanjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.15533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16345v1","updated":"2024-08-29T08:30:33Z","published":"2024-08-29T08:30:33Z","title":"The Unreasonable Ineffectiveness of Nucleus Sampling on Mitigating Text\n Memorization","summary":" This work analyses the text memorization behavior of large language models\n(LLMs) when subjected to nucleus sampling. Stochastic decoding methods like\nnucleus sampling are typically applied to overcome issues such as monotonous\nand repetitive text generation, which are often observed with\nmaximization-based decoding techniques. We hypothesize that nucleus sampling\nmight also reduce the occurrence of memorization patterns, because it could\nlead to the selection of tokens outside the memorized sequence. To test this\nhypothesis we create a diagnostic dataset with a known distribution of\nduplicates that gives us some control over the likelihood of memorization of\ncertain parts of the training data. Our analysis of two GPT-Neo models\nfine-tuned on this dataset interestingly shows that (i) an increase of the\nnucleus size reduces memorization only modestly, and (ii) even when models do\nnot engage in \"hard\" memorization -- a verbatim reproduction of training\nsamples -- they may still display \"soft\" memorization whereby they generate\noutputs that echo the training data but without a complete one-by-one\nresemblance.\n","authors":["Luka Borec","Philipp Sadler","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2408.16345v1.pdf","comment":"9 pages, Accepted at INLG 2024 (International Natural Language\n Generation Conference)"},{"id":"http://arxiv.org/abs/2402.12326v2","updated":"2024-08-29T08:27:27Z","published":"2024-02-19T18:00:30Z","title":"PsychoGAT: A Novel Psychological Measurement Paradigm through\n Interactive Fiction Games with LLM Agents","summary":" Psychological measurement is essential for mental health, self-understanding,\nand personal development. Traditional methods, such as self-report scales and\npsychologist interviews, often face challenges with engagement and\naccessibility. While game-based and LLM-based tools have been explored to\nimprove user interest and automate assessment, they struggle to balance\nengagement with generalizability. In this work, we propose PsychoGAT\n(Psychological Game AgenTs) to achieve a generic gamification of psychological\nassessment. The main insight is that powerful LLMs can function both as adept\npsychologists and innovative game designers. By incorporating LLM agents into\ndesignated roles and carefully managing their interactions, PsychoGAT can\ntransform any standardized scales into personalized and engaging interactive\nfiction games. To validate the proposed method, we conduct psychometric\nevaluations to assess its effectiveness and employ human evaluators to examine\nthe generated content across various psychological constructs, including\ndepression, cognitive distortions, and personality traits. 
Results demonstrate\nthat PsychoGAT serves as an effective assessment tool, achieving statistically\nsignificant excellence in psychometric metrics such as reliability, convergent\nvalidity, and discriminant validity. Moreover, human evaluations confirm\nPsychoGAT's enhancements in content coherence, interactivity, interest,\nimmersion, and satisfaction.\n","authors":["Qisen Yang","Zekun Wang","Honghui Chen","Shenzhi Wang","Yifan Pu","Xin Gao","Wenhao Huang","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.12326v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2407.14507v2","updated":"2024-08-29T08:24:42Z","published":"2024-07-19T17:59:03Z","title":"Internal Consistency and Self-Feedback in Large Language Models: A\n Survey","summary":" Large language models (LLMs) often exhibit deficient reasoning or generate\nhallucinations. To address these, studies prefixed with \"Self-\" such as\nSelf-Consistency, Self-Improve, and Self-Refine have been initiated. They share\na commonality: involving LLMs evaluating and updating themselves. Nonetheless,\nthese efforts lack a unified perspective on summarization, as existing surveys\npredominantly focus on categorization.\n In this paper, we summarize a theoretical framework, Internal Consistency,\noffering explanations for reasoning deficiencies and hallucinations. Internal\nConsistency refers to the consistency in expressions among LLMs' latent,\ndecoding, or response layers based on sampling methodologies. Then, we\nintroduce another effective theoretical framework capable of mining Internal\nConsistency, named Self-Feedback. This framework consists of two modules:\nSelf-Evaluation and Self-Update. The former captures Internal Consistency\nSignals, while the latter leverages the signals to enhance either the model's\nresponse or the model itself. This framework has been employed in numerous\nstudies.\n We systematically classify these studies by tasks and lines of work;\nsummarize relevant evaluation methods and benchmarks; and delve into the\nconcern, \"Does Self-Feedback Really Work?\" We also propose several critical\nviewpoints, including the \"Hourglass Evolution of Internal Consistency\",\n\"Consistency Is (Almost) Correctness\" hypothesis, and \"The Paradox of Latent\nand Explicit Reasoning\". The relevant resources are open-sourced at\nhttps://github.com/IAAR-Shanghai/ICSFSurvey.\n","authors":["Xun Liang","Shichao Song","Zifan Zheng","Hanyu Wang","Qingchen Yu","Xunkai Li","Rong-Hua Li","Peng Cheng","Zhonghao Wang","Feiyu Xiong","Zhiyu Li"],"pdf_url":"https://arxiv.org/pdf/2407.14507v2.pdf","comment":"24 pages, 9 figures, 7 tables, 14 equations"},{"id":"http://arxiv.org/abs/2408.16326v1","updated":"2024-08-29T08:02:09Z","published":"2024-08-29T08:02:09Z","title":"Critic-CoT: Boosting the reasoning abilities of large language model via\n Chain-of-thoughts Critic","summary":" Self-critic has become an important mechanism for enhancing the reasoning\nperformance of LLMs. However, current approaches mainly involve basic prompts\nwithout further training, which tend to be over-simplified, leading to limited\naccuracy. Moreover, there is a lack of in-depth investigation of the\nrelationship between an LLM's ability to critique and its task-solving\nperformance. To address these issues, we propose Critic-CoT, a novel framework\nthat pushes LLMs toward System-2-like critic capability, via step-wise CoT\nreasoning format and distant-supervision data construction, without the need\nfor human annotation. 
Experiments on GSM8K and MATH show that via filtering out\ninvalid solutions or iterative refinement, our enhanced model boosts\ntask-solving performance, which demonstrates the effectiveness of our method.\nFurther, we find that training on critique and refinement alone improves the\ngeneration. We hope our work could shed light on future research on improving\nthe reasoning and critic ability of LLMs.\n","authors":["Xin Zheng","Jie Lou","Boxi Cao","Xueru Wen","Yuqiu Ji","Hongyu Lin","Yaojie Lu","Xianpei Han","Debing Zhang","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2408.16326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16293v1","updated":"2024-08-29T06:49:20Z","published":"2024-08-29T06:49:20Z","title":"Physics of Language Models: Part 2.2, How to Learn From Mistakes on\n Grade-School Math Problems","summary":" Language models have demonstrated remarkable performance in solving reasoning\ntasks; however, even the strongest models still occasionally make reasoning\nmistakes. Recently, there has been active research aimed at improving reasoning\naccuracy, particularly by using pretrained language models to \"self-correct\"\ntheir mistakes via multi-round prompting. In this paper, we follow this line of\nwork but focus on understanding the usefulness of incorporating\n\"error-correction\" data directly into the pretraining stage. This data consists\nof erroneous solution steps immediately followed by their corrections. Using a\nsynthetic math dataset, we show promising results: this type of pretrain data\ncan help language models achieve higher reasoning accuracy directly (i.e.,\nthrough simple auto-regression, without multi-round prompting) compared to\npretraining on the same amount of error-free data. We also delve into many\ndetails, such as (1) how this approach differs from beam search, (2) how such\ndata can be prepared, (3) whether masking is needed on the erroneous tokens,\n(4) the amount of error required, (5) whether such data can be deferred to the\nfine-tuning stage, and many others.\n","authors":["Tian Ye","Zicheng Xu","Yuanzhi Li","Zeyuan Allen-Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.16293v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.20311"},{"id":"http://arxiv.org/abs/2408.16287v1","updated":"2024-08-29T06:38:55Z","published":"2024-08-29T06:38:55Z","title":"Measuring the Accuracy of Automatic Speech Recognition Solutions","summary":" For d/Deaf and hard of hearing (DHH) people, captioning is an essential\naccessibility tool. Significant developments in artificial intelligence (AI)\nmean that Automatic Speech Recognition (ASR) is now a part of many popular\napplications. This makes creating captions easy and broadly available - but\ntranscription needs high levels of accuracy to be accessible. Scientific\npublications and industry report very low error rates, claiming AI has reached\nhuman parity or even outperforms manual transcription. At the same time the DHH\ncommunity reports serious issues with the accuracy and reliability of ASR.\nThere seems to be a mismatch between technical innovations and the real-life\nexperience for people who depend on transcription. Independent and\ncomprehensive data is needed to capture the state of ASR. We measured the\nperformance of eleven common ASR services with recordings of Higher Education\nlectures. We evaluated the influence of technical conditions like streaming,\nthe use of vocabularies, and differences between languages. 
Our results show\nthat accuracy ranges widely between vendors and for the individual audio\nsamples. We also measured a significant lower quality for streaming ASR, which\nis used for live events. Our study shows that despite the recent improvements\nof ASR, common services lack reliability in accuracy.\n","authors":["Korbinian Kuhn","Verena Kersken","Benedikt Reuter","Niklas Egger","Gottfried Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2408.16287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16276v1","updated":"2024-08-29T05:47:14Z","published":"2024-08-29T05:47:14Z","title":"Enhancing AI-Driven Psychological Consultation: Layered Prompts with\n Large Language Models","summary":" Psychological consultation is essential for improving mental health and\nwell-being, yet challenges such as the shortage of qualified professionals and\nscalability issues limit its accessibility. To address these challenges, we\nexplore the use of large language models (LLMs) like GPT-4 to augment\npsychological consultation services. Our approach introduces a novel layered\nprompting system that dynamically adapts to user input, enabling comprehensive\nand relevant information gathering. We also develop empathy-driven and\nscenario-based prompts to enhance the LLM's emotional intelligence and\ncontextual understanding in therapeutic settings. We validated our approach\nthrough experiments using a newly collected dataset of psychological\nconsultation dialogues, demonstrating significant improvements in response\nquality. The results highlight the potential of our prompt engineering\ntechniques to enhance AI-driven psychological consultation, offering a scalable\nand accessible solution to meet the growing demand for mental health support.\n","authors":["Rafael Souza","Jia-Hao Lim","Alexander Davis"],"pdf_url":"https://arxiv.org/pdf/2408.16276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11911v6","updated":"2024-08-29T05:14:36Z","published":"2023-09-21T09:22:07Z","title":"InstructERC: Reforming Emotion Recognition in Conversation with\n Multi-task Retrieval-Augmented Large Language Models","summary":" The field of emotion recognition of conversation (ERC) has been focusing on\nseparating sentence feature encoding and context modeling, lacking exploration\nin generative paradigms based on unified designs. In this study, we propose a\nnovel approach, InstructERC, to reformulate the ERC task from a discriminative\nframework to a generative framework based on Large Language Models (LLMs).\nInstructERC makes three significant contributions: (1) it introduces a simple\nyet effective retrieval template module, which helps the model explicitly\nintegrate multi-granularity dialogue supervision information. (2) We introduce\ntwo additional emotion alignment tasks, namely speaker identification and\nemotion prediction tasks, to implicitly model the dialogue role relationships\nand future emotional tendencies in conversations. (3) Pioneeringly, we unify\nemotion labels across benchmarks through the feeling wheel to fit real\napplication scenarios. InstructERC still perform impressively on this unified\ndataset. 
Our LLM-based plugin framework significantly outperforms all previous\nmodels and achieves comprehensive SOTA on three commonly used ERC datasets.\nExtensive analysis of parameter-efficient and data-scaling experiments provides\nempirical guidance for applying it in practical scenarios.\n","authors":["Shanglin Lei","Guanting Dong","Xiaoping Wang","Keheng Wang","Runqi Qiao","Sirui Wang"],"pdf_url":"https://arxiv.org/pdf/2309.11911v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16264v1","updated":"2024-08-29T05:02:52Z","published":"2024-08-29T05:02:52Z","title":"LoraMap: Harnessing the Power of LoRA Connections","summary":" Large Language Models (LLMs) can benefit from mitigating hallucinations\nthrough fact-checking and overcoming substantial computational overhead with\nparameter-efficient techniques such as Low-Rank Adaptation (LoRA). While some\nstudies have explored the parallel integration of multiple LoRAs, these\napproaches need attention to the connections between them. This paper\ninvestigates methods to establish connections among multiple LoRAs. We create\nthree reasoning datasets tailored to fact-checking and fine-tune individual\nLoRAs, allowing them to view and reason from diverse perspectives. Then, we\nexplore strategies for allocating these reasoning LoRAs and introduce LoraMap,\nan approach to map connections between them. The results on the fact-checking\ntask demonstrate that the performance of LoraMap is superior to LoraHub, an\nexisting LoRA composition method. LoraMap also outperforms with significantly\nfewer parameters than LoraConcat, which concatenates LoRAs and further\nfine-tunes them.\n","authors":["Hyeryun Park","Jeongwon Kwak","Dongsuk Jang","Sumin Park","Jinwook Choi"],"pdf_url":"https://arxiv.org/pdf/2408.16264v1.pdf","comment":"13 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.16241v1","updated":"2024-08-29T03:50:24Z","published":"2024-08-29T03:50:24Z","title":"Making the Most of your Model: Methods for Finetuning and Applying\n Pretrained Transformers","summary":" This thesis provides methods and analysis of models which make progress on\nthis goal. The techniques outlined are task agnostic, and should provide\nbenefit when used with nearly any transformer LM. We introduce two new\nfinetuning methods which add new capabilities to the models they are used on.\nThe first adds a recurrence mechanism, which removes the fixed-window sized\nconstraint and improves the efficiency of a transformer decoder. The second\nallows masked language models (MLMs) to be used for initialization of both the\nencoder and decoder of a non-autoregressive sequence-to-sequence transformer,\nopening up generative applications of models which were previously only used\nfor natural language understanding tasks.\n We also introduce two new techniques for improving the quality of predictions\nof any transformer decoder without additional finetuning. One, hidden state\noptimization, can be applied to any transformer decoder to improve the quality\nof predictions at inference time, especially for few-shot classification. The\nother, conditional beam search, allows practitioners to search for natural\nlanguage generation (NLG) model outputs with high likelihood while conditioning\non the event that the output is not degenerate (e.g. empty, repetitive, etc.).\n Finally, we provide theoretical and empirical insights on the divergence of\nmodel-likelihood and output quality which has widely been observed in prior\nwork. 
These insights apply to any model which represents a distribution over\ntext, and apply to language models which are not transformers or even\nautoregressive. We argue that the NLP community has, to some extent,\nmisunderstood the implications of these findings, and encourage a point of view\nwhich has more nuance.\n","authors":["Davis Yoshida"],"pdf_url":"https://arxiv.org/pdf/2408.16241v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2408.13985v2","updated":"2024-08-29T02:40:12Z","published":"2024-08-26T02:35:37Z","title":"TF-Attack: Transferable and Fast Adversarial Attacks on Large Language\n Models","summary":" With the great advancements in large language models (LLMs), adversarial\nattacks against LLMs have recently attracted increasing attention. We found\nthat pre-existing adversarial attack methodologies exhibit limited\ntransferability and are notably inefficient, particularly when applied to LLMs.\nIn this paper, we analyze the core mechanisms of previous predominant\nadversarial attack methods, revealing that 1) the distributions of importance\nscore differ markedly among victim models, restricting the transferability; 2)\nthe sequential attack processes induces substantial time overheads. Based on\nthe above two insights, we introduce a new scheme, named TF-Attack, for\nTransferable and Fast adversarial attacks on LLMs. TF-Attack employs an\nexternal LLM as a third-party overseer rather than the victim model to identify\ncritical units within sentences. Moreover, TF-Attack introduces the concept of\nImportance Level, which allows for parallel substitutions of attacks. We\nconduct extensive experiments on 6 widely adopted benchmarks, evaluating the\nproposed method through both automatic and human metrics. Results show that our\nmethod consistently surpasses previous methods in transferability and delivers\nsignificant speed improvements, up to 20 times faster than earlier attack\nstrategies.\n","authors":["Zelin Li","Kehai Chen","Xuefeng Bai","Lemao Liu","Mingming Yang","Yang Xiang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13985v2.pdf","comment":"14 pages, 6 figures. arXiv admin note: text overlap with\n arXiv:2305.17440 by other authors"},{"id":"http://arxiv.org/abs/2408.10903v5","updated":"2024-08-29T02:38:05Z","published":"2024-08-20T14:47:38Z","title":"BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General\n Role-Playing Language Model","summary":" The rapid advancement of large language models (LLMs) has revolutionized\nrole-playing, enabling the development of general role-playing models. However,\ncurrent role-playing training has two significant issues: (I) Using a\npredefined role profile to prompt dialogue training for specific scenarios\nusually leads to inconsistencies and even conflicts between the dialogue and\nthe profile, resulting in training biases. (II) The model learns to imitate the\nrole based solely on the profile, neglecting profile-dialogue alignment at the\nsentence level. In this work, we propose a simple yet effective framework\ncalled BEYOND DIALOGUE, designed to overcome these hurdles. This framework\ninnovatively introduces \"beyond dialogue\" tasks to align dialogue with profile\ntraits based on each specific scenario, thereby eliminating biases during\ntraining. Furthermore, by adopting an innovative prompting mechanism that\ngenerates reasoning outcomes for training, the framework allows the model to\nachieve fine-grained alignment between profile and dialogue at the sentence\nlevel. 
The aforementioned methods are fully automated and low-cost.\nAdditionally, the integration of automated dialogue and objective evaluation\nmethods forms a comprehensive framework, paving the way for general\nrole-playing. Experimental results demonstrate that our model excels in\nadhering to and reflecting various dimensions of role profiles, outperforming\nmost proprietary general and specialized role-playing baselines. All code and\ndatasets are available at https://github.com/yuyouyu32/BeyondDialogue.\n","authors":["Yeyong Yu","Runsheng Yu","Haojie Wei","Zhanqiu Zhang","Quan Qian"],"pdf_url":"https://arxiv.org/pdf/2408.10903v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16221v1","updated":"2024-08-29T02:35:53Z","published":"2024-08-29T02:35:53Z","title":"SSDM: Scalable Speech Dysfluency Modeling","summary":" Speech dysfluency modeling is the core module for spoken language learning,\nand speech therapy. However, there are three challenges. First, current\nstate-of-the-art solutions suffer from poor scalability. Second, there is a\nlack of a large-scale dysfluency corpus. Third, there is not an effective\nlearning framework. In this paper, we propose \\textit{SSDM: Scalable Speech\nDysfluency Modeling}, which (1) adopts articulatory gestures as scalable forced\nalignment; (2) introduces connectionist subsequence aligner (CSA) to achieve\ndysfluency alignment; (3) introduces a large-scale simulated dysfluency corpus\ncalled Libri-Dys; and (4) develops an end-to-end system by leveraging the power\nof large language models (LLMs). We expect SSDM to serve as a standard in the\narea of dysfluency modeling. Demo is available at\n\\url{https://eureka235.github.io}.\n","authors":["Jiachen Lian","Xuanru Zhou","Zoe Ezzes","Jet Vonk","Brittany Morin","David Baquirin","Zachary Mille","Maria Luisa Gorno Tempini","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2408.16221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21191v2","updated":"2024-08-29T02:27:19Z","published":"2024-07-30T20:58:36Z","title":"GenRec: Generative Sequential Recommendation with Large Language Models","summary":" Sequential recommendation is a task to capture hidden user preferences from\nhistorical user item interaction data and recommend next items for the user.\nSignificant progress has been made in this domain by leveraging classification\nbased learning methods. Inspired by the recent paradigm of 'pretrain, prompt\nand predict' in NLP, we consider sequential recommendation as a sequence to\nsequence generation task and propose a novel model named Generative\nRecommendation (GenRec). Unlike classification based models that learn explicit\nuser and item representations, GenRec utilizes the sequence modeling capability\nof Transformer and adopts the masked item prediction objective to effectively\nlearn the hidden bidirectional sequential patterns. Different from existing\ngenerative sequential recommendation models, GenRec does not rely on manually\ndesigned hard prompts. The input to GenRec is textual user item sequence and\nthe output is top ranked next items. Moreover, GenRec is lightweight and\nrequires only a few hours to train effectively in low-resource settings, making\nit highly applicable to real-world scenarios and helping to democratize large\nlanguage models in the sequential recommendation domain. Our extensive\nexperiments have demonstrated that GenRec generalizes on various public\nreal-world datasets and achieves state-of-the-art results. 
Our experiments also\nvalidate the effectiveness of the proposed masked item prediction objective\nthat improves the model performance by a large margin.\n","authors":["Panfeng Cao","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2407.21191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16213v1","updated":"2024-08-29T02:12:58Z","published":"2024-08-29T02:12:58Z","title":"M4CXR: Exploring Multi-task Potentials of Multi-modal Large Language\n Models for Chest X-ray Interpretation","summary":" The rapid evolution of artificial intelligence, especially in large language\nmodels (LLMs), has significantly impacted various domains, including\nhealthcare. In chest X-ray (CXR) analysis, previous studies have employed LLMs,\nbut with limitations: either underutilizing the multi-tasking capabilities of\nLLMs or lacking clinical accuracy. This paper presents M4CXR, a multi-modal LLM\ndesigned to enhance CXR interpretation. The model is trained on a visual\ninstruction-following dataset that integrates various task-specific datasets in\na conversational format. As a result, the model supports multiple tasks such as\nmedical report generation (MRG), visual grounding, and visual question\nanswering (VQA). M4CXR achieves state-of-the-art clinical accuracy in MRG by\nemploying a chain-of-thought prompting strategy, in which it identifies\nfindings in CXR images and subsequently generates corresponding reports. The\nmodel is adaptable to various MRG scenarios depending on the available inputs,\nsuch as single-image, multi-image, and multi-study contexts. In addition to\nMRG, M4CXR performs visual grounding at a level comparable to specialized\nmodels and also demonstrates outstanding performance in VQA. Both quantitative\nand qualitative assessments reveal M4CXR's versatility in MRG, visual\ngrounding, and VQA, while consistently maintaining clinical accuracy.\n","authors":["Jonggwon Park","Soobum Kim","Byungmu Yoon","Jihun Hyun","Kyoyun Choi"],"pdf_url":"https://arxiv.org/pdf/2408.16213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16209v1","updated":"2024-08-29T02:05:39Z","published":"2024-08-29T02:05:39Z","title":"From cart to truck: meaning shift through words in English in the last\n two centuries","summary":" This onomasiological study uses diachronic word embeddings to explore how\ndifferent words represented the same concepts over time, using historical word\ndata from 1800 to 2000. We identify shifts in energy, transport, entertainment,\nand computing domains, revealing connections between language and societal\nchanges.\n Our approach consisted in using diachronic word embeddings trained using\nword2vec with skipgram and aligning them using orthogonal Procrustes. We\ndiscuss possible difficulties linked to the relationships the method\nidentifies. 
Moreover, we look at the ethical aspects of interpreting results,\nhighlighting the need for expert insights to understand the method's\nsignificance.\n","authors":["Esteban Rodríguez Betancourt","Edgar Casasola Murillo"],"pdf_url":"https://arxiv.org/pdf/2408.16209v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.16208v1","updated":"2024-08-29T02:03:05Z","published":"2024-08-29T02:03:05Z","title":"ReXamine-Global: A Framework for Uncovering Inconsistencies in Radiology\n Report Generation Metrics","summary":" Given the rapidly expanding capabilities of generative AI models for\nradiology, there is a need for robust metrics that can accurately measure the\nquality of AI-generated radiology reports across diverse hospitals. We develop\nReXamine-Global, a LLM-powered, multi-site framework that tests metrics across\ndifferent writing styles and patient populations, exposing gaps in their\ngeneralization. First, our method tests whether a metric is undesirably\nsensitive to reporting style, providing different scores depending on whether\nAI-generated reports are stylistically similar to ground-truth reports or not.\nSecond, our method measures whether a metric reliably agrees with experts, or\nwhether metric and expert scores of AI-generated report quality diverge for\nsome sites. Using 240 reports from 6 hospitals around the world, we apply\nReXamine-Global to 7 established report evaluation metrics and uncover serious\ngaps in their generalizability. Developers can apply ReXamine-Global when\ndesigning new report evaluation metrics, ensuring their robustness across\nsites. Additionally, our analysis of existing metrics can guide users of those\nmetrics towards evaluation procedures that work reliably at their sites of\ninterest.\n","authors":["Oishi Banerjee","Agustina Saenz","Kay Wu","Warren Clements","Adil Zia","Dominic Buensalido","Helen Kavnoudias","Alain S. Abi-Ghanem","Nour El Ghawi","Cibele Luna","Patricia Castillo","Khaled Al-Surimi","Rayyan A. Daghistani","Yuh-Min Chen","Heng-sheng Chao","Lars Heiliger","Moon Kim","Johannes Haubold","Frederic Jonske","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2408.16208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16180v1","updated":"2024-08-29T00:18:12Z","published":"2024-08-29T00:18:12Z","title":"Benchmarking Japanese Speech Recognition on ASR-LLM Setups with\n Multi-Pass Augmented Generative Error Correction","summary":" With the strong representational power of large language models (LLMs),\ngenerative error correction (GER) for automatic speech recognition (ASR) aims\nto provide semantic and phonetic refinements to address ASR errors. This work\nexplores how LLM-based GER can enhance and expand the capabilities of Japanese\nlanguage processing, presenting the first GER benchmark for Japanese ASR with\n0.9-2.6k text utterances. We also introduce a new multi-pass augmented\ngenerative error correction (MPA GER) by integrating multiple system hypotheses\non the input side with corrections from multiple LLMs on the output side and\nthen merging them. To the best of our knowledge, this is the first\ninvestigation of the use of LLMs for Japanese GER, which involves second-pass\nlanguage modeling on the output transcriptions generated by the ASR system\n(e.g., N-best hypotheses). 
Our experiments demonstrated performance improvement\nin the proposed methods of ASR quality and generalization both in SPREDS-U1-ja\nand CSJ data.\n","authors":["Yuka Ko","Sheng Li","Chao-Han Huck Yang","Tatsuya Kawahara"],"pdf_url":"https://arxiv.org/pdf/2408.16180v1.pdf","comment":"submitted to SLT2024"},{"id":"http://arxiv.org/abs/2408.13985v2","updated":"2024-08-29T02:40:12Z","published":"2024-08-26T02:35:37Z","title":"TF-Attack: Transferable and Fast Adversarial Attacks on Large Language\n Models","summary":" With the great advancements in large language models (LLMs), adversarial\nattacks against LLMs have recently attracted increasing attention. We found\nthat pre-existing adversarial attack methodologies exhibit limited\ntransferability and are notably inefficient, particularly when applied to LLMs.\nIn this paper, we analyze the core mechanisms of previous predominant\nadversarial attack methods, revealing that 1) the distributions of importance\nscore differ markedly among victim models, restricting the transferability; 2)\nthe sequential attack processes induces substantial time overheads. Based on\nthe above two insights, we introduce a new scheme, named TF-Attack, for\nTransferable and Fast adversarial attacks on LLMs. TF-Attack employs an\nexternal LLM as a third-party overseer rather than the victim model to identify\ncritical units within sentences. Moreover, TF-Attack introduces the concept of\nImportance Level, which allows for parallel substitutions of attacks. We\nconduct extensive experiments on 6 widely adopted benchmarks, evaluating the\nproposed method through both automatic and human metrics. Results show that our\nmethod consistently surpasses previous methods in transferability and delivers\nsignificant speed improvements, up to 20 times faster than earlier attack\nstrategies.\n","authors":["Zelin Li","Kehai Chen","Xuefeng Bai","Lemao Liu","Mingming Yang","Yang Xiang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13985v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.11897v2","updated":"2024-08-29T23:45:48Z","published":"2024-05-20T09:30:03Z","title":"CReMa: Crisis Response through Computational Identification and Matching\n of Cross-Lingual Requests and Offers Shared on Social Media","summary":" During times of crisis, social media platforms play a crucial role in\nfacilitating communication and coordinating resources. In the midst of chaos\nand uncertainty, communities often rely on these platforms to share urgent\npleas for help, extend support, and organize relief efforts. However, the\noverwhelming volume of conversations during such periods can escalate to\nunprecedented levels, necessitating the automated identification and matching\nof requests and offers to streamline relief operations. Additionally, there is\na notable absence of studies conducted in multi-lingual settings, despite the\nfact that any geographical area can have a diverse linguistic population.\nTherefore, we propose CReMa (Crisis Response Matcher), a systematic approach\nthat integrates textual, temporal, and spatial features to address the\nchallenges of effectively identifying and matching requests and offers on\nsocial media platforms during emergencies. Our approach utilizes a\ncrisis-specific pre-trained model and a multi-lingual embedding space. We\nemulate human decision-making to compute temporal and spatial features and\nnon-linearly weigh the textual features. The results from our experiments are\npromising, outperforming strong baselines. 
Additionally, we introduce a novel\nmulti-lingual dataset simulating help-seeking and offering assistance on social\nmedia in 16 languages and conduct comprehensive cross-lingual experiments.\nFurthermore, we analyze a million-scale geotagged global dataset to understand\npatterns in seeking help and offering assistance on social media. Overall,\nthese contributions advance the field of crisis informatics and provide\nbenchmarks for future research in the area.\n","authors":["Rabindra Lamsal","Maria Rodriguez Read","Shanika Karunasekera","Muhammad Imran"],"pdf_url":"https://arxiv.org/pdf/2405.11897v2.pdf","comment":"\\copyright 2024 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2408.16942v1","updated":"2024-08-29T23:39:11Z","published":"2024-08-29T23:39:11Z","title":"A longitudinal sentiment analysis of Sinophobia during COVID-19 using\n large language models","summary":" The COVID-19 pandemic has exacerbated xenophobia, particularly Sinophobia,\nleading to widespread discrimination against individuals of Chinese descent.\nLarge language models (LLMs) are pre-trained deep learning models used for\nnatural language processing (NLP) tasks. The ability of LLMs to understand and\ngenerate human-like text makes them particularly useful for analysing social\nmedia data to detect and evaluate sentiments. We present a sentiment analysis\nframework utilising LLMs for longitudinal sentiment analysis of the Sinophobic\nsentiments expressed in X (Twitter) during the COVID-19 pandemic. The results\nshow a significant correlation between the spikes in Sinophobic tweets,\nSinophobic sentiments and surges in COVID-19 cases, revealing that the\nevolution of the pandemic influenced public sentiment and the prevalence of\nSinophobic discourse. Furthermore, the sentiment analysis revealed a\npredominant presence of negative sentiments, such as annoyance and denial,\nwhich underscores the impact of political narratives and misinformation shaping\npublic opinion. The lack of empathetic sentiment which was present in previous\nstudies related to COVID-19 highlights the way the political narratives in\nmedia viewed the pandemic and how it blamed the Chinese community. Our study\nhighlights the importance of transparent communication in mitigating xenophobic\nsentiments during global crises.\n","authors":["Chen Wang","Rohitash Chandra"],"pdf_url":"https://arxiv.org/pdf/2408.16942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18470v2","updated":"2024-08-29T23:13:56Z","published":"2024-04-29T07:11:39Z","title":"ECC Analyzer: Extract Trading Signal from Earnings Conference Calls\n using Large Language Model for Stock Performance Prediction","summary":" In the realm of financial analytics, leveraging unstructured data, such as\nearnings conference calls (ECCs), to forecast stock volatility is a critical\nchallenge that has attracted both academics and investors. While previous\nstudies have used multimodal deep learning-based models to obtain a general\nview of ECCs for volatility predicting, they often fail to capture detailed,\ncomplex information. 
Our research introduces a novel framework: \textbf{ECC\nAnalyzer}, which utilizes large language models (LLMs) to extract richer, more\npredictive content from ECCs to aid the model's prediction performance. We use\nthe pre-trained large models to extract textual and audio features from ECCs\nand implement a hierarchical information extraction strategy to extract more\nfine-grained information. This strategy first extracts paragraph-level general\ninformation by summarizing the text and then extracts fine-grained focus\nsentences using Retrieval-Augmented Generation (RAG). These features are then\nfused through multimodal feature fusion to perform volatility prediction.\nExperimental results demonstrate that our model outperforms traditional\nanalytical benchmarks, confirming the effectiveness of advanced LLM techniques\nin financial analysis.\n","authors":["Yupeng Cao","Zhi Chen","Qingyun Pei","Nathan Jinseok Lee","K. P. Subbalakshmi","Papa Momar Ndiaye"],"pdf_url":"https://arxiv.org/pdf/2404.18470v2.pdf","comment":"9 pages, 1 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.16937v1","updated":"2024-08-29T23:13:45Z","published":"2024-08-29T23:13:45Z","title":"Plausible-Parrots @ MSP2023: Enhancing Semantic Plausibility Modeling\n using Entity and Event Knowledge","summary":" In this work, we investigate the effectiveness of injecting external\nknowledge into a large language model (LLM) to identify semantic plausibility of\nsimple events. Specifically, we enhance the LLM with fine-grained entity types,\nevent types and their definitions extracted from an external knowledge base.\nThis knowledge is injected into our system via designed templates. We also\naugment the data to balance the label distribution and adapt the task setting\nto real world scenarios in which event mentions are expressed as natural\nlanguage sentences. The experimental results show the effectiveness of the\ninjected knowledge on modeling semantic plausibility of events. An error\nanalysis further emphasizes the importance of identifying non-trivial entity\nand event types.\n","authors":["Chong Shen","Chenyue Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.16937v1.pdf","comment":"10 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.04785v2","updated":"2024-08-29T22:18:08Z","published":"2024-03-02T22:33:17Z","title":"Large Language Multimodal Models for 5-Year Chronic Disease Cohort\n Prediction Using EHR Data","summary":" Chronic diseases such as diabetes are the leading causes of morbidity and\nmortality worldwide. Numerous research studies have been attempted with various\ndeep learning models in diagnosis. However, most previous studies had certain\nlimitations, including using publicly available datasets (e.g. MIMIC), and\nimbalanced data. In this study, we collected five-year electronic health\nrecords (EHRs) from the Taiwan hospital database, including 1,420,596 clinical\nnotes, 387,392 laboratory test results, and more than 1,505 laboratory test\nitems, focusing on research pre-training large language models. We proposed a\nnovel Large Language Multimodal Models (LLMMs) framework incorporating\nmultimodal data from clinical notes and laboratory test results for the\nprediction of chronic disease risk. Our method combined a text embedding\nencoder and multi-head attention layer to learn laboratory test values,\nutilizing a deep neural network (DNN) module to merge blood features with\nchronic disease semantics into a latent space. 
In our experiments, we observe\nthat clinicalBERT and PubMed-BERT, when combined with attention fusion, can\nachieve an accuracy of 73% in multiclass chronic diseases and diabetes\nprediction. By transforming laboratory test values into textual descriptions\nand employing the Flan T-5 model, we achieved a 76% Area Under the ROC Curve\n(AUROC), demonstrating the effectiveness of leveraging numerical text data for\ntraining and inference in language models. This approach significantly improves\nthe accuracy of early-stage diabetes prediction.\n","authors":["Jun-En Ding","Phan Nguyen Minh Thao","Wen-Chih Peng","Jian-Zhe Wang","Chun-Cheng Chug","Min-Chen Hsieh","Yun-Chien Tseng","Ling Chen","Dongsheng Luo","Chi-Te Wang","Pei-fu Chen","Feng Liu","Fang-Ming Hung"],"pdf_url":"https://arxiv.org/pdf/2403.04785v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16932v1","updated":"2024-08-29T22:14:21Z","published":"2024-08-29T22:14:21Z","title":"Event Extraction for Portuguese: A QA-driven Approach using ACE-2005","summary":" Event extraction is an Information Retrieval task that commonly consists of\nidentifying the central word for the event (trigger) and the event's arguments.\nThis task has been extensively studied for English but lags behind for\nPortuguese, partly due to the lack of task-specific annotated corpora. This\npaper proposes a framework in which two separated BERT-based models were\nfine-tuned to identify and classify events in Portuguese documents. We\ndecompose this task into two sub-tasks. Firstly, we use a token classification\nmodel to detect event triggers. To extract event arguments, we train a Question\nAnswering model that queries the triggers about their corresponding event\nargument roles. Given the lack of event annotated corpora in Portuguese, we\ntranslated the original version of the ACE-2005 dataset (a reference in the\nfield) into Portuguese, producing a new corpus for Portuguese event extraction.\nTo accomplish this, we developed an automatic translation pipeline. Our\nframework obtains F1 marks of 64.4 for trigger classification and 46.7 for\nargument classification setting, thus a new state-of-the-art reference for\nthese tasks in Portuguese.\n","authors":["Luís Filipe Cunha","Ricardo Campos","Alípio Jorge"],"pdf_url":"https://arxiv.org/pdf/2408.16932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16928v1","updated":"2024-08-29T22:05:08Z","published":"2024-08-29T22:05:08Z","title":"ACE-2005-PT: Corpus for Event Extraction in Portuguese","summary":" Event extraction is an NLP task that commonly involves identifying the\ncentral word (trigger) for an event and its associated arguments in text.\nACE-2005 is widely recognised as the standard corpus in this field. While other\ncorpora, like PropBank, primarily focus on annotating predicate-argument\nstructure, ACE-2005 provides comprehensive information about the overall event\nstructure and semantics. However, its limited language coverage restricts its\nusability. This paper introduces ACE-2005-PT, a corpus created by translating\nACE-2005 into Portuguese, with European and Brazilian variants. To speed up the\nprocess of obtaining ACE-2005-PT, we rely on automatic translators. This,\nhowever, poses some challenges related to automatically identifying the correct\nalignments between multi-word annotations in the original text and in the\ncorresponding translated sentence. 
To achieve this, we developed an alignment\npipeline that incorporates several alignment techniques: lemmatization, fuzzy\nmatching, synonym matching, multiple translations and a BERT-based word\naligner. To measure the alignment effectiveness, a subset of annotations from\nthe ACE-2005-PT corpus was manually aligned by a linguist expert. This subset\nwas then compared against our pipeline results which achieved exact and relaxed\nmatch scores of 70.55\\% and 87.55\\% respectively. As a result, we successfully\ngenerated a Portuguese version of the ACE-2005 corpus, which has been accepted\nfor publication by LDC.\n","authors":["Luís Filipe Cunha","Purificação Silvano","Ricardo Campos","Alípio Jorge"],"pdf_url":"https://arxiv.org/pdf/2408.16928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05893v4","updated":"2024-08-29T21:34:22Z","published":"2024-04-08T22:29:53Z","title":"Use of a Structured Knowledge Base Enhances Metadata Curation by Large\n Language Models","summary":" Metadata play a crucial role in ensuring the findability, accessibility,\ninteroperability, and reusability of datasets. This paper investigates the\npotential of large language models (LLMs), specifically GPT-4, to improve\nadherence to metadata standards. We conducted experiments on 200 random data\nrecords describing human samples relating to lung cancer from the NCBI\nBioSample repository, evaluating GPT-4's ability to suggest edits for adherence\nto metadata standards. We computed the adherence accuracy of field name-field\nvalue pairs through a peer review process, and we observed a marginal average\nimprovement in adherence to the standard data dictionary from 79% to 80%\n(p<0.5). We then prompted GPT-4 with domain information in the form of the\ntextual descriptions of CEDAR templates and recorded a significant improvement\nto 97% from 79% (p<0.01). These results indicate that, while LLMs may not be\nable to correct legacy metadata to ensure satisfactory adherence to standards\nwhen unaided, they do show promise for use in automated metadata curation when\nintegrated with a structured knowledge base\n","authors":["Sowmya S. Sundaram","Benjamin Solomon","Avani Khatri","Anisha Laumas","Purvesh Khatri","Mark A. Musen"],"pdf_url":"https://arxiv.org/pdf/2404.05893v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16893v1","updated":"2024-08-29T20:27:05Z","published":"2024-08-29T20:27:05Z","title":"Exploring Multiple Strategies to Improve Multilingual Coreference\n Resolution in CorefUD","summary":" Coreference resolution, the task of identifying expressions in text that\nrefer to the same entity, is a critical component in various natural language\nprocessing (NLP) applications. This paper presents our end-to-end neural\ncoreference resolution system, utilizing the CorefUD 1.1 dataset, which spans\n17 datasets across 12 languages. We first establish strong baseline models,\nincluding monolingual and cross-lingual variations, and then propose several\nextensions to enhance performance across diverse linguistic contexts. These\nextensions include cross-lingual training, incorporation of syntactic\ninformation, a Span2Head model for optimized headword prediction, and advanced\nsingleton modeling. We also experiment with headword span representation and\nlong-documents modeling through overlapping segments. The proposed extensions,\nparticularly the heads-only approach, singleton modeling, and long document\nprediction significantly improve performance across most datasets. 
We also\nperform zero-shot cross-lingual experiments, highlighting the potential and\nlimitations of cross-lingual transfer in coreference resolution. Our findings\ncontribute to the development of robust and scalable coreference systems for\nmultilingual coreference resolution. Finally, we evaluate our model on the CorefUD\n1.1 test set and surpass the best model from the CRAC 2023 shared task of a\ncomparable size by a large margin. Our model is available on GitHub:\n\url{https://github.com/ondfa/coref-multiling}\n","authors":["Ondřej Pražák","Miloslav Konopík"],"pdf_url":"https://arxiv.org/pdf/2408.16893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06266v3","updated":"2024-08-29T20:26:19Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16889v1","updated":"2024-08-29T20:20:49Z","published":"2024-08-29T20:20:49Z","title":"LLaVA-Chef: A Multi-modal Generative Model for Food Recipes","summary":" In the rapidly evolving landscape of online recipe sharing within a\nglobalized context, there has been a notable surge in research towards\ncomprehending and generating food recipes. Recent advancements in large\nlanguage models (LLMs) like GPT-2 and LLaVA have paved the way for Natural\nLanguage Processing (NLP) approaches to delve deeper into various facets of\nfood-related tasks, encompassing ingredient recognition and comprehensive\nrecipe generation. Despite impressive performance and multi-modal adaptability\nof LLMs, domain-specific training remains paramount for their effective\napplication. This work evaluates existing LLMs for recipe generation and\nproposes LLaVA-Chef, a novel model trained on a curated dataset of diverse\nrecipe prompts in a multi-stage approach. First, we refine the mapping of\nvisual food image embeddings to the language space. Second, we adapt LLaVA to\nthe food domain by fine-tuning it on relevant recipe data. 
Third, we utilize\ndiverse prompts to enhance the model's recipe comprehension. Finally, we\nimprove the linguistic quality of generated recipes by penalizing the model\nwith a custom loss function. LLaVA-Chef demonstrates impressive improvements\nover pretrained LLMs and prior works. A detailed qualitative analysis reveals\nthat LLaVA-Chef generates more detailed recipes with precise ingredient\nmentions, compared to existing approaches.\n","authors":["Fnu Mohbat","Mohammed J. Zaki"],"pdf_url":"https://arxiv.org/pdf/2408.16889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15604v3","updated":"2024-08-29T20:05:27Z","published":"2024-05-24T14:38:11Z","title":"Text Generation: A Systematic Literature Review of Tasks, Evaluation,\n and Challenges","summary":" Text generation has become more accessible than ever, and the increasing\ninterest in these systems, especially those using large language models, has\nspurred an increasing number of related publications. We provide a systematic\nliterature review comprising 244 selected papers between 2017 and 2024. This\nreview categorizes works in text generation into five main tasks: open-ended\ntext generation, summarization, translation, paraphrasing, and question\nanswering. For each task, we review their relevant characteristics, sub-tasks,\nand specific challenges (e.g., missing datasets for multi-document\nsummarization, coherence in story generation, and complex reasoning for\nquestion answering). Additionally, we assess current approaches for evaluating\ntext generation systems and ascertain problems with current metrics. Our\ninvestigation shows nine prominent challenges common to all tasks and sub-tasks\nin recent text generation publications: bias, reasoning, hallucinations,\nmisuse, privacy, interpretability, transparency, datasets, and computing. We\nprovide a detailed analysis of these challenges, their potential solutions, and\nwhich gaps still require further engagement from the community. This systematic\nliterature review targets two main audiences: early career researchers in\nnatural language processing looking for an overview of the field and promising\nresearch directions, as well as experienced researchers seeking a detailed view\nof tasks, evaluation methodologies, open challenges, and recent mitigation\nstrategies.\n","authors":["Jonas Becker","Jan Philip Wahle","Bela Gipp","Terry Ruas"],"pdf_url":"https://arxiv.org/pdf/2405.15604v3.pdf","comment":"35 pages, 2 figures, 2 tables, Under review"},{"id":"http://arxiv.org/abs/2408.14772v2","updated":"2024-08-29T19:50:33Z","published":"2024-08-27T04:20:10Z","title":"A global AI community requires language-diverse publishing","summary":" In this provocation, we discuss the English dominance of the AI research\ncommunity, arguing that the requirement for English language publishing upholds\nand reinforces broader regimes of extraction in AI. While large language models\nand machine translation have been celebrated as a way to break down barriers,\nwe regard their use as a symptom of linguistic exclusion of scientists and\npotential readers. We propose alternative futures for a healthier publishing\nculture, organized around three themes: administering conferences in the\nlanguages of the country in which they are held, instructing peer reviewers not\nto adjudicate the language appropriateness of papers, and offering\nopportunities to publish and present in multiple languages. We welcome new\ntranslations of this piece. 
Please contact the authors if you would like to\ncontribute one.\n","authors":["Haley Lepp","Parth Sarin"],"pdf_url":"https://arxiv.org/pdf/2408.14772v2.pdf","comment":"Translations by Tianyu M. Fang (Mandarin Chinese), Michael Hardy\n (Guarani), Vandana Sarin and Vivek Sarin (Hindi), Roshna Omer Abdulrahman\n (Soran\\^i Kurdish), Gabriel Poesia (Portuguese), and Mat\\'ias Grinberg\n (Spanish). In the proceedings of the Global AI Cultures Workshop at the\n Twelfth International Conference on Learning Representations (ICLR) 2024,\n Vienna, Austria, May 7-11, 2024"},{"id":"http://arxiv.org/abs/2211.09944v3","updated":"2024-08-29T19:25:59Z","published":"2022-11-17T23:38:29Z","title":"MelHuBERT: A simplified HuBERT on Mel spectrograms","summary":" Self-supervised models have had great success in learning speech\nrepresentations that can generalize to various downstream tasks. However, most\nself-supervised models require a large amount of compute and multiple GPUs to\ntrain, significantly hampering the development of self-supervised learning. In\nan attempt to reduce the computation of training, we revisit the training of\nHuBERT, a highly successful self-supervised model. We improve and simplify\nseveral key components, including the loss function, input representation, and\ntraining in multiple stages. Our model, MelHuBERT, is able to achieve favorable\nperformance on phone recognition, speaker identification, and automatic speech\nrecognition against HuBERT, while saving 31.2% of the pre-training time, or\nequivalently 33.5% MACs per one second speech. The code and pre-trained models\nare available in https://github.com/nervjack2/MelHuBERT.\n","authors":["Tzu-Quan Lin","Hung-yi Lee","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2211.09944v3.pdf","comment":"ASRU 2023"},{"id":"http://arxiv.org/abs/2406.11402v2","updated":"2024-08-29T19:24:29Z","published":"2024-06-17T10:45:36Z","title":"Are Small Language Models Ready to Compete with Large Language Models\n for Practical Applications?","summary":" The rapid rise of Language Models (LMs) has expanded their use in several\napplications. Yet, due to constraints of model size, associated cost, or\nproprietary restrictions, utilizing state-of-the-art (SOTA) LLMs is not always\nfeasible. With open, smaller LMs emerging, more applications can leverage their\ncapabilities, but selecting the right LM can be challenging as smaller LMs\ndon't perform well universally. This work tries to bridge this gap by proposing\na framework to experimentally evaluate small, open LMs in practical settings\nthrough measuring semantic correctness of outputs across three practical\naspects: task types, application domains and reasoning types, using diverse\nprompt styles. It also conducts an in-depth comparison of 10 small, open LMs to\nidentify best LM and prompt style depending on specific application requirement\nusing the proposed framework. We also show that if selected appropriately, they\ncan outperform SOTA LLMs like DeepSeek-v2, GPT-4o-mini, Gemini-1.5-Pro, and\neven compete with GPT-4o.\n","authors":["Neelabh Sinha","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2406.11402v2.pdf","comment":"Submitted to ARR"},{"id":"http://arxiv.org/abs/2310.12404v2","updated":"2024-08-29T19:08:54Z","published":"2023-10-19T01:20:12Z","title":"Loop Copilot: Conducting AI Ensembles for Music Generation and Iterative\n Editing","summary":" Creating music is iterative, requiring varied methods at each stage. 
However,\nexisting AI music systems fall short in orchestrating multiple subsystems for\ndiverse needs. To address this gap, we introduce Loop Copilot, a novel system\nthat enables users to generate and iteratively refine music through an\ninteractive, multi-round dialogue interface. The system uses a large language\nmodel to interpret user intentions and select appropriate AI models for task\nexecution. Each backend model is specialized for a specific task, and their\noutputs are aggregated to meet the user's requirements. To ensure musical\ncoherence, essential attributes are maintained in a centralized table. We\nevaluate the effectiveness of the proposed system through semi-structured\ninterviews and questionnaires, highlighting its utility not only in\nfacilitating music creation but also its potential for broader applications.\n","authors":["Yixiao Zhang","Akira Maezawa","Gus Xia","Kazuhiko Yamamoto","Simon Dixon"],"pdf_url":"https://arxiv.org/pdf/2310.12404v2.pdf","comment":"Source code and demo video are available at\n \\url{https://sites.google.com/view/loop-copilot}"},{"id":"http://arxiv.org/abs/2408.16857v1","updated":"2024-08-29T18:47:41Z","published":"2024-08-29T18:47:41Z","title":"Modeling offensive content detection for TikTok","summary":" The advent of social media transformed interpersonal communication and\ninformation consumption processes. This digital landscape accommodates user\nintentions, also resulting in an increase of offensive language and harmful\nbehavior. Concurrently, social media platforms collect vast datasets comprising\nuser-generated content and behavioral information. These datasets are\ninstrumental for platforms deploying machine learning and data-driven\nstrategies, facilitating customer insights and countermeasures against social\nmanipulation mechanisms like disinformation and offensive content.\nNevertheless, the availability of such datasets, along with the application of\nvarious machine learning techniques, to researchers and practitioners, for\nspecific social media platforms regarding particular events, is limited. In\nparticular for TikTok, which offers unique tools for personalized content\ncreation and sharing, the existing body of knowledge would benefit from having\ndiverse comprehensive datasets and associated data analytics solutions on\noffensive content. While efforts from social media platforms, research, and\npractitioner communities are seen on this behalf, such content continues to\nproliferate. This translates to an essential need to make datasets publicly\navailable and build corresponding intelligent solutions. On this behalf, this\nresearch undertakes the collection and analysis of TikTok data containing\noffensive content, building a series of machine learning and deep learning\nmodels for offensive content detection. This is done aiming at answering the\nfollowing research question: \"How to develop a series of computational models\nto detect offensive content on TikTok?\". 
To this end, a Data Science\nmethodological approach is considered, 120.423 TikTok comments are collected,\nand, using a balanced, binary classification approach, an F1 score\nof 0.863 is obtained.\n","authors":["Kasper Cools","Gideon Mailette de Buy Wenniger","Clara Maathuis"],"pdf_url":"https://arxiv.org/pdf/2408.16857v1.pdf","comment":"Accepted as a conference paper at DPSH 2024, 8 pages"},{"id":"http://arxiv.org/abs/2408.16809v1","updated":"2024-08-29T17:59:57Z","published":"2024-08-29T17:59:57Z","title":"See or Guess: Counterfactually Regularized Image Captioning","summary":" Image captioning, which generates natural language descriptions of the visual\ninformation in an image, is a crucial task in vision-language research.\nPrevious models have typically addressed this task by aligning the generative\ncapabilities of machines with human intelligence through statistical fitting of\nexisting datasets. While effective for normal images, they may struggle to\naccurately describe those where certain parts of the image are obscured or\nedited, unlike humans who excel in such cases. These weaknesses they exhibit,\nincluding hallucinations and limited interpretability, often hinder performance\nin scenarios with shifted association patterns. In this paper, we present a\ngeneric image captioning framework that employs causal inference to make\nexisting models more capable of interventional tasks, and counterfactually\nexplainable. Our approach includes two variants leveraging either total effect\nor natural direct effect. Integrating them into the training process enables\nmodels to handle counterfactual scenarios, increasing their generalizability.\nExtensive experiments on various datasets show that our method effectively\nreduces hallucinations and improves the model's faithfulness to images,\ndemonstrating high portability across both small-scale and large-scale\nimage-to-text models. The code is available at\nhttps://github.com/Aman-4-Real/See-or-Guess.\n","authors":["Qian Cao","Xu Chen","Ruihua Song","Xiting Wang","Xinting Huang","Yuchen Ren"],"pdf_url":"https://arxiv.org/pdf/2408.16809v1.pdf","comment":"Accepted by ACM MM 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.16770v1","updated":"2024-08-29T17:59:54Z","published":"2024-08-29T17:59:54Z","title":"3D Whole-body Grasp Synthesis with Directional Controllability","summary":" Synthesizing 3D whole-bodies that realistically grasp objects is useful for\nanimation, mixed reality, and robotics. This is challenging, because the hands\nand body need to look natural w.r.t. each other, the grasped object, as well as\nthe local scene (i.e., a receptacle supporting the object). Only recent work\ntackles this, with a divide-and-conquer approach; it first generates a\n\"guiding\" right-hand grasp, and then searches for bodies that match this.\nHowever, the guiding-hand synthesis lacks controllability and receptacle\nawareness, so it likely has an implausible direction (i.e., a body can't match\nthis without penetrating the receptacle) and needs corrections through major\npost-processing. Moreover, the body search needs exhaustive sampling and is\nexpensive. These are strong limitations. We tackle these with a novel method\ncalled CWGrasp. Our key idea is that performing geometry-based reasoning \"early\non,\" instead of \"too late,\" provides rich \"control\" signals for inference. 
To\nthis end, CWGrasp first samples a plausible reaching-direction vector (used\nlater for both the arm and hand) from a probabilistic model built via\nraycasting from the object and collision checking. Then, it generates a\nreaching body with a desired arm direction, as well as a \"guiding\" grasping\nhand with a desired palm direction that complies with the arm's one.\nEventually, CWGrasp refines the body to match the \"guiding\" hand, while\nplausibly contacting the scene. Notably, generating already-compatible \"parts\"\ngreatly simplifies the \"whole.\" Moreover, CWGrasp uniquely tackles both right-\nand left-hand grasps. We evaluate on the GRAB and ReplicaGrasp datasets.\nCWGrasp outperforms baselines, at lower runtime and budget, while all\ncomponents help performance. Code and models will be released.\n","authors":["Georgios Paschalidis","Romana Wilschut","Dimitrije Antić","Omid Taheri","Dimitrios Tzionas"],"pdf_url":"https://arxiv.org/pdf/2408.16770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16768v1","updated":"2024-08-29T17:59:45Z","published":"2024-08-29T17:59:45Z","title":"SAM2Point: Segment Any 3D as Videos in Zero-shot and Promptable Manners","summary":" We introduce SAM2Point, a preliminary exploration adapting Segment Anything\nModel 2 (SAM 2) for zero-shot and promptable 3D segmentation. SAM2Point\ninterprets any 3D data as a series of multi-directional videos, and leverages\nSAM 2 for 3D-space segmentation, without further training or 2D-3D projection.\nOur framework supports various prompt types, including 3D points, boxes, and\nmasks, and can generalize across diverse scenarios, such as 3D objects, indoor\nscenes, outdoor environments, and raw sparse LiDAR. Demonstrations on multiple\n3D datasets, e.g., Objaverse, S3DIS, ScanNet, Semantic3D, and KITTI, highlight\nthe robust generalization capabilities of SAM2Point. To our best knowledge, we\npresent the most faithful implementation of SAM in 3D, which may serve as a\nstarting point for future research in promptable 3D segmentation. Online Demo:\nhttps://huggingface.co/spaces/ZiyuG/SAM2Point . Code:\nhttps://github.com/ZiyuGuo99/SAM2Point .\n","authors":["Ziyu Guo","Renrui Zhang","Xiangyang Zhu","Chengzhuo Tong","Peng Gao","Chunyuan Li","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2408.16768v1.pdf","comment":"Work in progress. Online Demo:\n https://huggingface.co/spaces/ZiyuG/SAM2Point . Code:\n https://github.com/ZiyuGuo99/SAM2Point"},{"id":"http://arxiv.org/abs/2408.16769v1","updated":"2024-08-29T17:59:45Z","published":"2024-08-29T17:59:45Z","title":"PromptSmooth: Certifying Robustness of Medical Vision-Language Models\n via Prompt Learning","summary":" Medical vision-language models (Med-VLMs) trained on large datasets of\nmedical image-text pairs and later fine-tuned for specific tasks have emerged\nas a mainstream paradigm in medical image analysis. However, recent studies\nhave highlighted the susceptibility of these Med-VLMs to adversarial attacks,\nraising concerns about their safety and robustness. Randomized smoothing is a\nwell-known technique for turning any classifier into a model that is\ncertifiably robust to adversarial perturbations. However, this approach\nrequires retraining the Med-VLM-based classifier so that it classifies well\nunder Gaussian noise, which is often infeasible in practice. In this paper, we\npropose a novel framework called PromptSmooth to achieve efficient certified\nrobustness of Med-VLMs by leveraging the concept of prompt learning. 
Given any\npre-trained Med-VLM, PromptSmooth adapts it to handle Gaussian noise by\nlearning textual prompts in a zero-shot or few-shot manner, achieving a\ndelicate balance between accuracy and robustness, while minimizing the\ncomputational overhead. Moreover, PromptSmooth requires only a single model to\nhandle multiple noise levels, which substantially reduces the computational\ncost compared to traditional methods that rely on training a separate model for\neach noise level. Comprehensive experiments based on three Med-VLMs and across\nsix downstream datasets of various imaging modalities demonstrate the efficacy\nof PromptSmooth. Our code and models are available at\nhttps://github.com/nhussein/promptsmooth.\n","authors":["Noor Hussein","Fahad Shamshad","Muzammal Naseer","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2408.16769v1.pdf","comment":"Accepted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.16767v1","updated":"2024-08-29T17:59:40Z","published":"2024-08-29T17:59:40Z","title":"ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion\n Model","summary":" Advancements in 3D scene reconstruction have transformed 2D images from the\nreal world into 3D models, producing realistic 3D results from hundreds of\ninput photos. Despite great success in dense-view reconstruction scenarios,\nrendering a detailed scene from insufficient captured views is still an\nill-posed optimization problem, often resulting in artifacts and distortions in\nunseen areas. In this paper, we propose ReconX, a novel 3D scene reconstruction\nparadigm that reframes the ambiguous reconstruction challenge as a temporal\ngeneration task. The key insight is to unleash the strong generative prior of\nlarge pre-trained video diffusion models for sparse-view reconstruction.\nHowever, 3D view consistency struggles to be accurately preserved in directly\ngenerated video frames from pre-trained models. To address this, given limited\ninput views, the proposed ReconX first constructs a global point cloud and\nencodes it into a contextual space as the 3D structure condition. Guided by the\ncondition, the video diffusion model then synthesizes video frames that are\nboth detail-preserved and exhibit a high degree of 3D consistency, ensuring the\ncoherence of the scene from various perspectives. Finally, we recover the 3D\nscene from the generated video through a confidence-aware 3D Gaussian Splatting\noptimization scheme. Extensive experiments on various real-world datasets show\nthe superiority of our ReconX over state-of-the-art methods in terms of quality\nand generalizability.\n","authors":["Fangfu Liu","Wenqiang Sun","Hanyang Wang","Yikai Wang","Haowen Sun","Junliang Ye","Jun Zhang","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2408.16767v1.pdf","comment":"Project page: https://liuff19.github.io/ReconX"},{"id":"http://arxiv.org/abs/2408.16766v1","updated":"2024-08-29T17:59:30Z","published":"2024-08-29T17:59:30Z","title":"CSGO: Content-Style Composition in Text-to-Image Generation","summary":" The diffusion model has shown exceptional capabilities in controlled image\ngeneration, which has further fueled interest in image style transfer. Existing\nworks mainly focus on training free-based methods (e.g., image inversion) due\nto the scarcity of specific data. In this study, we present a data construction\npipeline for content-style-stylized image triplets that generates and\nautomatically cleanses stylized data triplets. 
Based on this pipeline, we\nconstruct a dataset IMAGStyle, the first large-scale style transfer dataset\ncontaining 210k image triplets, available for the community to explore and\nresearch. Equipped with IMAGStyle, we propose CSGO, a style transfer model\nbased on end-to-end training, which explicitly decouples content and style\nfeatures employing independent feature injection. The unified CSGO implements\nimage-driven style transfer, text-driven stylized synthesis, and text\nediting-driven stylized synthesis. Extensive experiments demonstrate the\neffectiveness of our approach in enhancing style control capabilities in image\ngeneration. Additional visualization and access to the source code can be\nlocated on the project page: \\url{https://csgo-gen.github.io/}.\n","authors":["Peng Xing","Haofan Wang","Yanpeng Sun","Qixun Wang","Xu Bai","Hao Ai","Renyuan Huang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2408.16766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16762v1","updated":"2024-08-29T17:57:05Z","published":"2024-08-29T17:57:05Z","title":"UV-free Texture Generation with Denoising and Geodesic Heat Diffusions","summary":" Seams, distortions, wasted UV space, vertex-duplication, and varying\nresolution over the surface are the most prominent issues of the standard\nUV-based texturing of meshes. These issues are particularly acute when\nautomatic UV-unwrapping techniques are used. For this reason, instead of\ngenerating textures in automatically generated UV-planes like most\nstate-of-the-art methods, we propose to represent textures as coloured\npoint-clouds whose colours are generated by a denoising diffusion probabilistic\nmodel constrained to operate on the surface of 3D objects. Our sampling and\nresolution agnostic generative model heavily relies on heat diffusion over the\nsurface of the meshes for spatial communication between points. To enable\nprocessing of arbitrarily sampled point-cloud textures and ensure long-distance\ntexture consistency we introduce a fast re-sampling of the mesh spectral\nproperties used during the heat diffusion and introduce a novel\nheat-diffusion-based self-attention mechanism. Our code and pre-trained models\nare available at github.com/simofoti/UV3-TeD.\n","authors":["Simone Foti","Stefanos Zafeiriou","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2408.16762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16760v1","updated":"2024-08-29T17:56:33Z","published":"2024-08-29T17:56:33Z","title":"OmniRe: Omni Urban Scene Reconstruction","summary":" We introduce OmniRe, a holistic approach for efficiently reconstructing\nhigh-fidelity dynamic urban scenes from on-device logs. Recent methods for\nmodeling driving sequences using neural radiance fields or Gaussian Splatting\nhave demonstrated the potential of reconstructing challenging dynamic scenes,\nbut often overlook pedestrians and other non-vehicle dynamic actors, hindering\na complete pipeline for dynamic urban scene reconstruction. To that end, we\npropose a comprehensive 3DGS framework for driving scenes, named OmniRe, that\nallows for accurate, full-length reconstruction of diverse dynamic objects in a\ndriving log. OmniRe builds dynamic neural scene graphs based on Gaussian\nrepresentations and constructs multiple local canonical spaces that model\nvarious dynamic actors, including vehicles, pedestrians, and cyclists, among\nmany others. This capability is unmatched by existing methods. 
OmniRe allows us\nto holistically reconstruct different objects present in the scene,\nsubsequently enabling the simulation of reconstructed scenarios with all actors\nparticipating in real-time (~60Hz). Extensive evaluations on the Waymo dataset\nshow that our approach outperforms prior state-of-the-art methods\nquantitatively and qualitatively by a large margin. We believe our work fills a\ncritical gap in driving reconstruction.\n","authors":["Ziyu Chen","Jiawei Yang","Jiahui Huang","Riccardo de Lutio","Janick Martinez Esturo","Boris Ivanovic","Or Litany","Zan Gojcic","Sanja Fidler","Marco Pavone","Li Song","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16760v1.pdf","comment":"See the project page for code, video results and demos:\n https://ziyc.github.io/omnire/"},{"id":"http://arxiv.org/abs/2407.10972v2","updated":"2024-08-29T17:55:52Z","published":"2024-07-15T17:59:55Z","title":"VGBench: Evaluating Large Language Models on Vector Graphics\n Understanding and Generation","summary":" In the realm of vision models, the primary mode of representation is using\npixels to rasterize the visual world. Yet this is not always the best or unique\nway to represent visual content, especially for designers and artists who\ndepict the world using geometry primitives such as polygons. Vector graphics\n(VG), on the other hand, offer a textual representation of visual content,\nwhich can be more concise and powerful for content like cartoons, sketches and\nscientific figures. Recent studies have shown promising results on processing\nvector graphics with capable Large Language Models (LLMs). However, such works\nfocus solely on qualitative results, understanding, or a specific type of\nvector graphics. We propose VGBench, a comprehensive benchmark for LLMs on\nhandling vector graphics through diverse aspects, including (a) both visual\nunderstanding and generation, (b) evaluation of various vector graphics\nformats, (c) diverse question types, (d) wide range of prompting techniques,\n(e) under multiple LLMs and (f) comparison with VLMs on rasterized\nrepresentations. Evaluating on our collected 4279 understanding and 5845\ngeneration samples, we find that LLMs show strong capability on both aspects\nwhile exhibiting less desirable performance on low-level formats (SVG). Both\ndata and evaluation pipeline will be open-sourced at https://vgbench.github.io.\n","authors":["Bocheng Zou","Mu Cai","Jianrui Zhang","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2407.10972v2.pdf","comment":"Project Page: https://vgbench.github.io"},{"id":"http://arxiv.org/abs/2408.16757v1","updated":"2024-08-29T17:55:07Z","published":"2024-08-29T17:55:07Z","title":"Dissecting Out-of-Distribution Detection and Open-Set Recognition: A\n Critical Analysis of Methods and Benchmarks","summary":" Detecting test-time distribution shift has emerged as a key capability for\nsafely deployed machine learning models, with the question being tackled under\nvarious guises in recent years. In this paper, we aim to provide a consolidated\nview of the two largest sub-fields within the community: out-of-distribution\n(OOD) detection and open-set recognition (OSR). In particular, we aim to\nprovide rigorous empirical analysis of different methods across settings and\nprovide actionable takeaways for practitioners and researchers. 
Concretely, we\nmake the following contributions: (i) We perform rigorous cross-evaluation\nbetween state-of-the-art methods in the OOD detection and OSR settings and\nidentify a strong correlation between the performances of methods for them;\n(ii) We propose a new, large-scale benchmark setting which we suggest better\ndisentangles the problem tackled by OOD detection and OSR, re-evaluating\nstate-of-the-art OOD detection and OSR methods in this setting; (iii) We\nsurprisingly find that the best performing method on standard benchmarks\n(Outlier Exposure) struggles when tested at scale, while scoring rules which\nare sensitive to the deep feature magnitude consistently show promise; and (iv)\nWe conduct empirical analysis to explain these phenomena and highlight\ndirections for future research. Code:\n\\url{https://github.com/Visual-AI/Dissect-OOD-OSR}\n","authors":["Hongjun Wang","Sagar Vaze","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2408.16757v1.pdf","comment":"Accepted to IJCV, preprint version"},{"id":"http://arxiv.org/abs/2408.11817v2","updated":"2024-08-29T17:47:47Z","published":"2024-08-21T17:59:32Z","title":"GRAB: A Challenging GRaph Analysis Benchmark for Large Multimodal Models","summary":" Large multimodal models (LMMs) have exhibited proficiencies across many\nvisual tasks. Although numerous well-known benchmarks exist to evaluate model\nperformance, they increasingly have insufficient headroom. As such, there is a\npressing need for a new generation of benchmarks challenging enough for the\nnext generation of LMMs. One area that LMMs show potential is graph analysis,\nspecifically, the tasks an analyst might typically perform when interpreting\nfigures such as estimating the mean, intercepts or correlations of functions\nand data series. In this work, we introduce GRAB, a graph analysis benchmark,\nfit for current and future frontier LMMs. Our benchmark is entirely synthetic,\nensuring high-quality, noise-free questions. GRAB is comprised of 2170\nquestions, covering four tasks and 23 graph properties. We evaluate 20 LMMs on\nGRAB, finding it to be a challenging benchmark, with the highest performing\nmodel attaining a score of just 21.7%. Finally, we conduct various ablations to\ninvestigate where the models succeed and struggle. We release GRAB to encourage\nprogress in this important, growing domain.\n","authors":["Jonathan Roberts","Kai Han","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2408.11817v2.pdf","comment":"V2: Fixed references formatting"},{"id":"http://arxiv.org/abs/2408.16730v1","updated":"2024-08-29T17:21:58Z","published":"2024-08-29T17:21:58Z","title":"VideoLLM-MoD: Efficient Video-Language Streaming with Mixture-of-Depths\n Vision Computation","summary":" A well-known dilemma in large vision-language models (e.g., GPT-4, LLaVA) is\nthat while increasing the number of vision tokens generally enhances visual\nunderstanding, it also significantly raises memory and computational costs,\nespecially in long-term, dense video frame streaming scenarios. Although\nlearnable approaches like Q-Former and Perceiver Resampler have been developed\nto reduce the vision token burden, they overlook the context causally modeled\nby LLMs (i.e., key-value cache), potentially leading to missed visual cues when\naddressing user queries. In this paper, we introduce a novel approach to reduce\nvision compute by leveraging redundant vision tokens \"skipping layers\" rather\nthan decreasing the number of vision tokens. 
Our method, VideoLLM-MoD, is\ninspired by mixture-of-depths LLMs and addresses the challenge of numerous\nvision tokens in long-term or streaming video. Specifically, for each\ntransformer layer, we learn to skip the computation for a high proportion\n(e.g., 80\\%) of vision tokens, passing them directly to the next layer. This\napproach significantly enhances model efficiency, achieving approximately\n\\textasciitilde42\\% time and \\textasciitilde30\\% memory savings for the entire\ntraining. Moreover, our method reduces the computation in the context and avoid\ndecreasing the vision tokens, thus preserving or even improving performance\ncompared to the vanilla model. We conduct extensive experiments to demonstrate\nthe effectiveness of VideoLLM-MoD, showing its state-of-the-art results on\nmultiple benchmarks, including narration, forecasting, and summarization tasks\nin COIN, Ego4D, and Ego-Exo4D datasets.\n","authors":["Shiwei Wu","Joya Chen","Kevin Qinghong Lin","Qimeng Wang","Yan Gao","Qianli Xu","Tong Xu","Yao Hu","Enhong Chen","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2408.16730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09359v4","updated":"2024-08-29T17:21:27Z","published":"2024-04-14T21:14:47Z","title":"Evaluation Framework for Feedback Generation Methods in Skeletal\n Movement Assessment","summary":" The application of machine-learning solutions to movement assessment from\nskeleton videos has attracted significant research attention in recent years.\nThis advancement has made rehabilitation at home more accessible, utilizing\nmovement assessment algorithms that can operate on affordable equipment for\nhuman pose detection and analysis from 2D or 3D videos. While the primary\nobjective of automatic assessment tasks is to score movements, the automatic\ngeneration of feedback highlighting key movement issues has the potential to\nsignificantly enhance and accelerate the rehabilitation process. While numerous\nresearch works exist in the field of automatic movement assessment, only a\nhandful address feedback generation. In this study, we propose terminology and\ncriteria for the classification, evaluation, and comparison of feedback\ngeneration solutions. We discuss the challenges associated with each feedback\ngeneration approach and use our proposed criteria to classify existing\nsolutions. To our knowledge, this is the first work that formulates feedback\ngeneration in skeletal movement assessment.\n","authors":["Tal Hakim"],"pdf_url":"https://arxiv.org/pdf/2404.09359v4.pdf","comment":"Accepted to xAI4Biometrics 2024 at ECCV 2024"},{"id":"http://arxiv.org/abs/2408.16729v1","updated":"2024-08-29T17:20:59Z","published":"2024-08-29T17:20:59Z","title":"Prediction-Feedback DETR for Temporal Action Detection","summary":" Temporal Action Detection (TAD) is fundamental yet challenging for real-world\nvideo applications. Leveraging the unique benefits of transformers, various\nDETR-based approaches have been adopted in TAD. However, it has recently been\nidentified that the attention collapse in self-attention causes the performance\ndegradation of DETR for TAD. Building upon previous research, this paper newly\naddresses the attention collapse problem in cross-attention within DETR-based\nTAD methods. Moreover, our findings reveal that cross-attention exhibits\npatterns distinct from predictions, indicating a short-cut phenomenon. 
To\nresolve this, we propose a new framework, Prediction-Feedback DETR (Pred-DETR),\nwhich utilizes predictions to restore the collapse and align the cross- and\nself-attention with predictions. Specifically, we devise novel\nprediction-feedback objectives using guidance from the relations of the\npredictions. As a result, Pred-DETR significantly alleviates the collapse and\nachieves state-of-the-art performance among DETR-based methods on various\nchallenging benchmarks including THUMOS14, ActivityNet-v1.3, HACS, and\nFineAction.\n","authors":["Jihwan Kim","Miso Lee","Cheol-Ho Cho","Jihyun Lee","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2408.16729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11933v2","updated":"2024-08-29T17:16:13Z","published":"2024-06-17T15:41:57Z","title":"OpticalRS-4M: Scaling Efficient Masked Autoencoder Learning on Large\n Remote Sensing Dataset","summary":" Masked Image Modeling (MIM) has become an essential method for building\nfoundational visual models in remote sensing (RS). However, the limitations in\nsize and diversity of existing RS datasets restrict the ability of MIM methods\nto learn generalizable representations. Additionally, conventional MIM\ntechniques, which require reconstructing all tokens, introduce unnecessary\ncomputational overhead. To address these issues, we present a new pre-training\npipeline for RS models, featuring the creation of a large-scale RS dataset and\nan efficient MIM approach. We curated a high-quality dataset named OpticalRS-4M\nby collecting publicly available RS datasets and processing them through\nexclusion, slicing, and deduplication. OpticalRS-4M comprises 4 million optical\nimages covering various RS tasks, such as object detection and pixel\nsegmentation. To enhance efficiency, we propose SelectiveMAE, a pre-training\nmethod that dynamically encodes and reconstructs semantically rich patch\ntokens, thereby reducing the inefficiencies of traditional MIM models caused by\nredundant background pixels in RS images. Extensive experiments demonstrate\nthat OpticalRS-4M significantly improves classification, detection, and\nsegmentation performance, while SelectiveMAE increases training efficiency over\n2 times. This highlights the effectiveness and scalability of our pipeline in\ndeveloping RS foundational models.\n","authors":["Fengxiang Wang","Hongzhen Wang","Di Wang","Zonghao Guo","Zhenyu Zhong","Long Lan","Jing Zhang","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2406.11933v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16719v1","updated":"2024-08-29T17:11:38Z","published":"2024-08-29T17:11:38Z","title":"H-SGANet: Hybrid Sparse Graph Attention Network for Deformable Medical\n Image Registration","summary":" The integration of Convolutional Neural Network (ConvNet) and Transformer has\nemerged as a strong candidate for image registration, leveraging the strengths\nof both models and a large parameter space. However, this hybrid model,\ntreating brain MRI volumes as grid or sequence structures, faces challenges in\naccurately representing anatomical connectivity, diverse brain regions, and\nvital connections contributing to the brain's internal architecture. Concerns\nalso arise regarding the computational expense and GPU memory usage associated\nwith this model. To tackle these issues, a lightweight hybrid sparse graph\nattention network (H-SGANet) has been developed. 
This network incorporates a\ncentral mechanism, Sparse Graph Attention (SGA), based on a Vision Graph Neural\nNetwork (ViG) with predetermined anatomical connections. The SGA module expands\nthe model's receptive field and seamlessly integrates into the network. To\nfurther amplify the advantages of the hybrid network, the Separable\nSelf-Attention (SSA) is employed as an enhanced token mixer, integrated with\ndepth-wise convolution to constitute SSAFormer. This strategic integration is\ndesigned to more effectively extract long-range dependencies. As a hybrid\nConvNet-ViG-Transformer model, H-SGANet offers threefold benefits for\nvolumetric medical image registration. It optimizes fixed and moving images\nconcurrently through a hybrid feature fusion layer and an end-to-end learning\nframework. Compared to VoxelMorph, a model with a similar parameter count,\nH-SGANet demonstrates significant performance enhancements of 3.5% and 1.5% in\nDice score on the OASIS dataset and LPBA40 dataset, respectively.\n","authors":["Yufeng Zhou","Wenming Cao"],"pdf_url":"https://arxiv.org/pdf/2408.16719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16704v1","updated":"2024-08-29T16:58:10Z","published":"2024-08-29T16:58:10Z","title":"One-Shot Learning Meets Depth Diffusion in Multi-Object Videos","summary":" Creating editable videos that depict complex interactions between multiple\nobjects in various artistic styles has long been a challenging task in\nfilmmaking. Progress is often hampered by the scarcity of data sets that\ncontain paired text descriptions and corresponding videos that showcase these\ninteractions. This paper introduces a novel depth-conditioning approach that\nsignificantly advances this field by enabling the generation of coherent and\ndiverse videos from just a single text-video pair using a pre-trained\ndepth-aware Text-to-Image (T2I) model. Our method fine-tunes the pre-trained\nmodel to capture continuous motion by employing custom-designed spatial and\ntemporal attention mechanisms. During inference, we use the DDIM inversion to\nprovide structural guidance for video generation. This innovative technique\nallows for continuously controllable depth in videos, facilitating the\ngeneration of multiobject interactions while maintaining the concept generation\nand compositional strengths of the original T2I model across various artistic\nstyles, such as photorealism, animation, and impressionism.\n","authors":["Anisha Jain"],"pdf_url":"https://arxiv.org/pdf/2408.16704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16700v1","updated":"2024-08-29T16:51:07Z","published":"2024-08-29T16:51:07Z","title":"GradBias: Unveiling Word Influence on Bias in Text-to-Image Generative\n Models","summary":" Recent progress in Text-to-Image (T2I) generative models has enabled\nhigh-quality image generation. As performance and accessibility increase, these\nmodels are gaining significant attraction and popularity: ensuring their\nfairness and safety is a priority to prevent the dissemination and perpetuation\nof biases. However, existing studies in bias detection focus on closed sets of\npredefined biases (e.g., gender, ethnicity). In this paper, we propose a\ngeneral framework to identify, quantify, and explain biases in an open set\nsetting, i.e. without requiring a predefined set. This pipeline leverages a\nLarge Language Model (LLM) to propose biases starting from a set of captions.\nNext, these captions are used by the target generative model for generating a\nset of images. 
Finally, Vision Question Answering (VQA) is leveraged for bias\nevaluation. We show two variations of this framework: OpenBias and GradBias.\nOpenBias detects and quantifies biases, while GradBias determines the\ncontribution of individual prompt words on biases. OpenBias effectively detects\nboth well-known and novel biases related to people, objects, and animals and\nhighly aligns with existing closed-set bias detection methods and human\njudgment. GradBias shows that neutral words can significantly influence biases\nand it outperforms several baselines, including state-of-the-art foundation\nmodels. Code available here: https://github.com/Moreno98/GradBias.\n","authors":["Moreno D'Incà","Elia Peruzzo","Massimiliano Mancini","Xingqian Xu","Humphrey Shi","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2408.16700v1.pdf","comment":"Under review. Code: https://github.com/Moreno98/GradBias"},{"id":"http://arxiv.org/abs/2408.16690v1","updated":"2024-08-29T16:37:58Z","published":"2024-08-29T16:37:58Z","title":"Generic Objects as Pose Probes for Few-Shot View Synthesis","summary":" Radiance fields including NeRFs and 3D Gaussians demonstrate great potential\nin high-fidelity rendering and scene reconstruction, while they require a\nsubstantial number of posed images as inputs. COLMAP is frequently employed for\npreprocessing to estimate poses, while it necessitates a large number of\nfeature matches to operate effectively, and it struggles with scenes\ncharacterized by sparse features, large baselines between images, or a limited\nnumber of input images. We aim to tackle few-view NeRF reconstruction using\nonly 3 to 6 unposed scene images. Traditional methods often use calibration\nboards but they are not common in images. We propose a novel idea of utilizing\neveryday objects, commonly found in both images and real life, as \"pose\nprobes\". The probe object is automatically segmented by SAM, whose shape is\ninitialized from a cube. We apply a dual-branch volume rendering optimization\n(object NeRF and scene NeRF) to constrain the pose optimization and jointly\nrefine the geometry. Specifically, object poses of two views are first\nestimated by PnP matching in an SDF representation, which serves as initial\nposes. PnP matching, requiring only a few features, is suitable for\nfeature-sparse scenes. Additional views are incrementally incorporated to\nrefine poses from preceding views. In experiments, PoseProbe achieves\nstate-of-the-art performance in both pose estimation and novel view synthesis\nacross multiple datasets. We demonstrate its effectiveness, particularly in\nfew-view and large-baseline scenes where COLMAP struggles. In ablations, using\ndifferent objects in a scene yields comparable performance.\n","authors":["Zhirui Gao","Renjiao Yi","Chenyang Zhu","Ke Zhuang","Wei Chen","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2408.16690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16684v1","updated":"2024-08-29T16:31:05Z","published":"2024-08-29T16:31:05Z","title":"PartFormer: Awakening Latent Diverse Representation from Vision\n Transformer for Object Re-Identification","summary":" Extracting robust feature representation is critical for object\nre-identification to accurately identify objects across non-overlapping\ncameras. 
Although having a strong representation ability, the Vision\nTransformer (ViT) tends to overfit on most distinct regions of training data,\nlimiting its generalizability and attention to holistic object features.\nMeanwhile, due to the structural difference between CNN and ViT, fine-grained\nstrategies that effectively address this issue in CNN do not continue to be\nsuccessful in ViT. To address this issue, by observing the latent diverse\nrepresentation hidden behind the multi-head attention, we present PartFormer,\nan innovative adaptation of ViT designed to overcome the granularity\nlimitations in object Re-ID tasks. The PartFormer integrates a Head\nDisentangling Block (HDB) that awakens the diverse representation of multi-head\nself-attention without the typical loss of feature richness induced by\nconcatenation and FFN layers post-attention. To avoid the homogenization of\nattention heads and promote robust part-based feature learning, two head\ndiversity constraints are imposed: attention diversity constraint and\ncorrelation diversity constraint. These constraints enable the model to exploit\ndiverse and discriminative feature representations from different attention\nheads. Comprehensive experiments on various object Re-ID benchmarks demonstrate\nthe superiority of the PartFormer. Specifically, our framework significantly\noutperforms state-of-the-art by 2.4\% mAP scores on the most challenging MSMT17\ndataset.\n","authors":["Lei Tan","Pingyang Dai","Jie Chen","Liujuan Cao","Yongjian Wu","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.16684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18915v3","updated":"2024-08-29T16:07:30Z","published":"2024-06-27T06:12:01Z","title":"Manipulate-Anything: Automating Real-World Robots using Vision-Language\n Models","summary":" Large-scale endeavors and widespread community efforts such as\nOpen-X-Embodiment have contributed to growing the scale of robot demonstration\ndata. However, there is still an opportunity to improve the quality, quantity,\nand diversity of robot demonstration data. Although vision-language models have\nbeen shown to automatically generate demonstration data, their utility has been\nlimited to environments with privileged state information, they require\nhand-designed skills, and are limited to interactions with few object\ninstances. We propose Manipulate-Anything, a scalable automated generation\nmethod for real-world robotic manipulation. Unlike prior work, our method can\noperate in real-world environments without any privileged state information,\nhand-designed skills, and can manipulate any static object. We evaluate our\nmethod using two setups. First, Manipulate-Anything successfully generates\ntrajectories for all 7 real-world and 14 simulation tasks, significantly\noutperforming existing methods like VoxPoser. Second, Manipulate-Anything's\ndemonstrations can train more robust behavior cloning policies than training\nwith human demonstrations, or from data generated by VoxPoser, Scaling-up, and\nCode-As-Policies. We believe Manipulate-Anything can be a scalable method for\nboth generating data for robotics and solving novel tasks in a zero-shot\nsetting. Project page: https://robot-ma.github.io/.\n","authors":["Jiafei Duan","Wentao Yuan","Wilbert Pumacay","Yi Ru Wang","Kiana Ehsani","Dieter Fox","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2406.18915v3.pdf","comment":"Project page: https://robot-ma.github.io/. 
All supplementary\n material, prompts and code can be found on the project page"},{"id":"http://arxiv.org/abs/2408.16662v1","updated":"2024-08-29T16:05:22Z","published":"2024-08-29T16:05:22Z","title":"Space3D-Bench: Spatial 3D Question Answering Benchmark","summary":" Answering questions about the spatial properties of the environment poses\nchallenges for existing language and vision foundation models due to a lack of\nunderstanding of the 3D world notably in terms of relationships between\nobjects. To push the field forward, multiple 3D Q&A datasets were proposed\nwhich, overall, provide a variety of questions, but they individually focus on\nparticular aspects of 3D reasoning or are limited in terms of data modalities.\nTo address this, we present Space3D-Bench - a collection of 1000 general\nspatial questions and answers related to scenes of the Replica dataset which\noffers a variety of data modalities: point clouds, posed RGB-D images,\nnavigation meshes and 3D object detections. To ensure that the questions cover\na wide range of 3D objectives, we propose an indoor spatial questions taxonomy\ninspired by geographic information systems and use it to balance the dataset\naccordingly. Moreover, we provide an assessment system that grades natural\nlanguage responses based on predefined ground-truth answers by leveraging a\nVision Language Model's comprehension of both text and images to compare the\nresponses with ground-truth textual information or relevant visual data.\nFinally, we introduce a baseline called RAG3D-Chat integrating the world\nunderstanding of foundation models with rich context retrieval, achieving an\naccuracy of 67% on the proposed dataset.\n","authors":["Emilia Szymanska","Mihai Dusmanu","Jan-Willem Buurlage","Mahdi Rad","Marc Pollefeys"],"pdf_url":"https://arxiv.org/pdf/2408.16662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16661v1","updated":"2024-08-29T16:05:05Z","published":"2024-08-29T16:05:05Z","title":"Eigen-Cluster VIS: Improving Weakly-supervised Video Instance\n Segmentation by Leveraging Spatio-temporal Consistency","summary":" The performance of Video Instance Segmentation (VIS) methods has improved\nsignificantly with the advent of transformer networks. However, these networks\noften face challenges in training due to the high annotation cost. To address\nthis, unsupervised and weakly-supervised methods have been developed to reduce\nthe dependency on annotations. This work introduces a novel weakly-supervised\nmethod called Eigen-cluster VIS that, without requiring any mask annotations,\nachieves competitive accuracy compared to other VIS approaches. This method is\nbased on two key innovations: a Temporal Eigenvalue Loss (TEL) and a clip-level\nQuality Cluster Coefficient (QCC). The TEL ensures temporal coherence by\nleveraging the eigenvalues of the Laplacian matrix derived from graph adjacency\nmatrices. By minimizing the mean absolute error (MAE) between the eigenvalues\nof adjacent frames, this loss function promotes smooth transitions and stable\nsegmentation boundaries over time, reducing temporal discontinuities and\nimproving overall segmentation quality. The QCC employs the K-means method to\nensure the quality of spatio-temporal clusters without relying on ground truth\nmasks. 
Using the Davies-Bouldin score, the QCC provides an unsupervised measure\nof feature discrimination, allowing the model to self-evaluate and adapt to\nvarying object distributions, enhancing robustness during the testing phase.\nThese enhancements are computationally efficient and straightforward, offering\nsignificant performance gains without additional annotated data. The proposed\nEigen-Cluster VIS method is evaluated on the YouTube-VIS 2019/2021 and OVIS\ndatasets, demonstrating that it effectively narrows the performance gap between\nthe fully-supervised and weakly-supervised VIS approaches. The code is\navailable on: https://github.com/farnooshar/EigenClusterVIS\n","authors":["Farnoosh Arefi","Amir M. Mansourian","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2408.16661v1.pdf","comment":"12 pages, 6 Figures, 5 tabels"},{"id":"http://arxiv.org/abs/2407.04559v2","updated":"2024-08-29T15:58:09Z","published":"2024-07-05T14:48:15Z","title":"Not (yet) the whole story: Evaluating Visual Storytelling Requires More\n than Measuring Coherence, Grounding, and Repetition","summary":" Visual storytelling consists in generating a natural language story given a\ntemporally ordered sequence of images. This task is not only challenging for\nmodels, but also very difficult to evaluate with automatic metrics since there\nis no consensus about what makes a story 'good'. In this paper, we introduce a\nnovel method that measures story quality in terms of human likeness regarding\nthree key aspects highlighted in previous work: visual grounding, coherence,\nand repetitiveness. We then use this method to evaluate the stories generated\nby several models, showing that the foundation model LLaVA obtains the best\nresult, but only slightly so compared to TAPM, a 50-times smaller visual\nstorytelling model. Upgrading the visual and language components of TAPM\nresults in a model that yields competitive performance with a relatively low\nnumber of parameters. Finally, we carry out a human evaluation study, whose\nresults suggest that a 'good' story may require more than a human-like level of\nvisual grounding, coherence, and repetition.\n","authors":["Aditya K Surikuchi","Raquel Fernández","Sandro Pezzelle"],"pdf_url":"https://arxiv.org/pdf/2407.04559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16647v1","updated":"2024-08-29T15:52:56Z","published":"2024-08-29T15:52:56Z","title":"DriveGenVLM: Real-world Video Generation for Vision Language Model based\n Autonomous Driving","summary":" The advancement of autonomous driving technologies necessitates increasingly\nsophisticated methods for understanding and predicting real-world scenarios.\nVision language models (VLMs) are emerging as revolutionary tools with\nsignificant potential to influence autonomous driving. In this paper, we\npropose the DriveGenVLM framework to generate driving videos and use VLMs to\nunderstand them. To achieve this, we employ a video generation framework\ngrounded in denoising diffusion probabilistic models (DDPM) aimed at predicting\nreal-world video sequences. We then explore the adequacy of our generated\nvideos for use in VLMs by employing a pre-trained model known as Efficient\nIn-context Learning on Egocentric Videos (EILEV). 
The diffusion model is\ntrained with the Waymo open dataset and evaluated using the Fr\\'echet Video\nDistance (FVD) score to ensure the quality and realism of the generated videos.\nCorresponding narrations are provided by EILEV for these generated videos,\nwhich may be beneficial in the autonomous driving domain. These narrations can\nenhance traffic scene understanding, aid in navigation, and improve planning\ncapabilities. The integration of video generation with VLMs in the DriveGenVLM\nframework represents a significant step forward in leveraging advanced AI\nmodels to address complex challenges in autonomous driving.\n","authors":["Yongjie Fu","Anmol Jain","Xuan Di","Xu Chen","Zhaobin Mo"],"pdf_url":"https://arxiv.org/pdf/2408.16647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16645v1","updated":"2024-08-29T15:51:06Z","published":"2024-08-29T15:51:06Z","title":"SODAWideNet++: Combining Attention and Convolutions for Salient Object\n Detection","summary":" Salient Object Detection (SOD) has traditionally relied on feature refinement\nmodules that utilize the features of an ImageNet pre-trained backbone. However,\nthis approach limits the possibility of pre-training the entire network because\nof the distinct nature of SOD and image classification. Additionally, the\narchitecture of these backbones originally built for Image classification is\nsub-optimal for a dense prediction task like SOD. To address these issues, we\npropose a novel encoder-decoder-style neural network called SODAWideNet++ that\nis designed explicitly for SOD. Inspired by the vision transformers ability to\nattain a global receptive field from the initial stages, we introduce the\nAttention Guided Long Range Feature Extraction (AGLRFE) module, which combines\nlarge dilated convolutions and self-attention. Specifically, we use attention\nfeatures to guide long-range information extracted by multiple dilated\nconvolutions, thus taking advantage of the inductive biases of a convolution\noperation and the input dependency brought by self-attention. In contrast to\nthe current paradigm of ImageNet pre-training, we modify 118K annotated images\nfrom the COCO semantic segmentation dataset by binarizing the annotations to\npre-train the proposed model end-to-end. Further, we supervise the background\npredictions along with the foreground to push our model to generate accurate\nsaliency predictions. SODAWideNet++ performs competitively on five different\ndatasets while only containing 35% of the trainable parameters compared to the\nstate-of-the-art models. The code and pre-computed saliency maps are provided\nat https://github.com/VimsLab/SODAWideNetPlusPlus.\n","authors":["Rohit Venkata Sai Dulam","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2408.16645v1.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2408.16638v1","updated":"2024-08-29T15:42:06Z","published":"2024-08-29T15:42:06Z","title":"3D Pose-Based Temporal Action Segmentation for Figure Skating: A\n Fine-Grained and Jump Procedure-Aware Annotation Approach","summary":" Understanding human actions from videos is essential in many domains,\nincluding sports. In figure skating, technical judgments are performed by\nwatching skaters' 3D movements, and its part of the judging procedure can be\nregarded as a Temporal Action Segmentation (TAS) task. TAS tasks in figure\nskating that automatically assign temporal semantics to video are actively\nresearched. 
However, there is a lack of datasets and effective methods for TAS\ntasks requiring 3D pose data. In this study, we first created the FS-Jump3D\ndataset of complex and dynamic figure skating jumps using optical markerless\nmotion capture. We also propose a new fine-grained figure skating jump TAS\ndataset annotation method with which TAS models can learn jump procedures. In\nthe experimental results, we validated the usefulness of 3D pose features as\ninput and the fine-grained dataset for the TAS model in figure skating.\nFS-Jump3D Dataset is available at https://github.com/ryota-skating/FS-Jump3D.\n","authors":["Ryota Tanaka","Tomohiro Suzuki","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2408.16638v1.pdf","comment":"10 pages, 7th ACM International Workshop on Multimedia Content\n Analysis in Sports"},{"id":"http://arxiv.org/abs/2405.20743v2","updated":"2024-08-29T15:31:58Z","published":"2024-05-31T10:13:17Z","title":"Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent\n Codes","summary":" Trajectory forecasting is crucial for video surveillance analytics, as it\nenables the anticipation of future movements for a set of agents, e.g.\nbasketball players engaged in intricate interactions with long-term intentions.\nDeep generative models offer a natural learning approach for trajectory\nforecasting, yet they encounter difficulties in achieving an optimal balance\nbetween sampling fidelity and diversity. We address this challenge by\nleveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a\ndiscrete latent space to tackle the issue of posterior collapse. Specifically,\nwe introduce an instance-based codebook that allows tailored latent\nrepresentations for each example. In a nutshell, the rows of the codebook are\ndynamically adjusted to reflect contextual information (i.e., past motion\npatterns extracted from the observed trajectories). In this way, the\ndiscretization process gains flexibility, leading to improved reconstructions.\nNotably, instance-level dynamics are injected into the codebook through\nlow-rank updates, which restrict the customization of the codebook to a lower\ndimension space. The resulting discrete space serves as the basis of the\nsubsequent step, which regards the training of a diffusion-based predictive\nmodel. We show that such a two-fold framework, augmented with instance-level\ndiscretization, leads to accurate and diverse forecasts, yielding\nstate-of-the-art performance on three established benchmarks.\n","authors":["Riccardo Benaglia","Angelo Porrello","Pietro Buzzega","Simone Calderara","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2405.20743v2.pdf","comment":"15 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.16623v1","updated":"2024-08-29T15:31:51Z","published":"2024-08-29T15:31:51Z","title":"Turbulence Strength $C_n^2$ Estimation from Video using Physics-based\n Deep Learning","summary":" Images captured from a long distance suffer from dynamic image distortion due\nto turbulent flow of air cells with random temperatures, and thus refractive\nindices. This phenomenon, known as image dancing, is commonly characterized by\nits refractive-index structure constant $C_n^2$ as a measure of the turbulence\nstrength. For many applications such as atmospheric forecast model,\nlong-range/astronomy imaging, and aviation safety, optical communication\ntechnology, $C_n^2$ estimation is critical for accurately sensing the turbulent\nenvironment. 
Previous methods for $C_n^2$ estimation include estimation from\nmeteorological data (temperature, relative humidity, wind shear, etc.) for\nsingle-point measurements, two-ended pathlength measurements from optical\nscintillometer for path-averaged $C_n^2$, and more recently estimating $C_n^2$\nfrom passive video cameras for low cost and hardware complexity. In this paper,\nwe present a comparative analysis of classical image gradient methods for\n$C_n^2$ estimation and modern deep learning-based methods leveraging\nconvolutional neural networks. To enable this, we collect a dataset of video\ncapture along with reference scintillometer measurements for ground truth, and\nwe release this unique dataset to the scientific community. We observe that\ndeep learning methods can achieve higher accuracy when trained on similar data,\nbut suffer from generalization errors to other, unseen imagery as compared to\nclassical methods. To overcome this trade-off, we present a novel physics-based\nnetwork architecture that combines learned convolutional layers with a\ndifferentiable image gradient method that maintains high accuracy while being\ngeneralizable across image datasets.\n","authors":["Ripon Kumar Saha","Esen Salcin","Jihoo Kim","Joseph Smith","Suren Jayasuriya"],"pdf_url":"https://arxiv.org/pdf/2408.16623v1.pdf","comment":"Code Available: https://github.com/Riponcs/Cn2Estimation"},{"id":"http://arxiv.org/abs/2408.16622v1","updated":"2024-08-29T15:31:43Z","published":"2024-08-29T15:31:43Z","title":"Sparse Signal Reconstruction for Overdispersed Low-photon Count\n Biomedical Imaging Using $\\ell_p$ Total Variation","summary":" The negative binomial model, which generalizes the Poisson distribution\nmodel, can be found in applications involving low-photon signal recovery,\nincluding medical imaging. Recent studies have explored several regularization\nterms for the negative binomial model, such as the $\\ell_p$ quasi-norm with $0\n< p < 1$, $\\ell_1$ norm, and the total variation (TV) quasi-seminorm for\npromoting sparsity in signal recovery. These penalty terms have been shown to\nimprove image reconstruction outcomes. In this paper, we investigate the\n$\\ell_p$ quasi-seminorm, both isotropic and anisotropic $\\ell_p$ TV\nquasi-seminorms, within the framework of the negative binomial statistical\nmodel. This problem can be formulated as an optimization problem, which we\nsolve using a gradient-based approach. We present comparisons between the\nnegative binomial and Poisson statistical models using the $\\ell_p$ TV\nquasi-seminorm as well as common penalty terms. Our experimental results\nhighlight the efficacy of the proposed method.\n","authors":["Yu Lu","Roummel F. Marcia"],"pdf_url":"https://arxiv.org/pdf/2408.16622v1.pdf","comment":"5 pages, Accepted by the IEEE International Symposium on Biomedical\n Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2408.13140v2","updated":"2024-08-29T15:31:35Z","published":"2024-08-23T15:02:09Z","title":"Verification of Geometric Robustness of Neural Networks via Piecewise\n Linear Approximation and Lipschitz Optimisation","summary":" We address the problem of verifying neural networks against geometric\ntransformations of the input image, including rotation, scaling, shearing, and\ntranslation. The proposed method computes provably sound piecewise linear\nconstraints for the pixel values by using sampling and linear approximations in\ncombination with branch-and-bound Lipschitz optimisation. 
The method obtains\nprovably tighter over-approximations of the perturbation region than the\npresent state-of-the-art. We report results from experiments on a comprehensive\nset of verification benchmarks on MNIST and CIFAR10. We show that our proposed\nimplementation resolves up to 32% more verification cases than present\napproaches.\n","authors":["Ben Batten","Yang Zheng","Alessandro De Palma","Panagiotis Kouvaros","Alessio Lomuscio"],"pdf_url":"https://arxiv.org/pdf/2408.13140v2.pdf","comment":"ECAI 2024"},{"id":"http://arxiv.org/abs/2408.16621v1","updated":"2024-08-29T15:28:42Z","published":"2024-08-29T15:28:42Z","title":"Towards Infusing Auxiliary Knowledge for Distracted Driver Detection","summary":" Distracted driving is a leading cause of road accidents globally.\nIdentification of distracted driving involves reliably detecting and\nclassifying various forms of driver distraction (e.g., texting, eating, or\nusing in-car devices) from in-vehicle camera feeds to enhance road safety. This\ntask is challenging due to the need for robust models that can generalize to a\ndiverse set of driver behaviors without requiring extensive annotated datasets.\nIn this paper, we propose KiD3, a novel method for distracted driver detection\n(DDD) by infusing auxiliary knowledge about semantic relations between entities\nin a scene and the structural configuration of the driver's pose. Specifically,\nwe construct a unified framework that integrates the scene graphs, and driver\npose information with the visual cues in video frames to create a holistic\nrepresentation of the driver's actions.Our results indicate that KiD3 achieves\na 13.64% accuracy improvement over the vision-only baseline by incorporating\nsuch auxiliary knowledge with visual information.\n","authors":["Ishwar B Balappanawar","Ashmit Chamoli","Ruwan Wickramarachchi","Aditya Mishra","Ponnurangam Kumaraguru","Amit P. Sheth"],"pdf_url":"https://arxiv.org/pdf/2408.16621v1.pdf","comment":"Accepted at KiL 2024: Workshop on Knowledge-infused Learning\n co-located with 30th ACM KDD Conference"},{"id":"http://arxiv.org/abs/2408.14698v2","updated":"2024-08-29T15:14:48Z","published":"2024-08-26T23:52:27Z","title":"Smart Multi-Modal Search: Contextual Sparse and Dense Embedding\n Integration in Adobe Express","summary":" As user content and queries become increasingly multi-modal, the need for\neffective multi-modal search systems has grown. Traditional search systems\noften rely on textual and metadata annotations for indexed images, while\nmulti-modal embeddings like CLIP enable direct search using text and image\nembeddings. However, embedding-based approaches face challenges in integrating\ncontextual features such as user locale and recency. Building a scalable\nmulti-modal search system requires fine-tuning several components. This paper\npresents a multi-modal search architecture and a series of AB tests that\noptimize embeddings and multi-modal technologies in Adobe Express template\nsearch. We address considerations such as embedding model selection, the roles\nof embeddings in matching and ranking, and the balance between dense and sparse\nembeddings. Our iterative approach demonstrates how utilizing sparse, dense,\nand contextual features enhances short and long query search, significantly\nreduces null rates (over 70\\%), and increases click-through rates (CTR). 
Our\nfindings provide insights into developing robust multi-modal search systems,\nthereby enhancing relevance for complex queries.\n","authors":["Cherag Aroraa","Tracy Holloway King","Jayant Kumar","Yi Lu","Sanat Sharma","Arvind Srikantan","David Uvalle","Josep Valls-Vargas","Harsha Vardhan"],"pdf_url":"https://arxiv.org/pdf/2408.14698v2.pdf","comment":"CIKM 2024 (International Conference on Information and Knowledge\n Management), Multimodal Search and Recommendations Workshop"},{"id":"http://arxiv.org/abs/2401.12972v3","updated":"2024-08-29T15:11:29Z","published":"2024-01-23T18:58:35Z","title":"On the Efficacy of Text-Based Input Modalities for Action Anticipation","summary":" Anticipating future actions is a highly challenging task due to the diversity\nand scale of potential future actions; yet, information from different\nmodalities help narrow down plausible action choices. Each modality can provide\ndiverse and often complementary context for the model to learn from. While\nprevious multi-modal methods leverage information from modalities such as video\nand audio, we primarily explore how text descriptions of actions and objects\ncan also lead to more accurate action anticipation by providing additional\ncontextual cues, e.g., about the environment and its contents. We propose a\nMulti-modal Contrastive Anticipative Transformer (M-CAT), a video transformer\narchitecture that jointly learns from multi-modal features and text\ndescriptions of actions and objects. We train our model in two stages, where\nthe model first learns to align video clips with descriptions of future\nactions, and is subsequently fine-tuned to predict future actions. Compared to\nexisting methods, M-CAT has the advantage of learning additional context from\ntwo types of text inputs: rich descriptions of future actions during\npre-training, and, text descriptions for detected objects and actions during\nmodality feature fusion. Through extensive experimental evaluation, we\ndemonstrate that our model outperforms previous methods on the EpicKitchens\ndatasets, and show that using simple text descriptions of actions and objects\naid in more effective action anticipation. In addition, we examine the impact\nof object and action information obtained via text, and perform extensive\nablations.\n","authors":["Apoorva Beedu","Harish Haresamudram","Karan Samel","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2401.12972v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16582v1","updated":"2024-08-29T14:48:00Z","published":"2024-08-29T14:48:00Z","title":"FastForensics: Efficient Two-Stream Design for Real-Time Image\n Manipulation Detection","summary":" With the rise in popularity of portable devices, the spread of falsified\nmedia on social platforms has become rampant. This necessitates the timely\nidentification of authentic content. However, most advanced detection methods\nare computationally heavy, hindering their real-time application. In this\npaper, we describe an efficient two-stream architecture for real-time image\nmanipulation detection. Our method consists of two-stream branches targeting\nthe cognitive and inspective perspectives. In the cognitive branch, we propose\nefficient wavelet-guided Transformer blocks to capture the global manipulation\ntraces related to frequency. 
This block contains an interactive wavelet-guided\nself-attention module that integrates wavelet transformation with efficient\nattention design, interacting with the knowledge from the inspective branch.\nThe inspective branch consists of simple convolutions that capture fine-grained\ntraces and interact bidirectionally with Transformer blocks to provide mutual\nsupport. Our method is lightweight ($\\sim$ 8M) but achieves competitive\nperformance compared to many other counterparts, demonstrating its efficacy in\nimage manipulation detection and its potential for portable integration.\n","authors":["Yangxiang Zhang","Yuezun Li","Ao Luo","Jiaran Zhou","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2408.16582v1.pdf","comment":"BMVC 2024"},{"id":"http://arxiv.org/abs/2404.11054v3","updated":"2024-08-29T14:43:25Z","published":"2024-04-17T03:56:28Z","title":"Mumpy: Multilateral Temporal-view Pyramid Transformer for Video\n Inpainting Detection","summary":" The task of video inpainting detection is to expose the pixel-level inpainted\nregions within a video sequence. Existing methods usually focus on leveraging\nspatial and temporal inconsistencies. However, these methods typically employ\nfixed operations to combine spatial and temporal clues, limiting their\napplicability in different scenarios. In this paper, we introduce a novel\nMultilateral Temporal-view Pyramid Transformer ({\\em MumPy}) that collaborates\nspatial-temporal clues flexibly. Our method utilizes a newly designed\nmultilateral temporal-view encoder to extract various collaborations of\nspatial-temporal clues and introduces a deformable window-based temporal-view\ninteraction module to enhance the diversity of these collaborations.\nSubsequently, we develop a multi-pyramid decoder to aggregate the various types\nof features and generate detection maps. By adjusting the contribution strength\nof spatial and temporal clues, our method can effectively identify inpainted\nregions. We validate our method on existing datasets and also introduce a new\nchallenging and large-scale Video Inpainting dataset based on the YouTube-VOS\ndataset, which employs several more recent inpainting methods. The results\ndemonstrate the superiority of our method in both in-domain and cross-domain\nevaluation scenarios.\n","authors":["Ying Zhang","Yuezun Li","Bo Peng","Jiaran Zhou","Huiyu Zhou","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.11054v3.pdf","comment":"BMVC 2024"},{"id":"http://arxiv.org/abs/2408.14810v2","updated":"2024-08-29T14:38:22Z","published":"2024-08-27T06:49:21Z","title":"Generalist Segmentation Algorithm for Photoreceptors Analysis in\n Adaptive Optics Imaging","summary":" Analyzing the cone photoreceptor pattern in images obtained from the living\nhuman retina using quantitative methods can be crucial for the early detection\nand management of various eye conditions. Confocal adaptive optics scanning\nlight ophthalmoscope (AOSLO) imaging enables visualization of the cones from\nreflections of waveguiding cone photoreceptors. While there have been\nsignificant improvements in automated algorithms for segmenting cones in\nconfocal AOSLO images, the process of labelling data remains labor-intensive\nand manual. This paper introduces a method based on deep learning (DL) for\ndetecting and segmenting cones in AOSLO images. The models were trained on a\nsemi-automatically labelled dataset of 20 AOSLO batches of images of 18\nparticipants for 0$^{\\circ}$, 1$^{\\circ}$, and 2$^{\\circ}$ from the foveal\ncenter. 
F1 scores were 0.968, 0.958, and 0.954 for 0$^{\circ}$, 1$^{\circ}$,\nand 2$^{\circ}$, respectively, which is better than previously reported DL\napproaches. Our method minimizes the need for labelled data by only\nnecessitating a fraction of labelled cones, which is especially beneficial in\nthe field of ophthalmology, where labelled data can often be limited.\n","authors":["Mikhail Kulyabin","Aline Sindel","Hilde Pedersen","Stuart Gilson","Rigmor Baraas","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2408.14810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16563v1","updated":"2024-08-29T14:30:45Z","published":"2024-08-29T14:30:45Z","title":"MST-KD: Multiple Specialized Teachers Knowledge Distillation for Fair\n Face Recognition","summary":" As in school, one teacher to cover all subjects is insufficient to distill\nequally robust information to a student. Hence, each subject is taught by a\nhighly specialised teacher. Following a similar philosophy, we propose a\nmultiple specialized teacher framework to distill knowledge to a student\nnetwork. In our approach, directed at face recognition use cases, we train four\nteachers on one specific ethnicity, leading to four highly specialized and\nbiased teachers. Our strategy learns a projection of these four teachers into a\ncommon space and distills that information to a student network. Our results\nhighlighted increased performance and reduced bias for all our experiments. In\naddition, we further show that having biased/specialized teachers is crucial by\nshowing that our approach achieves better results than when knowledge is\ndistilled from four teachers trained on balanced datasets. Our approach\nrepresents a step forward to the understanding of the importance of\nethnicity-specific features.\n","authors":["Eduarda Caldeira","Jaime S. Cardoso","Ana F. Sequeira","Pedro C. Neto"],"pdf_url":"https://arxiv.org/pdf/2408.16563v1.pdf","comment":"Accepted at ECCV 2024 ABAW"},{"id":"http://arxiv.org/abs/2408.16547v1","updated":"2024-08-29T14:10:14Z","published":"2024-08-29T14:10:14Z","title":"OP-Align: Object-level and Part-level Alignment for Self-supervised\n Category-level Articulated Object Pose Estimation","summary":" Category-level articulated object pose estimation focuses on the pose\nestimation of unknown articulated objects within known categories. Despite its\nsignificance, this task remains challenging due to the varying shapes and poses\nof objects, expensive dataset annotation costs, and complex real-world\nenvironments. In this paper, we propose a novel self-supervised approach that\nleverages a single-frame point cloud to solve this task. Our model consistently\ngenerates reconstruction with a canonical pose and joint state for the entire\ninput object, and it estimates object-level poses that reduce overall pose\nvariance and part-level poses that align each part of the input with its\ncorresponding part of the reconstruction. Experimental results demonstrate that\nour approach significantly outperforms previous self-supervised methods and is\ncomparable to the state-of-the-art supervised methods. 
To assess the\nperformance of our model in real-world scenarios, we also introduce a new\nreal-world articulated object benchmark dataset.\n","authors":["Yuchen Che","Ryo Furukawa","Asako Kanezaki"],"pdf_url":"https://arxiv.org/pdf/2408.16547v1.pdf","comment":"to be published in ECCV2024"},{"id":"http://arxiv.org/abs/2408.16544v1","updated":"2024-08-29T14:02:47Z","published":"2024-08-29T14:02:47Z","title":"Spurfies: Sparse Surface Reconstruction using Local Geometry Priors","summary":" We introduce Spurfies, a novel method for sparse-view surface reconstruction\nthat disentangles appearance and geometry information to utilize local geometry\npriors trained on synthetic data. Recent research heavily focuses on 3D\nreconstruction using dense multi-view setups, typically requiring hundreds of\nimages. However, these methods often struggle with few-view scenarios. Existing\nsparse-view reconstruction techniques often rely on multi-view stereo networks\nthat need to learn joint priors for geometry and appearance from a large amount\nof data. In contrast, we introduce a neural point representation that\ndisentangles geometry and appearance to train a local geometry prior using a\nsubset of the synthetic ShapeNet dataset only. During inference, we utilize\nthis surface prior as additional constraint for surface and appearance\nreconstruction from sparse input views via differentiable volume rendering,\nrestricting the space of possible solutions. We validate the effectiveness of\nour method on the DTU dataset and demonstrate that it outperforms previous\nstate of the art by 35% in surface quality while achieving competitive novel\nview synthesis quality. Moreover, in contrast to previous works, our method can\nbe applied to larger, unbounded scenes, such as Mip-NeRF 360.\n","authors":["Kevin Raj","Christopher Wewer","Raza Yunus","Eddy Ilg","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2408.16544v1.pdf","comment":"https://geometric-rl.mpi-inf.mpg.de/spurfies/"},{"id":"http://arxiv.org/abs/2408.16540v1","updated":"2024-08-29T13:58:34Z","published":"2024-08-29T13:58:34Z","title":"GRPose: Learning Graph Relations for Human Image Generation with Pose\n Priors","summary":" Recent methods using diffusion models have made significant progress in human\nimage generation with various additional controls such as pose priors. However,\nexisting approaches still struggle to generate high-quality images with\nconsistent pose alignment, resulting in unsatisfactory outputs. In this paper,\nwe propose a framework delving into the graph relations of pose priors to\nprovide control information for human image generation. The main idea is to\nestablish a graph topological structure between the pose priors and latent\nrepresentation of diffusion models to capture the intrinsic associations\nbetween different pose parts. A Progressive Graph Integrator (PGI) is designed\nto learn the spatial relationships of the pose priors with the graph structure,\nadopting a hierarchical strategy within an Adapter to gradually propagate\ninformation across different pose parts. A pose perception loss is further\nintroduced based on a pretrained pose estimation network to minimize the pose\ndifferences. Extensive qualitative and quantitative experiments conducted on\nthe Human-Art and LAION-Human datasets demonstrate that our model achieves\nsuperior performance, with a 9.98% increase in pose average precision compared\nto the latest benchmark model. 
The code is released on *******.\n","authors":["Xiangchen Yin","Donglin Di","Lei Fan","Hao Li","Chen Wei","Xiaofei Gou","Yang Song","Xiao Sun","Xun Yang"],"pdf_url":"https://arxiv.org/pdf/2408.16540v1.pdf","comment":"The code will be released at https://github.com/XiangchenYin/GRPose"},{"id":"http://arxiv.org/abs/2405.04964v2","updated":"2024-08-29T13:44:20Z","published":"2024-05-08T11:09:24Z","title":"Frequency-Assisted Mamba for Remote Sensing Image Super-Resolution","summary":" Recent progress in remote sensing image (RSI) super-resolution (SR) has\nexhibited remarkable performance using deep neural networks, e.g.,\nConvolutional Neural Networks and Transformers. However, existing SR methods\noften suffer from either a limited receptive field or quadratic computational\noverhead, resulting in sub-optimal global representation and unacceptable\ncomputational costs in large-scale RSI. To alleviate these issues, we develop\nthe first attempt to integrate the Vision State Space Model (Mamba) for RSI-SR,\nwhich specializes in processing large-scale RSI by capturing long-range\ndependency with linear complexity. To achieve better SR reconstruction,\nbuilding upon Mamba, we devise a Frequency-assisted Mamba framework, dubbed\nFMSR, to explore the spatial and frequent correlations. In particular, our FMSR\nfeatures a multi-level fusion architecture equipped with the Frequency\nSelection Module (FSM), Vision State Space Module (VSSM), and Hybrid Gate\nModule (HGM) to grasp their merits for effective spatial-frequency fusion.\nConsidering that global and local dependencies are complementary and both\nbeneficial for SR, we further recalibrate these multi-level features for\naccurate feature fusion via learnable scaling adaptors. Extensive experiments\non AID, DOTA, and DIOR benchmarks demonstrate that our FMSR outperforms\nstate-of-the-art Transformer-based methods HAT-L in terms of PSNR by 0.11 dB on\naverage, while consuming only 28.05% and 19.08% of its memory consumption and\ncomplexity, respectively. Code will be available at\nhttps://github.com/XY-boy/FreMamba\n","authors":["Yi Xiao","Qiangqiang Yuan","Kui Jiang","Yuzeng Chen","Qiang Zhang","Chia-Wen Lin"],"pdf_url":"https://arxiv.org/pdf/2405.04964v2.pdf","comment":"Accepted by IEEE TMM"},{"id":"http://arxiv.org/abs/2408.16520v1","updated":"2024-08-29T13:31:15Z","published":"2024-08-29T13:31:15Z","title":"Towards Modality-agnostic Label-efficient Segmentation with\n Entropy-Regularized Distribution Alignment","summary":" Label-efficient segmentation aims to perform effective segmentation on input\ndata using only sparse and limited ground-truth labels for training. This topic\nis widely studied in 3D point cloud segmentation due to the difficulty of\nannotating point clouds densely, while it is also essential for cost-effective\nsegmentation on 2D images. Until recently, pseudo-labels have been widely\nemployed to facilitate training with limited ground-truth labels, and promising\nprogress has been witnessed in both the 2D and 3D segmentation. However,\nexisting pseudo-labeling approaches could suffer heavily from the noises and\nvariations in unlabelled data, which would result in significant discrepancies\nbetween generated pseudo-labels and current model predictions during training.\nWe analyze that this can further confuse and affect the model learning process,\nwhich shows to be a shared problem in label-efficient learning across both 2D\nand 3D modalities. 
To address this issue, we propose a novel learning strategy\nto regularize the pseudo-labels generated for training, thus effectively\nnarrowing the gaps between pseudo-labels and model predictions. More\nspecifically, our method introduces an Entropy Regularization loss and a\nDistribution Alignment loss for label-efficient learning, resulting in an ERDA\nlearning strategy. Interestingly, by using KL distance to formulate the\ndistribution alignment loss, ERDA reduces to a deceptively simple\ncross-entropy-based loss which optimizes both the pseudo-label generation\nmodule and the segmentation model simultaneously. In addition, we innovate in\nthe pseudo-label generation to make our ERDA consistently effective across both\n2D and 3D data modalities for segmentation. Enjoying simplicity and more\nmodality-agnostic pseudo-label generation, our method has shown outstanding\nperformance in fully utilizing all unlabeled data points for training across\n...\n","authors":["Liyao Tang","Zhe Chen","Shanshan Zhao","Chaoyue Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.16520v1.pdf","comment":"Extended version of arXiv:2305.15832; Code at\n https://github.com/LiyaoTang/ERDA"},{"id":"http://arxiv.org/abs/2408.02674v2","updated":"2024-08-29T13:29:36Z","published":"2024-07-22T06:13:22Z","title":"On Feasibility of Intent Obfuscating Attacks","summary":" Intent obfuscation is a common tactic in adversarial situations, enabling the\nattacker to both manipulate the target system and avoid culpability.\nSurprisingly, it has rarely been implemented in adversarial attacks on machine\nlearning systems. We are the first to propose using intent obfuscation to\ngenerate adversarial examples for object detectors: by perturbing another\nnon-overlapping object to disrupt the target object, the attacker hides their\nintended target. We conduct a randomized experiment on 5 prominent detectors --\nYOLOv3, SSD, RetinaNet, Faster R-CNN, and Cascade R-CNN -- using both targeted\nand untargeted attacks and achieve success on all models and attacks. We\nanalyze the success factors characterizing intent obfuscating attacks,\nincluding target object confidence and perturb object sizes. We then\ndemonstrate that the attacker can exploit these success factors to increase\nsuccess rates for all models and attacks. Finally, we discuss main takeaways\nand legal repercussions.\n","authors":["Zhaobin Li","Patrick Shafto"],"pdf_url":"https://arxiv.org/pdf/2408.02674v2.pdf","comment":"33 pages, 21 Figures. Includes technical appendix. To appear in AIES\n 2024"},{"id":"http://arxiv.org/abs/2406.19006v2","updated":"2024-08-29T13:23:45Z","published":"2024-06-27T08:45:31Z","title":"VideoMambaPro: A Leap Forward for Mamba in Video Understanding","summary":" Video understanding requires the extraction of rich spatio-temporal\nrepresentations, which transformer models achieve through self-attention.\nUnfortunately, self-attention poses a computational burden. In NLP, Mamba has\nsurfaced as an efficient alternative for transformers. However, Mamba's\nsuccesses do not trivially extend to computer vision tasks, including those in\nvideo analysis. In this paper, we theoretically analyze the differences between\nself-attention and Mamba. We identify two limitations in Mamba's token\nprocessing: historical decay and element contradiction. We propose\nVideoMambaPro (VMP) that solves the identified limitations by adding masked\nbackward computation and elemental residual connections to a VideoMamba\nbackbone. 
VideoMambaPro shows state-of-the-art video action recognition\nperformance compared to transformer models, and surpasses VideoMamba by clear\nmargins: 7.9% and 8.1% top-1 on Kinetics-400 and Something-Something V2,\nrespectively. Our VideoMambaPro-M model achieves 91.9% top-1 on Kinetics-400,\nonly 0.2% below InternVideo2-6B but with only 1.2% of its parameters. The\ncombination of high performance and efficiency makes VideoMambaPro an\ninteresting alternative for transformer models.\n","authors":["Hui Lu","Albert Ali Salah","Ronald Poppe"],"pdf_url":"https://arxiv.org/pdf/2406.19006v2.pdf","comment":"Model weights are lost due to management error, will re-calculate and\n update the results"},{"id":"http://arxiv.org/abs/2212.12130v6","updated":"2024-08-29T13:08:14Z","published":"2022-12-23T03:54:59Z","title":"Learning to Detect and Segment for Open Vocabulary Object Detection","summary":" Open vocabulary object detection has been greatly advanced by the recent\ndevelopment of vision-language pretrained model, which helps recognize novel\nobjects with only semantic categories. The prior works mainly focus on\nknowledge transferring to the object proposal classification and employ\nclass-agnostic box and mask prediction. In this work, we propose CondHead, a\nprincipled dynamic network design to better generalize the box regression and\nmask segmentation for open vocabulary setting. The core idea is to\nconditionally parameterize the network heads on semantic embedding and thus the\nmodel is guided with class-specific knowledge to better detect novel\ncategories. Specifically, CondHead is composed of two streams of network heads,\nthe dynamically aggregated head and the dynamically generated head. The former\nis instantiated with a set of static heads that are conditionally aggregated,\nthese heads are optimized as experts and are expected to learn sophisticated\nprediction. The latter is instantiated with dynamically generated parameters\nand encodes general class-specific information. With such a conditional design,\nthe detection model is bridged by the semantic embedding to offer strongly\ngeneralizable class-wise box and mask prediction. Our method brings significant\nimprovement to the state-of-the-art open vocabulary object detection methods\nwith very minor overhead, e.g., it surpasses a RegionClip model by 3.0\ndetection AP on novel categories, with only 1.1% more computation.\n","authors":["Tao Wang","Nan Li"],"pdf_url":"https://arxiv.org/pdf/2212.12130v6.pdf","comment":"Accepted to CVPR2023, code will be available later"},{"id":"http://arxiv.org/abs/2408.16506v1","updated":"2024-08-29T13:08:12Z","published":"2024-08-29T13:08:12Z","title":"Alignment is All You Need: A Training-free Augmentation Strategy for\n Pose-guided Video Generation","summary":" Character animation is a transformative field in computer graphics and\nvision, enabling dynamic and realistic video animations from static images.\nDespite advancements, maintaining appearance consistency in animations remains\na challenge. Our approach addresses this by introducing a training-free\nframework that ensures the generated video sequence preserves the reference\nimage's subtleties, such as physique and proportions, through a dual alignment\nstrategy. We decouple skeletal and motion priors from pose information,\nenabling precise control over animation generation. 
Our method also improves\npixel-level alignment for conditional control from the reference character,\nenhancing the temporal consistency and visual cohesion of animations. Our\nmethod significantly enhances the quality of video generation without the need\nfor large datasets or expensive computational resources.\n","authors":["Xiaoyu Jin","Zunnan Xu","Mingwen Ou","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2408.16506v1.pdf","comment":"CVG@ICML 2024"},{"id":"http://arxiv.org/abs/2408.16504v1","updated":"2024-08-29T13:02:12Z","published":"2024-08-29T13:02:12Z","title":"A Simple and Generalist Approach for Panoptic Segmentation","summary":" Generalist vision models aim for one and the same architecture for a variety\nof vision tasks. While such shared architecture may seem attractive, generalist\nmodels tend to be outperformed by their bespoken counterparts, especially in\nthe case of panoptic segmentation. We address this problem by introducing two\nkey contributions, without compromising the desirable properties of generalist\nmodels. These contributions are: (i) a positional-embedding (PE) based loss for\nimproved centroid regressions; (ii) Edge Distance Sampling (EDS) for the better\nseparation of instance boundaries. The PE-based loss facilitates a better\nper-pixel regression of the associated instance's centroid, whereas EDS\ncontributes by carefully handling the void regions (caused by missing labels)\nand smaller instances. These two simple yet effective modifications\nsignificantly improve established baselines, while achieving state-of-the-art\nresults among all generalist solutions. More specifically, our method achieves\na panoptic quality(PQ) of 52.5 on the COCO dataset, which is an improvement of\n10 points over the best model with similar approach (Painter), and is superior\nby 2 to the best performing diffusion-based method Pix2Seq-$\\mathcal{D}$.\nFurthermore, we provide insights into and an in-depth analysis of our\ncontributions through exhaustive experiments. Our source code and model weights\nwill be made publicly available.\n","authors":["Nedyalko Prisadnikov","Wouter Van Gansbeke","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2408.16504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16503v1","updated":"2024-08-29T13:02:01Z","published":"2024-08-29T13:02:01Z","title":"Locally Grouped and Scale-Guided Attention for Dense Pest Counting","summary":" This study introduces a new dense pest counting problem to predict densely\ndistributed pests captured by digital traps. Unlike traditional detection-based\ncounting models for sparsely distributed objects, trap-based pest counting must\ndeal with dense pest distributions that pose challenges such as severe\nocclusion, wide pose variation, and similar appearances in colors and textures.\nTo address these problems, it is essential to incorporate the local attention\nmechanism, which identifies locally important and unimportant areas to learn\nlocally grouped features, thereby enhancing discriminative performance.\nAccordingly, this study presents a novel design that integrates locally grouped\nand scale-guided attention into a multiscale CenterNet framework. To group\nlocal features with similar attributes, a straightforward method is introduced\nusing the heatmap predicted by the first hourglass containing pest centroid\ninformation, which eliminates the need for complex clustering models. To\nenhance attentiveness, the pixel attention module transforms the heatmap into a\nlearnable map. 
Subsequently, scale-guided attention is deployed to make the\nobject and background features more discriminative, achieving multiscale\nfeature fusion. Through experiments, the proposed model is verified to enhance\nobject features based on local grouping and discriminative feature attention\nlearning. Additionally, the proposed model is highly effective in overcoming\nocclusion and pose variation problems, making it more suitable for dense pest\ncounting. In particular, the proposed model outperforms state-of-the-art models\nby a large margin, with a remarkable contribution to dense pest counting.\n","authors":["Chang-Hwan Son"],"pdf_url":"https://arxiv.org/pdf/2408.16503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16501v1","updated":"2024-08-29T13:00:37Z","published":"2024-08-29T13:00:37Z","title":"UAV-Based Human Body Detector Selection and Fusion for Geolocated\n Saliency Map Generation","summary":" The problem of reliably detecting and geolocating objects of different\nclasses in soft real-time is essential in many application areas, such as\nSearch and Rescue performed using Unmanned Aerial Vehicles (UAVs). This\nresearch addresses the complementary problems of system contextual vision-based\ndetector selection, allocation, and execution, in addition to the fusion of\ndetection results from teams of UAVs for the purpose of accurately and reliably\ngeolocating objects of interest in a timely manner. In an offline step, an\napplication-independent evaluation of vision-based detectors from a system\nperspective is first performed. Based on this evaluation, the most appropriate\nalgorithms for online object detection for each platform are selected\nautomatically before a mission, taking into account a number of practical\nsystem considerations, such as the available communication links, video\ncompression used, and the available computational resources. The detection\nresults are fused using a method for building maps of salient locations which\ntakes advantage of a novel sensor model for vision-based detections for both\npositive and negative observations. A number of simulated and real flight\nexperiments are also presented, validating the proposed method.\n","authors":["Piotr Rudol","Patrick Doherty","Mariusz Wzorek","Chattrakul Sombattheera"],"pdf_url":"https://arxiv.org/pdf/2408.16501v1.pdf","comment":"42 pages, 19 figures"},{"id":"http://arxiv.org/abs/2408.16500v1","updated":"2024-08-29T12:59:12Z","published":"2024-08-29T12:59:12Z","title":"CogVLM2: Visual Language Models for Image and Video Understanding","summary":" Beginning with VisualGLM and CogVLM, we are continuously exploring VLMs in\npursuit of enhanced vision-language fusion, efficient higher-resolution\narchitecture, and broader modalities and applications. Here we propose the\nCogVLM2 family, a new generation of visual language models for image and video\nunderstanding including CogVLM2, CogVLM2-Video and GLM-4V. As an image\nunderstanding model, CogVLM2 inherits the visual expert architecture with\nimproved training recipes in both pre-training and post-training stages,\nsupporting input resolution up to $1344 \\times 1344$ pixels. As a video\nunderstanding model, CogVLM2-Video integrates multi-frame input with timestamps\nand proposes automated temporal grounding data construction. Notably, CogVLM2\nfamily has achieved state-of-the-art results on benchmarks like MMBench,\nMM-Vet, TextVQA, MVBench and VCGBench. 
All models are open-sourced in\nhttps://github.com/THUDM/CogVLM2 and https://github.com/THUDM/GLM-4,\ncontributing to the advancement of the field.\n","authors":["Wenyi Hong","Weihan Wang","Ming Ding","Wenmeng Yu","Qingsong Lv","Yan Wang","Yean Cheng","Shiyu Huang","Junhui Ji","Zhao Xue","Lei Zhao","Zhuoyi Yang","Xiaotao Gu","Xiaohan Zhang","Guanyu Feng","Da Yin","Zihan Wang","Ji Qi","Xixuan Song","Peng Zhang","Debing Liu","Bin Xu","Juanzi Li","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2408.16500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16486v1","updated":"2024-08-29T12:34:01Z","published":"2024-08-29T12:34:01Z","title":"Adapting Vision-Language Models to Open Classes via Test-Time Prompt\n Tuning","summary":" Adapting pre-trained models to open classes is a challenging problem in\nmachine learning. Vision-language models fully explore the knowledge of text\nmodality, demonstrating strong zero-shot recognition performance, which is\nnaturally suited for various open-set problems. More recently, some research\nfocuses on fine-tuning such models to downstream tasks. Prompt tuning methods\nachieved huge improvements by learning context vectors on few-shot data.\nHowever, through the evaluation under open-set adaptation setting with the test\ndata including new classes, we find that there exists a dilemma that learned\nprompts have worse generalization abilities than hand-crafted prompts. In this\npaper, we consider combining the advantages of both and come up with a\ntest-time prompt tuning approach, which leverages the maximum concept matching\n(MCM) scores as dynamic weights to generate an input-conditioned prompt for\neach image during test. Through extensive experiments on 11 different datasets,\nwe show that our proposed method outperforms all comparison methods on average\nconsidering both base and new classes. The code is available at\nhttps://github.com/gaozhengqing/TTPT\n","authors":["Zhengqing Gao","Xiang Ao","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16486v1.pdf","comment":"PRCV 2024"},{"id":"http://arxiv.org/abs/2403.06702v3","updated":"2024-08-29T12:27:12Z","published":"2024-03-11T13:17:55Z","title":"Fast Text-to-3D-Aware Face Generation and Manipulation via Direct\n Cross-modal Mapping and Geometric Regularization","summary":" Text-to-3D-aware face (T3D Face) generation and manipulation is an emerging\nresearch hot spot in machine learning, which still suffers from low efficiency\nand poor quality. In this paper, we propose an End-to-End Efficient and\nEffective network for fast and accurate T3D face generation and manipulation,\ntermed $E^3$-FaceNet. Different from existing complex generation paradigms,\n$E^3$-FaceNet resorts to a direct mapping from text instructions to 3D-aware\nvisual space. We introduce a novel Style Code Enhancer to enhance cross-modal\nsemantic alignment, alongside an innovative Geometric Regularization objective\nto maintain consistency across multi-view generations. Extensive experiments on\nthree benchmark datasets demonstrate that $E^3$-FaceNet can not only achieve\npicture-like 3D face generation and manipulation, but also improve inference\nspeed by orders of magnitudes. For instance, compared with Latent3D,\n$E^3$-FaceNet speeds up the five-view generations by almost 470 times, while\nstill exceeding in generation quality. 
Our code is released at\nhttps://github.com/Aria-Zhangjl/E3-FaceNet.\n","authors":["Jinlu Zhang","Yiyi Zhou","Qiancheng Zheng","Xiaoxiong Du","Gen Luo","Jun Peng","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.06702v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16481v1","updated":"2024-08-29T12:16:55Z","published":"2024-08-29T12:16:55Z","title":"A Deep-Learning-Based Lable-free No-Reference Image Quality Assessment\n Metric: Application in Sodium MRI Denoising","summary":" New multinuclear MRI techniques, such as sodium MRI, generally suffer from\nlow image quality due to an inherently low signal. Postprocessing methods, such\nas image denoising, have been developed for image enhancement. However, the\nassessment of these enhanced images is challenging especially considering when\nthere is a lack of high resolution and high signal images as reference, such as\nin sodium MRI. No-reference Image Quality Assessment (NR-IQA) metrics are\napproaches to solve this problem. Existing learning-based NR-IQA metrics rely\non labels derived from subjective human opinions or metrics like\nSignal-to-Noise Ratio (SNR), which are either time-consuming or lack accurate\nground truths, resulting in unreliable assessment. We note that deep learning\n(DL) models have a unique characteristic in that they are specialized to a\ncharacteristic training set, meaning that deviations between the input testing\ndata from the training data will reduce prediction accuracy. Therefore, we\npropose a novel DL-based NR-IQA metric, the Model Specialization Metric (MSM),\nwhich does not depend on ground-truth images or labels. MSM measures the\ndifference between the input image and the model's prediction for evaluating\nthe quality of the input image. Experiments conducted on both simulated\ndistorted proton T1-weighted MR images and denoised sodium MR images\ndemonstrate that MSM exhibits a superior evaluation performance on various\nsimulated noises and distortions. MSM also has a substantial agreement with the\nexpert evaluations, achieving an averaged Cohen's Kappa coefficient of 0.6528,\noutperforming the existing NR-IQA metrics.\n","authors":["Shuaiyu Yuan","Tristan Whitmarsh","Dimitri A Kessler","Otso Arponen","Mary A McLean","Gabrielle Baxter","Frank Riemer","Aneurin J Kennerley","William J Brackenbury","Fiona J Gilbert","Joshua D Kaggie"],"pdf_url":"https://arxiv.org/pdf/2408.16481v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.16478v1","updated":"2024-08-29T12:15:10Z","published":"2024-08-29T12:15:10Z","title":"MICDrop: Masking Image and Depth Features via Complementary Dropout for\n Domain-Adaptive Semantic Segmentation","summary":" Unsupervised Domain Adaptation (UDA) is the task of bridging the domain gap\nbetween a labeled source domain, e.g., synthetic data, and an unlabeled target\ndomain. We observe that current UDA methods show inferior results on fine\nstructures and tend to oversegment objects with ambiguous appearance. To\naddress these shortcomings, we propose to leverage geometric information, i.e.,\ndepth predictions, as depth discontinuities often coincide with segmentation\nboundaries. We show that naively incorporating depth into current UDA methods\ndoes not fully exploit the potential of this complementary information. To this\nend, we present MICDrop, which learns a joint feature representation by masking\nimage encoder features while inversely masking depth encoder features. 
With\nthis simple yet effective complementary masking strategy, we enforce the use of\nboth modalities when learning the joint feature representation. To aid this\nprocess, we propose a feature fusion module to improve both global as well as\nlocal information sharing while being robust to errors in the depth\npredictions. We show that our method can be plugged into various recent UDA\nmethods and consistently improve results across standard UDA benchmarks,\nobtaining new state-of-the-art performances.\n","authors":["Linyan Yang","Lukas Hoyer","Mark Weber","Tobias Fischer","Dengxin Dai","Laura Leal-Taixé","Marc Pollefeys","Daniel Cremers","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2408.16478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16472v1","updated":"2024-08-29T12:04:03Z","published":"2024-08-29T12:04:03Z","title":"Creating a Segmented Pointcloud of Grapevines by Combining Multiple\n Viewpoints Through Visual Odometry","summary":" Grapevine winter pruning is a labor-intensive and repetitive process that\nsignificantly influences the quality and quantity of the grape harvest and\nproduced wine of the following season. It requires a careful and expert\ndetection of the point to be cut. Because of its complexity, repetitive nature\nand time constraint, the task requires skilled labor that needs to be trained.\nThis extended abstract presents the computer vision pipeline employed in\nproject Vinum, using detectron2 as a segmentation network and keypoint visual\nodometry to merge different observation into a single pointcloud used to make\ninformed pruning decisions.\n","authors":["Michael Adlerstein","Angelo Bratta","João Carlos Virgolino Soares","Giovanni Dessy","Miguel Fernandes","Matteo Gatti","Claudio Semini"],"pdf_url":"https://arxiv.org/pdf/2408.16472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16471v1","updated":"2024-08-29T12:01:23Z","published":"2024-08-29T12:01:23Z","title":"Improving 3D deep learning segmentation with biophysically motivated\n cell synthesis","summary":" Biomedical research increasingly relies on 3D cell culture models and\nAI-based analysis can potentially facilitate a detailed and accurate feature\nextraction on a single-cell level. However, this requires for a precise\nsegmentation of 3D cell datasets, which in turn demands high-quality ground\ntruth for training. Manual annotation, the gold standard for ground truth data,\nis too time-consuming and thus not feasible for the generation of large 3D\ntraining datasets. To address this, we present a novel framework for generating\n3D training data, which integrates biophysical modeling for realistic cell\nshape and alignment. Our approach allows the in silico generation of coherent\nmembrane and nuclei signals, that enable the training of segmentation models\nutilizing both channels for improved performance. Furthermore, we present a new\nGAN training scheme that generates not only image data but also matching\nlabels. Quantitative evaluation shows superior performance of biophysical\nmotivated synthetic training data, even outperforming manual annotation and\npretrained models. 
This underscores the potential of incorporating biophysical\nmodeling for enhancing synthetic training data quality.\n","authors":["Roman Bruch","Mario Vitacolonna","Elina Nürnberg","Simeon Sauer","Rüdiger Rudolf","Markus Reischl"],"pdf_url":"https://arxiv.org/pdf/2408.16471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16469v1","updated":"2024-08-29T12:00:11Z","published":"2024-08-29T12:00:11Z","title":"Multi-source Domain Adaptation for Panoramic Semantic Segmentation","summary":" Panoramic semantic segmentation has received widespread attention recently\ndue to its comprehensive 360\\degree field of view. However, labeling such\nimages demands greater resources compared to pinhole images. As a result, many\nunsupervised domain adaptation methods for panoramic semantic segmentation have\nemerged, utilizing real pinhole images or low-cost synthetic panoramic images.\nBut, the segmentation model lacks understanding of the panoramic structure when\nonly utilizing real pinhole images, and it lacks perception of real-world\nscenes when only adopting synthetic panoramic images. Therefore, in this paper,\nwe propose a new task of multi-source domain adaptation for panoramic semantic\nsegmentation, aiming to utilize both real pinhole and synthetic panoramic\nimages in the source domains, enabling the segmentation model to perform well\non unlabeled real panoramic images in the target domain. Further, we propose\nDeformation Transform Aligner for Panoramic Semantic Segmentation (DTA4PASS),\nwhich converts all pinhole images in the source domains into panoramic-like\nimages, and then aligns the converted source domains with the target domain.\nSpecifically, DTA4PASS consists of two main components: Unpaired Semantic\nMorphing (USM) and Distortion Gating Alignment (DGA). Firstly, in USM, the\nSemantic Dual-view Discriminator (SDD) assists in training the diffeomorphic\ndeformation network, enabling the effective transformation of pinhole images\nwithout paired panoramic views. Secondly, DGA assigns pinhole-like and\npanoramic-like features to each image by gating, and aligns these two features\nthrough uncertainty estimation. DTA4PASS outperforms the previous\nstate-of-the-art methods by 1.92% and 2.19% on the outdoor and indoor\nmulti-source domain adaptation scenarios, respectively. The source code will be\nreleased.\n","authors":["Jing Jiang","Sicheng Zhao","Jiankun Zhu","Wenbo Tang","Zhaopan Xu","Jidong Yang","Pengfei Xu","Hongxun Yao"],"pdf_url":"https://arxiv.org/pdf/2408.16469v1.pdf","comment":"9 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.06831v2","updated":"2024-08-29T11:57:39Z","published":"2024-03-11T15:48:17Z","title":"HDRTransDC: High Dynamic Range Image Reconstruction with Transformer\n Deformation Convolution","summary":" High Dynamic Range (HDR) imaging aims to generate an artifact-free HDR image\nwith realistic details by fusing multi-exposure Low Dynamic Range (LDR) images.\nCaused by large motion and severe under-/over-exposure among input LDR images,\nHDR imaging suffers from ghosting artifacts and fusion distortions. To address\nthese critical issues, we propose an HDR Transformer Deformation Convolution\n(HDRTransDC) network to generate high-quality HDR images, which consists of the\nTransformer Deformable Convolution Alignment Module (TDCAM) and the Dynamic\nWeight Fusion Block (DWFB). 
To solve the ghosting artifacts, the proposed TDCAM\nextracts long-distance content similar to the reference feature in the entire\nnon-reference features, which can accurately remove misalignment and fill the\ncontent occluded by moving objects. For the purpose of eliminating fusion\ndistortions, we propose DWFB to spatially adaptively select useful information\nacross frames to effectively fuse multi-exposed features. Extensive experiments\nshow that our method quantitatively and qualitatively achieves state-of-the-art\nperformance.\n","authors":["Shuaikang Shang","Xuejing Kang","Anlong Ming"],"pdf_url":"https://arxiv.org/pdf/2403.06831v2.pdf","comment":"We request to withdraw our manuscript due to identified issues:\n inaccuracies in the description of a submodule's composition, principles, and\n functionality in Section 3.2, and potential problems in metric calculation in\n Sections 4.2 and 4.3. To prevent the spread of misleading information, we\n believe it is necessary to temporarily withdraw the manuscript for further\n research and verification"},{"id":"http://arxiv.org/abs/2408.16467v1","updated":"2024-08-29T11:56:02Z","published":"2024-08-29T11:56:02Z","title":"Spiking Diffusion Models","summary":" Recent years have witnessed Spiking Neural Networks (SNNs) gaining attention\nfor their ultra-low energy consumption and high biological plausibility\ncompared with traditional Artificial Neural Networks (ANNs). Despite their\ndistinguished properties, the application of SNNs in the computationally\nintensive field of image generation is still under exploration. In this paper,\nwe propose the Spiking Diffusion Models (SDMs), an innovative family of\nSNN-based generative models that excel in producing high-quality samples with\nsignificantly reduced energy consumption. In particular, we propose a\nTemporal-wise Spiking Mechanism (TSM) that allows SNNs to capture more temporal\nfeatures from a bio-plasticity perspective. In addition, we propose a\nthreshold-guided strategy that can further improve the performances by up to\n16.7% without any additional training. We also make the first attempt to use\nthe ANN-SNN approach for SNN-based generation tasks. Extensive experimental\nresults reveal that our approach not only exhibits comparable performance to\nits ANN counterpart with few spiking time steps, but also outperforms previous\nSNN-based generative models by a large margin. Moreover, we also demonstrate\nthe high-quality generation ability of SDM on large-scale datasets, e.g., LSUN\nbedroom. This development marks a pivotal advancement in the capabilities of\nSNN-based generation, paving the way for future research avenues to realize\nlow-energy and low-latency generative applications. Our code is available at\nhttps://github.com/AndyCao1125/SDM.\n","authors":["Jiahang Cao","Hanzhong Guo","Ziqing Wang","Deming Zhou","Hao Cheng","Qiang Zhang","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2408.16467v1.pdf","comment":"Accepted by IEEE Transactions on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2408.16451v1","updated":"2024-08-29T11:31:28Z","published":"2024-08-29T11:31:28Z","title":"Weakly Supervised Object Detection for Automatic Tooth-marked Tongue\n Recognition","summary":" Tongue diagnosis in Traditional Chinese Medicine (TCM) is a crucial\ndiagnostic method that can reflect an individual's health status. Traditional\nmethods for identifying tooth-marked tongues are subjective and inconsistent\nbecause they rely on practitioner experience. 
We propose a novel fully\nautomated Weakly Supervised method using Vision transformer and Multiple\ninstance learning WSVM for tongue extraction and tooth-marked tongue\nrecognition. Our approach first accurately detects and extracts the tongue\nregion from clinical images, removing any irrelevant background information.\nThen, we implement an end-to-end weakly supervised object detection method. We\nutilize Vision Transformer (ViT) to process tongue images in patches and employ\nmultiple instance loss to identify tooth-marked regions with only image-level\nannotations. WSVM achieves high accuracy in tooth-marked tongue classification,\nand visualization experiments demonstrate its effectiveness in pinpointing\nthese regions. This automated approach enhances the objectivity and accuracy of\ntooth-marked tongue diagnosis. It provides significant clinical value by\nassisting TCM practitioners in making precise diagnoses and treatment\nrecommendations. Code is available at https://github.com/yc-zh/WSVM.\n","authors":["Yongcun Zhang","Jiajun Xu","Yina He","Shaozi Li","Zhiming Luo","Huangwei Lei"],"pdf_url":"https://arxiv.org/pdf/2408.16451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16450v1","updated":"2024-08-29T11:30:21Z","published":"2024-08-29T11:30:21Z","title":"What to Preserve and What to Transfer: Faithful, Identity-Preserving\n Diffusion-based Hairstyle Transfer","summary":" Hairstyle transfer is a challenging task in the image editing field that\nmodifies the hairstyle of a given face image while preserving its other\nappearance and background features. The existing hairstyle transfer approaches\nheavily rely on StyleGAN, which is pre-trained on cropped and aligned face\nimages. Hence, they struggle to generalize under challenging conditions such as\nextreme variations of head poses or focal lengths. To address this issue, we\npropose a one-stage hairstyle transfer diffusion model, HairFusion, that\napplies to real-world scenarios. Specifically, we carefully design a\nhair-agnostic representation as the input of the model, where the original hair\ninformation is thoroughly eliminated. Next, we introduce a hair align\ncross-attention (Align-CA) to accurately align the reference hairstyle with the\nface image while considering the difference in their face shape. To enhance the\npreservation of the face image's original features, we leverage adaptive hair\nblending during the inference, where the output's hair regions are estimated by\nthe cross-attention map in Align-CA and blended with non-hair areas of the face\nimage. Our experimental results show that our method achieves state-of-the-art\nperformance compared to the existing methods in preserving the integrity of\nboth the transferred hairstyle and the surrounding features. The codes are\navailable at https://github.com/cychungg/HairFusion.\n","authors":["Chaeyeon Chung","Sunghyun Park","Jeongho Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2408.16450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16448v1","updated":"2024-08-29T11:24:51Z","published":"2024-08-29T11:24:51Z","title":"Enhancing Sound Source Localization via False Negative Elimination","summary":" Sound source localization aims to localize objects emitting the sound in\nvisual scenes. Recent works obtaining impressive results typically rely on\ncontrastive learning. 
However, the common practice of randomly sampling\nnegatives in prior arts can lead to the false negative issue, where the sounds\nsemantically similar to visual instance are sampled as negatives and\nincorrectly pushed away from the visual anchor/query. As a result, this\nmisalignment of audio and visual features could yield inferior performance. To\naddress this issue, we propose a novel audio-visual learning framework which is\ninstantiated with two individual learning schemes: self-supervised predictive\nlearning (SSPL) and semantic-aware contrastive learning (SACL). SSPL explores\nimage-audio positive pairs alone to discover semantically coherent similarities\nbetween audio and visual features, while a predictive coding module for feature\nalignment is introduced to facilitate the positive-only learning. In this\nregard SSPL acts as a negative-free method to eliminate false negatives. By\ncontrast, SACL is designed to compact visual features and remove false\nnegatives, providing reliable visual anchor and audio negatives for contrast.\nDifferent from SSPL, SACL releases the potential of audio-visual contrastive\nlearning, offering an effective alternative to achieve the same goal.\nComprehensive experiments demonstrate the superiority of our approach over the\nstate-of-the-arts. Furthermore, we highlight the versatility of the learned\nrepresentation by extending the approach to audio-visual event classification\nand object detection tasks. Code and models are available at:\nhttps://github.com/zjsong/SACL.\n","authors":["Zengjie Song","Jiangshe Zhang","Yuxi Wang","Junsong Fan","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.16448v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2203.13412"},{"id":"http://arxiv.org/abs/2408.16445v1","updated":"2024-08-29T11:16:34Z","published":"2024-08-29T11:16:34Z","title":"Mismatched: Evaluating the Limits of Image Matching Approaches and\n Benchmarks","summary":" Three-dimensional (3D) reconstruction from two-dimensional images is an\nactive research field in computer vision, with applications ranging from\nnavigation and object tracking to segmentation and three-dimensional modeling.\nTraditionally, parametric techniques have been employed for this task. However,\nrecent advancements have seen a shift towards learning-based methods. Given the\nrapid pace of research and the frequent introduction of new image matching\nmethods, it is essential to evaluate them. In this paper, we present a\ncomprehensive evaluation of various image matching methods using a\nstructure-from-motion pipeline. We assess the performance of these methods on\nboth in-domain and out-of-domain datasets, identifying key limitations in both\nthe methods and benchmarks. We also investigate the impact of edge detection as\na pre-processing step. 
Our analysis reveals that image matching for 3D\nreconstruction remains an open challenge, necessitating careful selection and\ntuning of models for specific scenarios, while also highlighting mismatches in\nhow metrics currently represent method performance.\n","authors":["Sierra Bonilla","Chiara Di Vece","Rema Daher","Xinwei Ju","Danail Stoyanov","Francisco Vasconcelos","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2408.16445v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.16442v1","updated":"2024-08-29T11:07:48Z","published":"2024-08-29T11:07:48Z","title":"Integrating Features for Recognizing Human Activities through Optimized\n Parameters in Graph Convolutional Networks and Transformer Architectures","summary":" Human activity recognition is a major field of study that employs computer\nvision, machine vision, and deep learning techniques to categorize human\nactions. The field of deep learning has made significant progress, with\narchitectures that are extremely effective at capturing human dynamics. This\nstudy emphasizes the influence of feature fusion on the accuracy of activity\nrecognition. This technique addresses the limitation of conventional models,\nwhich face difficulties in identifying activities because of their limited\ncapacity to understand spatial and temporal features. The technique employs\nsensory data obtained from four publicly available datasets: HuGaDB, PKU-MMD,\nLARa, and TUG. The accuracy and F1-score of two deep learning models,\nspecifically a Transformer model and a Parameter-Optimized Graph Convolutional\nNetwork (PO-GCN), were evaluated using these datasets. The feature fusion\ntechnique integrated the final layer features from both models and inputted\nthem into a classifier. Empirical evidence demonstrates that PO-GCN outperforms\nstandard models in activity recognition. HuGaDB demonstrated a 2.3% improvement\nin accuracy and a 2.2% increase in F1-score. TUG showed a 5% increase in\naccuracy and a 0.5% rise in F1-score. On the other hand, LARa and PKU-MMD\nachieved lower accuracies of 64% and 69% respectively. This indicates that the\nintegration of features enhanced the performance of both the Transformer model\nand PO-GCN.\n","authors":["Mohammad Belal","Taimur Hassan","Abdelfatah Hassan","Nael Alsheikh","Noureldin Elhendawi","Irfan Hussain"],"pdf_url":"https://arxiv.org/pdf/2408.16442v1.pdf","comment":"6 pages, 1 figure, conference"},{"id":"http://arxiv.org/abs/2408.16431v1","updated":"2024-08-29T10:47:17Z","published":"2024-08-29T10:47:17Z","title":"Discriminative Spatial-Semantic VOS Solution: 1st Place Solution for 6th\n LSVOS","summary":" Video object segmentation (VOS) is a crucial task in computer vision, but\ncurrent VOS methods struggle with complex scenes and prolonged object motions.\nTo address these challenges, the MOSE dataset aims to enhance object\nrecognition and differentiation in complex environments, while the LVOS dataset\nfocuses on segmenting objects exhibiting long-term, intricate movements. This\nreport introduces a discriminative spatial-temporal VOS model that utilizes\ndiscriminative object features as query representations. The semantic\nunderstanding of spatial-semantic modules enables it to recognize object parts,\nwhile salient features highlight more distinctive object characteristics. 
Our\nmodel, trained on extensive VOS datasets, achieved first place\n(\\textbf{80.90\\%} $\\mathcal{J \\& F}$) on the test set of the 6th LSVOS\nchallenge in the VOS Track, demonstrating its effectiveness in tackling the\naforementioned challenges. The code will be available at\n\\href{https://github.com/yahooo-m/VOS-Solution}{code}.\n","authors":["Deshui Miao","Yameng Gu","Xin Li","Zhenyu He","Yaowei Wang","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2408.16431v1.pdf","comment":"1st Place Solution for 6th LSVOS VOS Track. arXiv admin note:\n substantial text overlap with arXiv:2406.04600"},{"id":"http://arxiv.org/abs/2408.16426v1","updated":"2024-08-29T10:36:29Z","published":"2024-08-29T10:36:29Z","title":"COIN: Control-Inpainting Diffusion Prior for Human and Camera Motion\n Estimation","summary":" Estimating global human motion from moving cameras is challenging due to the\nentanglement of human and camera motions. To mitigate the ambiguity, existing\nmethods leverage learned human motion priors, which however often result in\noversmoothed motions with misaligned 2D projections. To tackle this problem, we\npropose COIN, a control-inpainting motion diffusion prior that enables\nfine-grained control to disentangle human and camera motions. Although\npre-trained motion diffusion models encode rich motion priors, we find it\nnon-trivial to leverage such knowledge to guide global motion estimation from\nRGB videos. COIN introduces a novel control-inpainting score distillation\nsampling method to ensure well-aligned, consistent, and high-quality motion\nfrom the diffusion prior within a joint optimization framework. Furthermore, we\nintroduce a new human-scene relation loss to alleviate the scale ambiguity by\nenforcing consistency among the humans, camera, and scene. Experiments on three\nchallenging benchmarks demonstrate the effectiveness of COIN, which outperforms\nthe state-of-the-art methods in terms of global human motion estimation and\ncamera motion estimation. As an illustrative example, COIN outperforms the\nstate-of-the-art method by 33% in world joint position error (W-MPJPE) on the\nRICH dataset.\n","authors":["Jiefeng Li","Ye Yuan","Davis Rempe","Haotian Zhang","Pavlo Molchanov","Cewu Lu","Jan Kautz","Umar Iqbal"],"pdf_url":"https://arxiv.org/pdf/2408.16426v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.16412v1","updated":"2024-08-29T10:20:05Z","published":"2024-08-29T10:20:05Z","title":"Text-Enhanced Zero-Shot Action Recognition: A training-free approach","summary":" Vision-language models (VLMs) have demonstrated remarkable performance across\nvarious visual tasks, leveraging joint learning of visual and textual\nrepresentations. While these models excel in zero-shot image tasks, their\napplication to zero-shot video action recognition (ZSVAR) remains challenging\ndue to the dynamic and temporal nature of actions. Existing methods for ZS-VAR\ntypically require extensive training on specific datasets, which can be\nresource-intensive and may introduce domain biases. In this work, we propose\nText-Enhanced Action Recognition (TEAR), a simple approach to ZS-VAR that is\ntraining-free and does not require the availability of training data or\nextensive computational resources. Drawing inspiration from recent findings in\nvision and language literature, we utilize action descriptors for decomposition\nand contextual information to enhance zero-shot action recognition. 
Through\nexperiments on UCF101, HMDB51, and Kinetics-600 datasets, we showcase the\neffectiveness and applicability of our proposed approach in addressing the\nchallenges of ZS-VAR.\n","authors":["Massimo Bosetti","Shibingfeng Zhang","Bendetta Liberatori","Giacomo Zara","Elisa Ricci","Paolo Rota"],"pdf_url":"https://arxiv.org/pdf/2408.16412v1.pdf","comment":"accepted to ICPR 2024"},{"id":"http://arxiv.org/abs/2402.03973v2","updated":"2024-08-29T10:09:58Z","published":"2024-02-06T13:06:14Z","title":"A comparison between humans and AI at recognizing objects in unusual\n poses","summary":" Deep learning is closing the gap with human vision on several object\nrecognition benchmarks. Here we investigate this gap for challenging images\nwhere objects are seen in unusual poses. We find that humans excel at\nrecognizing objects in such poses. In contrast, state-of-the-art deep networks\nfor vision (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) and state-of-the-art\nlarge vision-language models (Claude 3.5, Gemini 1.5, GPT-4) are systematically\nbrittle on unusual poses, with the exception of Gemini showing excellent\nrobustness in that condition. As we limit image exposure time, human\nperformance degrades to the level of deep networks, suggesting that additional\nmental processes (requiring additional time) are necessary to identify objects\nin unusual poses. An analysis of error patterns of humans vs. networks reveals\nthat even time-limited humans are dissimilar to feed-forward deep networks. In\nconclusion, our comparison reveals that humans and deep networks rely on\ndifferent mechanisms for recognizing objects in unusual poses. Understanding\nthe nature of the mental processes taking place during extra viewing time may\nbe key to reproduce the robustness of human vision in silico.\n","authors":["Netta Ollikka","Amro Abbas","Andrea Perin","Markku Kilpeläinen","Stéphane Deny"],"pdf_url":"https://arxiv.org/pdf/2402.03973v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16395v1","updated":"2024-08-29T09:57:55Z","published":"2024-08-29T09:57:55Z","title":"IBO: Inpainting-Based Occlusion to Enhance Explainable Artificial\n Intelligence Evaluation in Histopathology","summary":" Histopathological image analysis is crucial for accurate cancer diagnosis and\ntreatment planning. While deep learning models, especially convolutional neural\nnetworks, have advanced this field, their \"black-box\" nature raises concerns\nabout interpretability and trustworthiness. Explainable Artificial Intelligence\n(XAI) techniques aim to address these concerns, but evaluating their\neffectiveness remains challenging. A significant issue with current\nocclusion-based XAI methods is that they often generate Out-of-Distribution\n(OoD) samples, leading to inaccurate evaluations. In this paper, we introduce\nInpainting-Based Occlusion (IBO), a novel occlusion strategy that utilizes a\nDenoising Diffusion Probabilistic Model to inpaint occluded regions in\nhistopathological images. By replacing cancerous areas with realistic,\nnon-cancerous tissue, IBO minimizes OoD artifacts and preserves data integrity.\nWe evaluate our method on the CAMELYON16 dataset through two phases: first, by\nassessing perceptual similarity using the Learned Perceptual Image Patch\nSimilarity (LPIPS) metric, and second, by quantifying the impact on model\npredictions through Area Under the Curve (AUC) analysis. 
Our results\ndemonstrate that IBO significantly improves perceptual fidelity, achieving\nnearly twice the improvement in LPIPS scores compared to the best existing\nocclusion strategy. Additionally, IBO increased the precision of XAI\nperformance prediction from 42% to 71% compared to traditional methods. These\nresults demonstrate IBO's potential to provide more reliable evaluations of XAI\ntechniques, benefiting histopathology and other applications. The source code\nfor this study is available at https://github.com/a-fsh-r/IBO.\n","authors":["Pardis Afshar","Sajjad Hashembeiki","Pouya Khani","Emad Fatemizadeh","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.16395v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.14700v2","updated":"2024-08-29T09:44:53Z","published":"2024-05-23T15:34:53Z","title":"Sparse-Tuning: Adapting Vision Transformers with Efficient Fine-tuning\n and Inference","summary":" Parameter-efficient fine-tuning (PEFT) has emerged as a popular solution for\nadapting pre-trained Vision Transformer (ViT) models to downstream\napplications. While current PEFT methods have achieved parameter efficiency,\nthey overlook the efficiency of computation and GPU memory during both\nfine-tuning and inference, falling short of practical requirements. In this\npaper, we propose \\textbf{Sparse-Tuning}, a novel PEFT method that accounts for\nthe information redundancy in images and videos to boost the above efficiency.\nBy sparsely preserving the semantic-relevant tokens and merging irrelevant\nones, Sparse-Tuning minimizes the quantity of tokens processed at each layer,\nleading to a quadratic reduction in computational and memory overhead. To align\nour token sparsification strategy suitably with fine-tuning purposes, we\nfurther design Dense Adapters that establish dense connections from shallow\nlayers to deeper layers. These Dense Adapters integrate multi-level local\nfeatures to enrich the current tokens, improving both token preservation and\nmodel adaptation. Empirical results on VTAB-1K, three image datasets, and two\nvideo datasets show that our Sparse-Tuning reduces GFLOPs to \\textbf{62\\%-70\\%}\nof the original ViT-B while achieving state-of-the-art performance. Source code\nis available at \\url{https://github.com/liuting20/Sparse-Tuning}.\n","authors":["Ting Liu","Xuyang Liu","Siteng Huang","Liangtao Shi","Zunnan Xu","Yi Xin","Quanjun Yin","Xiaohong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.14700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16380v1","updated":"2024-08-29T09:41:36Z","published":"2024-08-29T09:41:36Z","title":"Exploiting temporal information to detect conversational groups in\n videos and predict the next speaker","summary":" Studies in human human interaction have introduced the concept of F formation\nto describe the spatial arrangement of participants during social interactions.\nThis paper has two objectives. It aims at detecting F formations in video\nsequences and predicting the next speaker in a group conversation. The proposed\napproach exploits time information and human multimodal signals in video\nsequences. In particular, we rely on measuring the engagement level of people\nas a feature of group belonging. Our approach makes use of a recursive neural\nnetwork, the Long Short Term Memory (LSTM), to predict who will take the\nspeaker's turn in a conversation group. 
Experiments on the MatchNMingle dataset\nled to 85% true positives in group detection and 98% accuracy in predicting the\nnext speaker.\n","authors":["Lucrezia Tosato","Victor Fortier","Isabelle Bloch","Catherine Pelachaud"],"pdf_url":"https://arxiv.org/pdf/2408.16380v1.pdf","comment":"Accepted to Pattern Recognition Letter, 8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.15678v2","updated":"2024-08-29T09:37:38Z","published":"2024-08-28T10:07:17Z","title":"Deep Learning Based Speckle Filtering for Polarimetric SAR Images.\n Application to Sentinel-1","summary":" Speckle suppression in synthetic aperture radar (SAR) images is a key\nprocessing step which continues to be a research topic. A wide variety of\nmethods, using either spatially-based approaches or transform-based strategies,\nhave been developed and have shown to provide outstanding results. However,\nrecent advances in deep learning techniques and their application to SAR image\ndespeckling have been demonstrated to offer state-of-the-art results.\nUnfortunately, they have been mostly applied to single-polarimetric images. The\nextension of a deep learning-based approach for speckle removal to polarimetric\nSAR (PolSAR) images is complicated because of the complex nature of the\nmeasured covariance matrices for every image pixel, the properties of which\nmust be preserved during filtering. In this work, we propose a complete\nframework to remove speckle in polarimetric SAR images using a convolutional\nneural network. The methodology includes a reversible transformation of the\noriginal complex covariance matrix to obtain a set of real-valued intensity\nbands which are fed to the neural network. In addition, the proposed method\nincludes a change detection strategy to avoid the neural network to learn\nerroneous features in areas strongly affected by temporal changes, so that the\nnetwork only learns the underlying speckle component present in the data. The\nmethod is implemented and tested with dual-polarimetric images acquired by\nSentinel-1. Experiments show that the proposed approach offers exceptional\nresults in both speckle reduction and resolution preservation. More\nimportantly, it is also shown that the neural network is not generating\nartifacts or introducing bias in the filtered images, making them suitable for\nfurther polarimetric processing and exploitation.\n","authors":["Alejandro Mestre-Quereda","Juan M. Lopez-Sanchez"],"pdf_url":"https://arxiv.org/pdf/2408.15678v2.pdf","comment":"23 pages, 32 figures"},{"id":"http://arxiv.org/abs/2408.14342v2","updated":"2024-08-29T09:11:13Z","published":"2024-08-14T02:37:26Z","title":"Dual-Domain CLIP-Assisted Residual Optimization Perception Model for\n Metal Artifact Reduction","summary":" Metal artifacts in computed tomography (CT) imaging pose significant\nchallenges to accurate clinical diagnosis. The presence of high-density\nmetallic implants results in artifacts that deteriorate image quality,\nmanifesting in the forms of streaking, blurring, or beam hardening effects,\netc. Nowadays, various deep learning-based approaches, particularly generative\nmodels, have been proposed for metal artifact reduction (MAR). However, these\nmethods have limited perception ability in the diverse morphologies of\ndifferent metal implants with artifacts, which may generate spurious anatomical\nstructures and exhibit inferior generalization capability. 
To address the\nissues, we leverage visual-language model (VLM) to identify these morphological\nfeatures and introduce them into a dual-domain CLIP-assisted residual\noptimization perception model (DuDoCROP) for MAR. Specifically, a dual-domain\nCLIP (DuDoCLIP) is fine-tuned on the image domain and sinogram domain using\ncontrastive learning to extract semantic descriptions from anatomical\nstructures and metal artifacts. Subsequently, a diffusion model is guided by\nthe embeddings of DuDoCLIP, thereby enabling the dual-domain prior generation.\nAdditionally, we design prompt engineering for more precise image-text\ndescriptions that can enhance the model's perception capability. Then, a\ndownstream task is devised for the one-step residual optimization and\nintegration of dual-domain priors, while incorporating raw data fidelity.\nUltimately, a new perceptual indicator is proposed to validate the model's\nperception and generation performance. With the assistance of DuDoCLIP, our\nDuDoCROP exhibits at least 63.7% higher generalization capability compared to\nthe baseline model. Numerical experiments demonstrate that the proposed method\ncan generate more realistic image structures and outperform other SOTA\napproaches both qualitatively and quantitatively.\n","authors":["Xinrui Zhang","Ailong Cai","Shaoyu Wang","Linyuan Wang","Zhizhong Zheng","Lei Li","Bin Yan"],"pdf_url":"https://arxiv.org/pdf/2408.14342v2.pdf","comment":"14 pages, 18 figures"},{"id":"http://arxiv.org/abs/2408.13744v2","updated":"2024-08-29T09:08:54Z","published":"2024-08-25T07:08:58Z","title":"Enhancing Adaptive Deep Networks for Image Classification via\n Uncertainty-aware Decision Fusion","summary":" Handling varying computational resources is a critical issue in modern AI\napplications. Adaptive deep networks, featuring the dynamic employment of\nmultiple classifier heads among different layers, have been proposed to address\nclassification tasks under varying computing resources. Existing approaches\ntypically utilize the last classifier supported by the available resources for\ninference, as they believe that the last classifier always performs better\nacross all classes. However, our findings indicate that earlier classifier\nheads can outperform the last head for certain classes. Based on this\nobservation, we introduce the Collaborative Decision Making (CDM) module, which\nfuses the multiple classifier heads to enhance the inference performance of\nadaptive deep networks. CDM incorporates an uncertainty-aware fusion method\nbased on evidential deep learning (EDL), that utilizes the reliability\n(uncertainty values) from the first c-1 classifiers to improve the c-th\nclassifier' accuracy. We also design a balance term that reduces fusion\nsaturation and unfairness issues caused by EDL constraints to improve the\nfusion quality of CDM. Finally, a regularized training strategy that uses the\nlast classifier to guide the learning process of early classifiers is proposed\nto further enhance the CDM module's effect, called the Guided Collaborative\nDecision Making (GCDM) framework. The experimental evaluation demonstrates the\neffectiveness of our approaches. Results on ImageNet datasets show CDM and GCDM\nobtain 0.4% to 2.8% accuracy improvement (under varying computing resources) on\npopular adaptive networks. 
The code is available at the link\nhttps://github.com/Meteor-Stars/GCDM_AdaptiveNet.\n","authors":["Xu Zhang","Zhipeng Xie","Haiyang Yu","Qitong Wang","Peng Wang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.13744v2.pdf","comment":"13 pages, 27 figures. In ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2408.16357v1","updated":"2024-08-29T08:56:48Z","published":"2024-08-29T08:56:48Z","title":"Law of Vision Representation in MLLMs","summary":" We present the \"Law of Vision Representation\" in multimodal large language\nmodels (MLLMs). It reveals a strong correlation between the combination of\ncross-modal alignment, correspondence in vision representation, and MLLM\nperformance. We quantify the two factors using the cross-modal Alignment and\nCorrespondence score (AC score). Through extensive experiments involving\nthirteen different vision representation settings and evaluations across eight\nbenchmarks, we find that the AC score is linearly correlated to model\nperformance. By leveraging this relationship, we are able to identify and train\nthe optimal vision representation only, which does not require finetuning the\nlanguage model every time, resulting in a 99.7% reduction in computational\ncost.\n","authors":["Shijia Yang","Bohan Zhai","Quanzeng You","Jianbo Yuan","Hongxia Yang","Chenfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2408.16357v1.pdf","comment":"The code is available at\n https://github.com/bronyayang/Law_of_Vision_Representation_in_MLLMs"},{"id":"http://arxiv.org/abs/2401.03749v3","updated":"2024-08-29T08:52:40Z","published":"2024-01-08T09:20:46Z","title":"A Flying Bird Object Detection Method for Surveillance Video","summary":" Aiming at the specific characteristics of flying bird objects in surveillance\nvideo, such as the typically non-obvious features in single-frame images, small\nsize in most instances, and asymmetric shapes, this paper proposes a Flying\nBird Object Detection method for Surveillance Video (FBOD-SV). Firstly, a new\nfeature aggregation module, the Correlation Attention Feature Aggregation\n(Co-Attention-FA) module, is designed to aggregate the features of the flying\nbird object according to the bird object's correlation on multiple consecutive\nframes of images. Secondly, a Flying Bird Object Detection Network (FBOD-Net)\nwith down-sampling followed by up-sampling is designed, which utilizes a large\nfeature layer that fuses fine spatial information and large receptive field\ninformation to detect special multi-scale (mostly small-scale) bird objects.\nFinally, the SimOTA dynamic label allocation method is applied to One-Category\nobject detection, and the SimOTA-OC dynamic label strategy is proposed to solve\nthe difficult problem of label allocation caused by irregular flying bird\nobjects. In this paper, the performance of the FBOD-SV is validated using\nexperimental datasets of flying bird objects in traction substation\nsurveillance videos. 
The experimental results show that the FBOD-SV effectively\nimproves the detection performance of flying bird objects in surveillance\nvideo.\n","authors":["Ziwei Sun","Zexi Hua","Hengchao Li","Yan Li"],"pdf_url":"https://arxiv.org/pdf/2401.03749v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16355v1","updated":"2024-08-29T08:51:25Z","published":"2024-08-29T08:51:25Z","title":"NeRF-CA: Dynamic Reconstruction of X-ray Coronary Angiography with\n Extremely Sparse-views","summary":" Dynamic three-dimensional (4D) reconstruction from two-dimensional X-ray\ncoronary angiography (CA) remains a significant clinical problem. Challenges\ninclude sparse-view settings, intra-scan motion, and complex vessel morphology\nsuch as structure sparsity and background occlusion. Existing CA reconstruction\nmethods often require extensive user interaction or large training datasets. On\nthe other hand, Neural Radiance Field (NeRF), a promising deep learning\ntechnique, has successfully reconstructed high-fidelity static scenes for\nnatural and medical scenes. Recent work, however, identified that sparse-views,\nbackground occlusion, and dynamics still pose a challenge when applying NeRF in\nthe X-ray angiography context. Meanwhile, many successful works for natural\nscenes propose regularization for sparse-view reconstruction or scene\ndecomposition to handle dynamics. However, these techniques do not directly\ntranslate to the CA context, where both challenges and background occlusion are\nsignificant. This paper introduces NeRF-CA, the first step toward a 4D CA\nreconstruction method that achieves reconstructions from sparse coronary\nangiograms with cardiac motion. We leverage the motion of the coronary artery\nto decouple the scene into a dynamic coronary artery component and static\nbackground. We combine this scene decomposition with tailored regularization\ntechniques. These techniques enforce the separation of the coronary artery from\nthe background by enforcing dynamic structure sparsity and scene smoothness. By\nuniquely combining these approaches, we achieve 4D reconstructions from as few\nas four angiogram sequences. This setting aligns with clinical workflows while\noutperforming state-of-the-art X-ray sparse-view NeRF reconstruction\ntechniques. We validate our approach quantitatively and qualitatively using 4D\nphantom datasets and ablation studies.\n","authors":["Kirsten W. H. Maas","Danny Ruijters","Anna Vilanova","Nicola Pezzotti"],"pdf_url":"https://arxiv.org/pdf/2408.16355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16343v1","updated":"2024-08-29T08:26:00Z","published":"2024-08-29T08:26:00Z","title":"Toward Robust Early Detection of Alzheimer's Disease via an Integrated\n Multimodal Learning Approach","summary":" Alzheimer's Disease (AD) is a complex neurodegenerative disorder marked by\nmemory loss, executive dysfunction, and personality changes. Early diagnosis is\nchallenging due to subtle symptoms and varied presentations, often leading to\nmisdiagnosis with traditional unimodal diagnostic methods due to their limited\nscope. This study introduces an advanced multimodal classification model that\nintegrates clinical, cognitive, neuroimaging, and EEG data to enhance\ndiagnostic accuracy. The model incorporates a feature tagger with a tabular\ndata coding architecture and utilizes the TimesBlock module to capture\nintricate temporal patterns in Electroencephalograms (EEG) data. 
By employing\nCross-modal Attention Aggregation module, the model effectively fuses Magnetic\nResonance Imaging (MRI) spatial information with EEG temporal data,\nsignificantly improving the distinction between AD, Mild Cognitive Impairment,\nand Normal Cognition. Simultaneously, we have constructed the first AD\nclassification dataset that includes three modalities: EEG, MRI, and tabular\ndata. Our innovative approach aims to facilitate early diagnosis and\nintervention, potentially slowing the progression of AD. The source code and\nour private ADMC dataset are available at https://github.com/JustlfC03/MSTNet.\n","authors":["Yifei Chen","Shenghao Zhu","Zhaojie Fang","Chang Liu","Binfeng Zou","Yuhe Wang","Shuo Chang","Fan Jia","Feiwei Qin","Jin Fan","Yong Peng","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16343v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.16340v1","updated":"2024-08-29T08:23:57Z","published":"2024-08-29T08:23:57Z","title":"Learned Image Transmission with Hierarchical Variational Autoencoder","summary":" In this paper, we introduce an innovative hierarchical joint source-channel\ncoding (HJSCC) framework for image transmission, utilizing a hierarchical\nvariational autoencoder (VAE). Our approach leverages a combination of\nbottom-up and top-down paths at the transmitter to autoregressively generate\nmultiple hierarchical representations of the original image. These\nrepresentations are then directly mapped to channel symbols for transmission by\nthe JSCC encoder. We extend this framework to scenarios with a feedback link,\nmodeling transmission over a noisy channel as a probabilistic sampling process\nand deriving a novel generative formulation for JSCC with feedback. Compared\nwith existing approaches, our proposed HJSCC provides enhanced adaptability by\ndynamically adjusting transmission bandwidth, encoding these representations\ninto varying amounts of channel symbols. Additionally, we introduce a rate\nattention module to guide the JSCC encoder in optimizing its encoding strategy\nbased on prior information. Extensive experiments on images of varying\nresolutions demonstrate that our proposed model outperforms existing baselines\nin rate-distortion performance and maintains robustness against channel noise.\n","authors":["Guangyi Zhang","Hanlei Li","Yunlong Cai","Qiyu Hu","Guanding Yu","Runmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.16340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16325v1","updated":"2024-08-29T08:00:07Z","published":"2024-08-29T08:00:07Z","title":"P2P-Bridge: Diffusion Bridges for 3D Point Cloud Denoising","summary":" In this work, we tackle the task of point cloud denoising through a novel\nframework that adapts Diffusion Schr\\\"odinger bridges to points clouds. Unlike\nprevious approaches that predict point-wise displacements from point features\nor learned noise distributions, our method learns an optimal transport plan\nbetween paired point clouds. Experiments on object datasets like PU-Net and\nreal-world datasets such as ScanNet++ and ARKitScenes show that P2P-Bridge\nachieves significant improvements over existing methods. While our approach\ndemonstrates strong results using only point coordinates, we also show that\nincorporating additional features, such as color information or point-wise\nDINOv2 features, further enhances the performance. 
Code and pretrained models\nare available at https://p2p-bridge.github.io.\n","authors":["Mathias Vogel","Keisuke Tateno","Marc Pollefeys","Federico Tombari","Marie-Julie Rakotosaona","Francis Engelmann"],"pdf_url":"https://arxiv.org/pdf/2408.16325v1.pdf","comment":"ECCV 2024 Project page: https://p2p-bridge.github.io"},{"id":"http://arxiv.org/abs/2408.16322v1","updated":"2024-08-29T07:49:31Z","published":"2024-08-29T07:49:31Z","title":"BEVal: A Cross-dataset Evaluation Study of BEV Segmentation Models for\n Autononomous Driving","summary":" Current research in semantic bird's-eye view segmentation for autonomous\ndriving focuses solely on optimizing neural network models using a single\ndataset, typically nuScenes. This practice leads to the development of highly\nspecialized models that may fail when faced with different environments or\nsensor setups, a problem known as domain shift. In this paper, we conduct a\ncomprehensive cross-dataset evaluation of state-of-the-art BEV segmentation\nmodels to assess their performance across different training and testing\ndatasets and setups, as well as different semantic categories. We investigate\nthe influence of different sensors, such as cameras and LiDAR, on the models'\nability to generalize to diverse conditions and scenarios. Additionally, we\nconduct multi-dataset training experiments that improve models' BEV\nsegmentation performance compared to single-dataset training. Our work\naddresses the gap in evaluating BEV segmentation models under cross-dataset\nvalidation. And our findings underscore the importance of enhancing model\ngeneralizability and adaptability to ensure more robust and reliable BEV\nsegmentation approaches for autonomous driving applications.\n","authors":["Manuel Alejandro Diaz-Zapata","Wenqian Liu","Robin Baruffa","Christian Laugier"],"pdf_url":"https://arxiv.org/pdf/2408.16322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16314v1","updated":"2024-08-29T07:32:01Z","published":"2024-08-29T07:32:01Z","title":"ResVG: Enhancing Relation and Semantic Understanding in Multiple\n Instances for Visual Grounding","summary":" Visual grounding aims to localize the object referred to in an image based on\na natural language query. Although progress has been made recently, accurately\nlocalizing target objects within multiple-instance distractions (multiple\nobjects of the same category as the target) remains a significant challenge.\nExisting methods demonstrate a significant performance drop when there are\nmultiple distractions in an image, indicating an insufficient understanding of\nthe fine-grained semantics and spatial relationships between objects. In this\npaper, we propose a novel approach, the Relation and Semantic-sensitive Visual\nGrounding (ResVG) model, to address this issue. Firstly, we enhance the model's\nunderstanding of fine-grained semantics by injecting semantic prior information\nderived from text queries into the model. This is achieved by leveraging\ntext-to-image generation models to produce images representing the semantic\nattributes of target objects described in queries. Secondly, we tackle the lack\nof training samples with multiple distractions by introducing a\nrelation-sensitive data augmentation method. This method generates additional\ntraining data by synthesizing images containing multiple objects of the same\ncategory and pseudo queries based on their spatial relationships. 
The proposed\nReSVG model significantly improves the model's ability to comprehend both\nobject semantics and spatial relations, leading to enhanced performance in\nvisual grounding tasks, particularly in scenarios with multiple-instance\ndistractions. We conduct extensive experiments to validate the effectiveness of\nour methods on five datasets. Code is available at\nhttps://github.com/minghangz/ResVG.\n","authors":["Minghang Zheng","Jiahua Zhang","Qingchao Chen","Yuxin Peng","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16314v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.16313v1","updated":"2024-08-29T07:22:16Z","published":"2024-08-29T07:22:16Z","title":"FA-YOLO: Research On Efficient Feature Selection YOLO Improved Algorithm\n Based On FMDS and AGMF Modules","summary":" Over the past few years, the YOLO series of models has emerged as one of the\ndominant methodologies in the realm of object detection. Many studies have\nadvanced these baseline models by modifying their architectures, enhancing data\nquality, and developing new loss functions. However, current models still\nexhibit deficiencies in processing feature maps, such as overlooking the fusion\nof cross-scale features and a static fusion approach that lacks the capability\nfor dynamic feature adjustment. To address these issues, this paper introduces\nan efficient Fine-grained Multi-scale Dynamic Selection Module (FMDS Module),\nwhich applies a more effective dynamic feature selection and fusion method on\nfine-grained multi-scale feature maps, significantly enhancing the detection\naccuracy of small, medium, and large-sized targets in complex environments.\nFurthermore, this paper proposes an Adaptive Gated Multi-branch Focus Fusion\nModule (AGMF Module), which utilizes multiple parallel branches to perform\ncomplementary fusion of various features captured by the gated unit branch,\nFMDS Module branch, and TripletAttention branch. This approach further enhances\nthe comprehensiveness, diversity, and integrity of feature fusion. This paper\nhas integrated the FMDS Module, AGMF Module, into Yolov9 to develop a novel\nobject detection model named FA-YOLO. Extensive experimental results show that\nunder identical experimental conditions, FA-YOLO achieves an outstanding 66.1%\nmean Average Precision (mAP) on the PASCAL VOC 2007 dataset, representing 1.0%\nimprovement over YOLOv9's 65.1%. Additionally, the detection accuracies of\nFA-YOLO for small, medium, and large targets are 44.1%, 54.6%, and 70.8%,\nrespectively, showing improvements of 2.0%, 3.1%, and 0.9% compared to YOLOv9's\n42.1%, 51.5%, and 69.9%.\n","authors":["Yukang Huo","Mingyuan Yao","Qingbin Tian","Tonghao Wang","Ruifeng Wang","Haihua Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16313v1.pdf","comment":"11 pages and 4 figures"},{"id":"http://arxiv.org/abs/2408.16310v1","updated":"2024-08-29T07:16:28Z","published":"2024-08-29T07:16:28Z","title":"Bootstrap Segmentation Foundation Model under Distribution Shift via\n Object-Centric Learning","summary":" Foundation models have made incredible strides in achieving zero-shot or\nfew-shot generalization, leveraging prompt engineering to mimic the\nproblem-solving approach of human intelligence. However, when it comes to some\nfoundation models like Segment Anything, there is still a challenge in\nperforming well on out-of-distribution data, including camouflaged and medical\nimages. 
Inconsistent prompting strategies during fine-tuning and testing\nfurther compound the issue, leading to decreased performance. Drawing\ninspiration from how human cognition processes new environments, we introduce\nSlotSAM, a method that reconstructs features from the encoder in a\nself-supervised manner to create object-centric representations. These\nrepresentations are then integrated into the foundation model, bolstering its\nobject-level perceptual capabilities while reducing the impact of\ndistribution-related variables. The beauty of SlotSAM lies in its simplicity\nand adaptability to various tasks, making it a versatile solution that\nsignificantly enhances the generalization abilities of foundation models.\nThrough limited parameter fine-tuning in a bootstrap manner, our approach paves\nthe way for improved generalization in novel environments. The code is\navailable at github.com/lytang63/SlotSAM.\n","authors":["Luyao Tang","Yuxuan Yuan","Chaoqi Chen","Kunze Huang","Xinghao Ding","Yue Huang"],"pdf_url":"https://arxiv.org/pdf/2408.16310v1.pdf","comment":"This work is accepted by ECCV 2024 EVAL-FoMo Workshop"},{"id":"http://arxiv.org/abs/2407.13307v2","updated":"2024-08-29T07:12:39Z","published":"2024-07-18T09:10:25Z","title":"Conformal Performance Range Prediction for Segmentation Output Quality\n Control","summary":" Recent works have introduced methods to estimate segmentation performance\nwithout ground truth, relying solely on neural network softmax outputs. These\ntechniques hold potential for intuitive output quality control. However, such\nperformance estimates rely on calibrated softmax outputs, which is often not\nthe case in modern neural networks. Moreover, the estimates do not take into\naccount inherent uncertainty in segmentation tasks. These limitations may\nrender precise performance predictions unattainable, restricting the practical\napplicability of performance estimation methods. To address these challenges,\nwe develop a novel approach for predicting performance ranges with statistical\nguarantees of containing the ground truth with a user specified probability.\nOur method leverages sampling-based segmentation uncertainty estimation to\nderive heuristic performance ranges, and applies split conformal prediction to\ntransform these estimates into rigorous prediction ranges that meet the desired\nguarantees. We demonstrate our approach on the FIVES retinal vessel\nsegmentation dataset and compare five commonly used sampling-based uncertainty\nestimation techniques. Our results show that it is possible to achieve the\ndesired coverage with small prediction ranges, highlighting the potential of\nperformance range prediction as a valuable tool for output quality control.\n","authors":["Anna M. Wundram","Paul Fischer","Michael Muehlebach","Lisa M. Koch","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2407.13307v2.pdf","comment":"Accepted as an oral presentation at MICCAI UNSURE 2024"},{"id":"http://arxiv.org/abs/2408.16305v1","updated":"2024-08-29T07:11:50Z","published":"2024-08-29T07:11:50Z","title":"Semantics-Oriented Multitask Learning for DeepFake Detection: A Joint\n Embedding Approach","summary":" In recent years, the multimedia forensics and security community has seen\nremarkable progress in multitask learning for DeepFake (i.e., face forgery)\ndetection. The prevailing strategy has been to frame DeepFake detection as a\nbinary classification problem augmented by manipulation-oriented auxiliary\ntasks. 
This strategy focuses on learning features specific to face\nmanipulations, which exhibit limited generalizability. In this paper, we delve\ndeeper into semantics-oriented multitask learning for DeepFake detection,\nleveraging the relationships among face semantics via joint embedding. We first\npropose an automatic dataset expansion technique that broadens current face\nforgery datasets to support semantics-oriented DeepFake detection tasks at both\nthe global face attribute and local face region levels. Furthermore, we resort\nto joint embedding of face images and their corresponding labels (depicted by\ntextual descriptions) for prediction. This approach eliminates the need for\nmanually setting task-agnostic and task-specific parameters typically required\nwhen predicting labels directly from images. In addition, we employ a bi-level\noptimization strategy to dynamically balance the fidelity loss weightings of\nvarious tasks, making the training process fully automated. Extensive\nexperiments on six DeepFake datasets show that our method improves the\ngeneralizability of DeepFake detection and, meanwhile, renders some degree of\nmodel interpretation by providing human-understandable explanations.\n","authors":["Mian Zou","Baosheng Yu","Yibing Zhan","Siwei Lyu","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2408.16305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16303v1","updated":"2024-08-29T07:09:33Z","published":"2024-08-29T07:09:33Z","title":"Enhanced Control for Diffusion Bridge in Image Restoration","summary":" Image restoration refers to the process of restoring a damaged low-quality\nimage back to its corresponding high-quality image. Typically, we use\nconvolutional neural networks to directly learn the mapping from low-quality\nimages to high-quality images, achieving image restoration. Recently, a special\ntype of diffusion bridge model has achieved more advanced results in image\nrestoration. It can transform the direct mapping from low-quality to\nhigh-quality images into a diffusion process, restoring low-quality images\nthrough a reverse process. However, the current diffusion bridge restoration\nmodels do not emphasize the idea of conditional control, which may affect\nperformance. This paper introduces the ECDB model, which enhances the control of\nthe diffusion bridge with low-quality images as conditions. Moreover, in response\nto the characteristic of diffusion models having a low denoising level at larger\nvalues of \\(\\bm t \\), we also propose a Conditional Fusion Schedule, which more\neffectively handles the conditional feature information of various modules.\nExperimental results prove that the ECDB model has achieved state-of-the-art\nresults in many image restoration tasks, including deraining, inpainting and\nsuper-resolution. Code is available at https://github.com/Hammour-steak/ECDB.\n","authors":["Conghan Yue","Zhengwei Peng","Junlong Ma","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.16303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15844v2","updated":"2024-08-29T07:08:21Z","published":"2024-08-28T15:04:52Z","title":"Shot Segmentation Based on Von Neumann Entropy for Key Frame Extraction","summary":" Video key frame extraction is important in various fields, such as video\nsummary, retrieval, and compression. Therefore, we suggest a video key frame\nextraction algorithm based on shot segmentation using Von Neumann entropy. 
The\nsegmentation of shots is achieved through the computation of Von Neumann\nentropy of the similarity matrix among frames within the video sequence. The\ninitial frame of each shot is selected as key frames, which combines the\ntemporal sequence information of frames. The experimental results show the\nextracted key frames can fully and accurately represent the original video\ncontent while minimizing the number of repeated frames.\n","authors":["Xueqing Zhang","Di Fu","Naihao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15844v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.15996v2","updated":"2024-08-29T06:54:11Z","published":"2024-08-28T17:59:05Z","title":"Spatio-Temporal Context Prompting for Zero-Shot Action Detection","summary":" Spatio-temporal action detection encompasses the tasks of localizing and\nclassifying individual actions within a video. Recent works aim to enhance this\nprocess by incorporating interaction modeling, which captures the relationship\nbetween people and their surrounding context. However, these approaches have\nprimarily focused on fully-supervised learning, and the current limitation lies\nin the lack of generalization capability to recognize unseen action categories.\nIn this paper, we aim to adapt the pretrained image-language models to detect\nunseen actions. To this end, we propose a method which can effectively leverage\nthe rich knowledge of visual-language models to perform Person-Context\nInteraction. Meanwhile, our Context Prompting module will utilize contextual\ninformation to prompt labels, thereby enhancing the generation of more\nrepresentative text features. Moreover, to address the challenge of recognizing\ndistinct actions by multiple people at the same timestamp, we design the\nInterest Token Spotting mechanism which employs pretrained visual knowledge to\nfind each person's interest context tokens, and then these tokens will be used\nfor prompting to generate text features tailored to each individual. To\nevaluate the ability to detect unseen actions, we propose a comprehensive\nbenchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our\nmethod achieves superior results compared to previous approaches and can be\nfurther extended to multi-action videos, bringing it closer to real-world\napplications. The code and data can be found in\nhttps://webber2933.github.io/ST-CLIP-project-page.\n","authors":["Wei-Jhe Huang","Min-Hung Chen","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.15996v2.pdf","comment":"Project page: https://webber2933.github.io/ST-CLIP-project-page"},{"id":"http://arxiv.org/abs/2408.16296v1","updated":"2024-08-29T06:54:03Z","published":"2024-08-29T06:54:03Z","title":"Rethinking Sparse Lexical Representations for Image Retrieval in the Age\n of Rising Multi-Modal Large Language Models","summary":" In this paper, we rethink sparse lexical representations for image retrieval.\nBy utilizing multi-modal large language models (M-LLMs) that support visual\nprompting, we can extract image features and convert them into textual data,\nenabling us to utilize efficient sparse retrieval algorithms employed in\nnatural language processing for image retrieval tasks. To assist the LLM in\nextracting image features, we apply data augmentation techniques for key\nexpansion and analyze the impact with a metric for relevance between images and\ntextual data. 
We empirically show the superior precision and recall performance\nof our image retrieval method compared to conventional vision-language\nmodel-based methods on the MS-COCO, PASCAL VOC, and NUS-WIDE datasets in a\nkeyword-based image retrieval scenario, where keywords serve as search queries.\nWe also demonstrate that the retrieval performance can be improved by\niteratively incorporating keywords into search queries.\n","authors":["Kengo Nakata","Daisuke Miyashita","Youyang Ng","Yasuto Hoshi","Jun Deguchi"],"pdf_url":"https://arxiv.org/pdf/2408.16296v1.pdf","comment":"Accepted to ECCV 2024 Workshops: 2nd Workshop on Traditional Computer\n Vision in the Age of Deep Learning (TradiCV)"},{"id":"http://arxiv.org/abs/2407.18520v3","updated":"2024-08-29T06:52:45Z","published":"2024-07-26T05:29:24Z","title":"Text-Region Matching for Multi-Label Image Recognition with Missing\n Labels","summary":" Recently, large-scale visual language pre-trained (VLP) models have\ndemonstrated impressive performance across various downstream tasks. Motivated\nby these advancements, pioneering efforts have emerged in multi-label image\nrecognition with missing labels, leveraging VLP prompt-tuning technology.\nHowever, they usually cannot match text and vision features well, due to\ncomplicated semantics gaps and missing labels in a multi-label image. To tackle\nthis challenge, we propose $\\textbf{T}$ext-$\\textbf{R}$egion\n$\\textbf{M}$atching for optimizing $\\textbf{M}$ulti-$\\textbf{L}$abel prompt\ntuning, namely TRM-ML, a novel method for enhancing meaningful cross-modal\nmatching. Compared to existing methods, we advocate exploring the information\nof category-aware regions rather than the entire image or pixels, which\ncontributes to bridging the semantic gap between textual and visual\nrepresentations in a one-to-one matching manner. Concurrently, we further\nintroduce multimodal contrastive learning to narrow the semantic gap between\ntextual and visual modalities and establish intra-class and inter-class\nrelationships. Additionally, to deal with missing labels, we propose a\nmultimodal category prototype that leverages intra- and inter-category semantic\nrelationships to estimate unknown labels, facilitating pseudo-label generation.\nExtensive experiments on the MS-COCO, PASCAL VOC, Visual Genome, NUS-WIDE, and\nCUB-200-211 benchmark datasets demonstrate that our proposed framework\noutperforms the state-of-the-art methods by a significant margin. Our code is\navailable here: https://github.com/yu-gi-oh-leilei/TRM-ML.\n","authors":["Leilei Ma","Hongxing Xie","Lei Wang","Yanping Fu","Dengdi Sun","Haifeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.18520v3.pdf","comment":"Accepted to ACM International Conference on Multimedia (ACM MM) 2024"},{"id":"http://arxiv.org/abs/2408.16289v1","updated":"2024-08-29T06:40:34Z","published":"2024-08-29T06:40:34Z","title":"Convolutional Neural Network Compression Based on Low-Rank Decomposition","summary":" Deep neural networks typically impose significant computational loads and\nmemory consumption. Moreover, the large parameters pose constraints on\ndeploying the model on edge devices such as embedded systems. Tensor\ndecomposition offers a clear advantage in compressing large-scale weight\ntensors. Nevertheless, direct utilization of low-rank decomposition typically\nleads to significant accuracy loss. This paper proposes a model compression\nmethod that integrates Variational Bayesian Matrix Factorization (VBMF) with\northogonal regularization. 
Initially, the model undergoes over-parameterization\nand training, with orthogonal regularization applied to enhance its likelihood\nof achieving the accuracy of the original model. Secondly, VBMF is employed to\nestimate the rank of the weight tensor at each layer. Our framework is\nsufficiently general to apply to other convolutional neural networks and easily\nadaptable to incorporate other tensor decomposition methods. Experimental\nresults show that for both high and low compression ratios, our compression\nmodel exhibits advanced performance.\n","authors":["Yaping He","Linhao Jiang","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16289v1.pdf","comment":"10 pages, 1 figures"},{"id":"http://arxiv.org/abs/2405.07288v2","updated":"2024-08-29T06:22:48Z","published":"2024-05-12T14:01:05Z","title":"Erasing Concepts from Text-to-Image Diffusion Models with Few-shot\n Unlearning","summary":" Generating images from text has become easier because of the scaling of\ndiffusion models and advancements in the field of vision and language. These\nmodels are trained using vast amounts of data from the Internet. Hence, they\noften contain undesirable content such as copyrighted material. As it is\nchallenging to remove such data and retrain the models, methods for erasing\nspecific concepts from pre-trained models have been investigated. We propose a\nnovel concept-erasure method that updates the text encoder using few-shot\nunlearning in which a few real images are used. The discussion regarding the\ngenerated images after erasing a concept has been lacking. While there are\nmethods for specifying the transition destination for concepts, the validity of\nthe specified concepts is unclear. Our method implicitly achieves this by\ntransitioning to the latent concepts inherent in the model or the images. Our\nmethod can erase a concept within 10 s, making concept erasure more accessible\nthan ever before. Implicitly transitioning to related concepts leads to more\nnatural concept erasure. We applied the proposed method to various concepts and\nconfirmed that concept erasure can be achieved tens to hundreds of times faster\nthan with current methods. By varying the parameters to be updated, we obtained\nresults suggesting that, like previous research, knowledge is primarily\naccumulated in the feed-forward networks of the text encoder. Our code is\navailable at \\url{https://github.com/fmp453/few-shot-erasing}\n","authors":["Masane Fuchi","Tomohiro Takagi"],"pdf_url":"https://arxiv.org/pdf/2405.07288v2.pdf","comment":"25 pages, 28 figures, accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2407.07046v2","updated":"2024-08-29T06:15:55Z","published":"2024-07-09T17:07:29Z","title":"CorMulT: A Semi-supervised Modality Correlation-aware Multimodal\n Transformer for Sentiment Analysis","summary":" Multimodal sentiment analysis is an active research area that combines\nmultiple data modalities, e.g., text, image and audio, to analyze human\nemotions and benefits a variety of applications. Existing multimodal sentiment\nanalysis methods can be classified as modality interaction-based methods,\nmodality transformation-based methods and modality similarity-based methods.\nHowever, most of these methods highly rely on the strong correlations between\nmodalities, and cannot fully uncover and utilize the correlations between\nmodalities to enhance sentiment analysis. Therefore, these methods usually\nachieve bad performance for identifying the sentiment of multimodal data with\nweak correlations. 
To address this issue, we propose a two-stage\nsemi-supervised model termed Correlation-aware Multimodal Transformer (CorMulT),\nwhich consists of a pre-training stage and a prediction stage. At the pre-training\nstage, a modality correlation contrastive learning module is designed to\nefficiently learn modality correlation coefficients between different\nmodalities. At the prediction stage, the learned correlation coefficients are\nfused with modality representations to make the sentiment prediction. According\nto the experiments on the popular multimodal dataset CMU-MOSEI, CorMulT\nclearly surpasses state-of-the-art multimodal sentiment analysis methods.\n","authors":["Yangmin Li","Ruiqi Zhu","Wengen Li"],"pdf_url":"https://arxiv.org/pdf/2407.07046v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16102v3","updated":"2024-08-29T05:56:45Z","published":"2023-03-28T16:11:31Z","title":"KeyMatchNet: Zero-Shot Pose Estimation in 3D Point Clouds by Generalized\n Keypoint Matching","summary":" In this paper, we present KeyMatchNet, a novel network for zero-shot pose\nestimation in 3D point clouds. Our method uses only depth information, making\nit more applicable for many industrial use cases, as color information is\nseldom available. The network is composed of two parallel components for\ncomputing object and scene features. The features are then combined to create\nmatches used for pose estimation. The parallel structure allows for\npre-processing of the individual parts, which decreases the run-time. Using a\nzero-shot network allows for a very short set-up time, as it is not necessary\nto train models for new objects. However, as the network is not trained for the\nspecific object, zero-shot pose estimation methods generally have lower\naccuracy compared with conventional methods. To address this, we reduce the\ncomplexity of the task by including the scenario information during training.\nThis is typically not feasible as collecting real data for new tasks\ndrastically increases the cost. However, for zero-shot pose estimation,\ntraining for new objects is not necessary and the expensive data collection can\nthus be performed only once. Our method is trained on 1,500 objects and is only\ntested on unseen objects. We demonstrate that the trained network can not only\naccurately estimate poses for novel objects, but also generalize to objects\noutside of the trained class. Test results are also\nshown on real data. We believe that the presented method is valuable for many\nreal-world scenarios. Project page available at keymatchnet.github.io\n","authors":["Frederik Hagelskjær","Rasmus Laurvig Haugaard"],"pdf_url":"https://arxiv.org/pdf/2303.16102v3.pdf","comment":"8 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.16277v1","updated":"2024-08-29T05:56:34Z","published":"2024-08-29T05:56:34Z","title":"Fine-grained Classification of Port Wine Stains Using Optical Coherence\n Tomography Angiography","summary":" Accurate classification of port wine stains (PWS, vascular malformations\npresent at birth) is critical for subsequent treatment planning. However, the\ncurrent method of classifying PWS based on the external skin appearance rarely\nreflects the underlying angiopathological heterogeneity of PWS lesions,\nresulting in inconsistent outcomes with the common vascular-targeted\nphotodynamic therapy (V-PDT) treatments. Conversely, optical coherence\ntomography angiography (OCTA) is an ideal tool for visualizing the vascular\nmalformations of PWS. 
Previous studies have shown no significant correlation\nbetween OCTA quantitative metrics and the PWS subtypes determined by the\ncurrent classification approach. This study proposes a new classification\napproach for PWS using both OCT and OCTA. By examining the hypodermic\nhistopathology and vascular structure of PWS, we have devised a fine-grained\nclassification method that subdivides PWS into five distinct types. To assess\nthe angiopathological differences of various PWS subtypes, we have analyzed six\nmetrics related to vascular morphology and depth information of PWS lesions.\nThe five PWS types present significant differences across all metrics compared\nto the conventional subtypes. Our findings suggest that an angiopathology-based\nclassification accurately reflects the heterogeneity in PWS lesions. This\nresearch marks the first attempt to classify PWS based on angiopathology,\npotentially guiding more effective subtyping and treatment strategies for PWS.\n","authors":["Xiaofeng Deng","Defu Chen","Bowen Liu","Xiwan Zhang","Haixia Qiu","Wu Yuan","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2408.16277v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2408.14400v2","updated":"2024-08-29T05:37:38Z","published":"2024-08-26T16:34:13Z","title":"Satellite Sunroof: High-res Digital Surface Models and Roof Segmentation\n for Global Solar Mapping","summary":" The transition to renewable energy, particularly solar, is key to mitigating\nclimate change. Google's Solar API aids this transition by estimating solar\npotential from aerial imagery, but its impact is constrained by geographical\ncoverage. This paper proposes expanding the API's reach using satellite\nimagery, enabling global solar potential assessment. We tackle challenges\ninvolved in building a Digital Surface Model (DSM) and roof instance\nsegmentation from lower resolution and single oblique views using deep learning\nmodels. Our models, trained on aligned satellite and aerial datasets, produce\n25cm DSMs and roof segments. With ~1m DSM MAE on buildings, ~5deg roof pitch\nerror and ~56% IOU on roof segmentation, they significantly enhance the Solar\nAPI's potential to promote solar adoption.\n","authors":["Vishal Batchu","Alex Wilson","Betty Peng","Carl Elkin","Umangi Jain","Christopher Van Arsdale","Ross Goroshin","Varun Gulshan"],"pdf_url":"https://arxiv.org/pdf/2408.14400v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.16273v1","updated":"2024-08-29T05:33:59Z","published":"2024-08-29T05:33:59Z","title":"SAU: A Dual-Branch Network to Enhance Long-Tailed Recognition via\n Generative Models","summary":" Long-tailed distributions in image recognition pose a considerable challenge\ndue to the severe imbalance between a few dominant classes with numerous\nexamples and many minority classes with few samples. Recently, the use of large\ngenerative models to create synthetic data for image classification has been\nrealized, but utilizing synthetic data to address the challenge of long-tailed\nrecognition remains relatively unexplored. In this work, we proposed the use of\nsynthetic data as a complement to long-tailed datasets to eliminate the impact\nof data imbalance. To tackle this real-synthetic mixed dataset, we designed a\ntwo-branch model that contains Synthetic-Aware and Unaware branches (SAU). 
The\ncore ideas are (1) a synthetic-unaware branch for classification that mixes\nreal and synthetic data and treats all data equally without distinguishing\nbetween them. (2) A synthetic-aware branch for improving the robustness of the\nfeature extractor by distinguishing between real and synthetic data and\nlearning their discrepancies. Extensive experimental results demonstrate that\nour method can improve the accuracy of long-tailed image recognition. Notably,\nour approach achieves state-of-the-art Top-1 accuracy and significantly\nsurpasses other methods on CIFAR-10-LT and CIFAR-100-LT datasets across various\nimbalance factors. Our code is available at https://github.com/lgX1123/gm4lt.\n","authors":["Guangxi Li","Yinsheng Song","Mingkai Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.16273v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2408.16272v1","updated":"2024-08-29T05:32:03Z","published":"2024-08-29T05:32:03Z","title":"Beyond Uncertainty: Evidential Deep Learning for Robust Video Temporal\n Grounding","summary":" Existing Video Temporal Grounding (VTG) models excel in accuracy but often\noverlook open-world challenges posed by open-vocabulary queries and untrimmed\nvideos. This leads to unreliable predictions for noisy, corrupted, and\nout-of-distribution data. Adapting VTG models to dynamically estimate\nuncertainties based on user input can address this issue. To this end, we\nintroduce SRAM, a robust network module that benefits from a two-stage\ncross-modal alignment task. More importantly, it integrates Deep Evidential\nRegression (DER) to explicitly and thoroughly quantify uncertainty during\ntraining, thus allowing the model to say \"I do not know\" in scenarios beyond\nits handling capacity. However, the direct application of traditional DER\ntheory and its regularizer reveals structural flaws, leading to unintended\nconstraints in VTG tasks. In response, we develop a simple yet effective\nGeom-regularizer that enhances the uncertainty learning framework from the\nground up. To the best of our knowledge, this marks the first successful\nattempt of DER in VTG. Our extensive quantitative and qualitative results\naffirm the effectiveness, robustness, and interpretability of our modules and\nthe uncertainty learning paradigm in VTG tasks. The code will be made\navailable.\n","authors":["Kaijing Ma","Haojian Huang","Jin Chen","Haodong Chen","Pengliang Ji","Xianghao Zang","Han Fang","Chao Ban","Hao Sun","Mulin Chen","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2408.16272v1.pdf","comment":"Ongoing work: 28pages, 19 figures, 7 tables. Code is available at:\n https://kaijing.space/SRAM/"},{"id":"http://arxiv.org/abs/2408.16268v1","updated":"2024-08-29T05:13:01Z","published":"2024-08-29T05:13:01Z","title":"UDD: Dataset Distillation via Mining Underutilized Regions","summary":" Dataset distillation synthesizes a small dataset such that a model trained on\nthis set approximates the performance of the original dataset. Recent studies\non dataset distillation focused primarily on the design of the optimization\nprocess, with methods such as gradient matching, feature alignment, and\ntraining trajectory matching. However, little attention has been given to the\nissue of underutilized regions in synthetic images. In this paper, we propose\nUDD, a novel approach to identify and exploit the underutilized regions to make\nthem informative and discriminate, and thus improve the utilization of the\nsynthetic dataset. 
Technically, UDD involves two underutilized regions\nsearching policies for different conditions, i.e., response-based policy and\ndata jittering-based policy. Compared with previous works, such two policies\nare utilization-sensitive, equipping with the ability to dynamically adjust the\nunderutilized regions during the training process. Additionally, we analyze the\ncurrent model optimization problem and design a category-wise feature\ncontrastive loss, which can enhance the distinguishability of different\ncategories and alleviate the shortcomings of the existing multi-formation\nmethods. Experimentally, our method improves the utilization of the synthetic\ndataset and outperforms the state-of-the-art methods on various datasets, such\nas MNIST, FashionMNIST, SVHN, CIFAR-10, and CIFAR-100. For example, the\nimprovements on CIFAR-10 and CIFAR-100 are 4.0\\% and 3.7\\% over the next best\nmethod with IPC=1, by mining the underutilized regions.\n","authors":["Shiguang Wang","Zhongyu Zhang","Jian Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.16268v1.pdf","comment":"PRCV2024"},{"id":"http://arxiv.org/abs/2408.16266v1","updated":"2024-08-29T05:05:02Z","published":"2024-08-29T05:05:02Z","title":"Improving Diffusion-based Data Augmentation with Inversion Spherical\n Interpolation","summary":" Data Augmentation (DA), \\ie, synthesizing faithful and diverse samples to\nexpand the original training set, is a prevalent and effective strategy to\nimprove various visual recognition tasks. With the powerful image generation\nability, diffusion-based DA has shown strong performance gains on different\nbenchmarks. In this paper, we analyze today's diffusion-based DA methods, and\nargue that they cannot take account of both faithfulness and diversity, which\nare two critical keys for generating high-quality samples and boosting final\nclassification performance. To this end, we propose a novel Diffusion-based\nInversion Interpolation DA method: Diff-II. Specifically, Diff-II consists of\nthree main steps: 1) Category concepts learning: Learning concept embeddings\nfor each category. 2) Inversion interpolation: Calculating the inversion for\neach image, and conducting spherical interpolation for two randomly sampled\ninversions from the same category. 3) Two-stage denoising: Using different\nprompts to generate synthesized images in a coarse-to-fine manner. Extensive\nexperiments on multiple image classification tasks (\\eg, few-shot, long-tailed,\nand out-of-distribution classification) have demonstrated its effectiveness\nover state-of-the-art diffusion-based DA methods.\n","authors":["Yanghao Wang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2408.16266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16265v1","updated":"2024-08-29T05:04:25Z","published":"2024-08-29T05:04:25Z","title":"Low Saturation Confidence Distribution-based Test-Time Adaptation for\n Cross-Domain Remote Sensing Image Classification","summary":" Although the Unsupervised Domain Adaptation (UDA) method has improved the\neffect of remote sensing image classification tasks, most of them are still\nlimited by access to the source domain (SD) data. Designs such as Source-free\nDomain Adaptation (SFDA) solve the challenge of a lack of SD data, however,\nthey still rely on a large amount of target domain data and thus cannot achieve\nfast adaptations, which seriously hinders their further application in broader\nscenarios. 
The real-world applications of cross-domain remote sensing image\nclassification require a balance of speed and accuracy at the same time.\nTherefore, we propose a novel and comprehensive test time adaptation (TTA)\nmethod -- Low Saturation Confidence Distribution Test Time Adaptation\n(LSCD-TTA), which is the first attempt to solve such scenarios through the idea\nof TTA. LSCD-TTA specifically considers the distribution characteristics of\nremote sensing images, including three main parts that concentrate on different\noptimization directions: First, low saturation distribution (LSD) considers the\ndominance of low-confidence samples during the later TTA stage. Second,\nweak-category cross-entropy (WCCE) increases the weight of categories that are\nmore difficult to classify with less prior knowledge. Finally, diverse\ncategories confidence (DIV) comprehensively considers the category diversity to\nalleviate the deviation of the sample distribution. By weighting the\nabovementioned three modules, the model can widely, quickly and accurately\nadapt to the target domain without much prior target distributions, repeated\ndata access, and manual annotation. We evaluate LSCD-TTA on three\nremote-sensing image datasets. The experimental results show that LSCD-TTA\nachieves a significant gain of 4.96%-10.51% with Resnet-50 and 5.33%-12.49%\nwith Resnet-101 in average accuracy compared to other state-of-the-art DA and\nTTA methods.\n","authors":["Yu Liang","Xiucheng Zhang","Juepeng Zheng","Jianxi Huang","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.16265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16258v1","updated":"2024-08-29T04:40:31Z","published":"2024-08-29T04:40:31Z","title":"Advancing Architectural Floorplan Design with Geometry-enhanced Graph\n Diffusion","summary":" Automating architectural floorplan design is vital for housing and interior\ndesign, offering a faster, cost-effective alternative to manual sketches by\narchitects. However, existing methods, including rule-based and learning-based\napproaches, face challenges in design complexity and constrained generation\nwith extensive post-processing, and tend to obvious geometric inconsistencies\nsuch as misalignment, overlap, and gaps. In this work, we propose a novel\ngenerative framework for vector floorplan design via structural graph\ngeneration, called GSDiff, focusing on wall junction generation and wall\nsegment prediction to capture both geometric and semantic aspects of structural\ngraphs. To improve the geometric rationality of generated structural graphs, we\npropose two innovative geometry enhancement methods. In wall junction\ngeneration, we propose a novel alignment loss function to improve geometric\nconsistency. In wall segment prediction, we propose a random self-supervision\nmethod to enhance the model's perception of the overall geometric structure,\nthereby promoting the generation of reasonable geometric structures. Employing\nthe diffusion model and the Transformer model, as well as the geometry\nenhancement strategies, our framework can generate wall junctions, wall\nsegments and room polygons with structural and semantic information, resulting\nin structural graphs that accurately represent floorplans. 
Extensive\nexperiments show that the proposed method surpasses existing techniques,\nenabling free generation and constrained generation, marking a shift towards\nstructure generation in architectural design.\n","authors":["Sizhe Hu","Wenming Wu","Yuntao Wang","Benzhu Xu","Liping Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.16258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16254v1","updated":"2024-08-29T04:30:31Z","published":"2024-08-29T04:30:31Z","title":"EvLight++: Low-Light Video Enhancement with an Event Camera: A\n Large-Scale Real-World Dataset, Novel Method, and More","summary":" Event cameras offer significant advantages for low-light video enhancement,\nprimarily due to their high dynamic range. Current research, however, is\nseverely limited by the absence of large-scale, real-world, and\nspatio-temporally aligned event-video datasets. To address this, we introduce a\nlarge-scale dataset with over 30,000 pairs of frames and events captured under\nvarying illumination. This dataset was curated using a robotic arm that traces\na consistent non-linear trajectory, achieving spatial alignment precision under\n0.03mm and temporal alignment with errors under 0.01s for 90% of the dataset.\nBased on the dataset, we propose \\textbf{EvLight++}, a novel event-guided\nlow-light video enhancement approach designed for robust performance in\nreal-world scenarios. Firstly, we design a multi-scale holistic fusion branch\nto integrate structural and textural information from both images and events.\nTo counteract variations in regional illumination and noise, we introduce\nSignal-to-Noise Ratio (SNR)-guided regional feature selection, enhancing\nfeatures from high SNR regions and augmenting those from low SNR regions by\nextracting structural information from events. To incorporate temporal\ninformation and ensure temporal coherence, we further introduce a recurrent\nmodule and temporal loss in the whole pipeline. Extensive experiments on our\nand the synthetic SDSD dataset demonstrate that EvLight++ significantly\noutperforms both single image- and video-based methods by 1.37 dB and 3.71 dB,\nrespectively. To further explore its potential in downstream tasks like\nsemantic segmentation and monocular depth estimation, we extend our datasets by\nadding pseudo segmentation and depth labels via meticulous annotation efforts\nwith foundation models. Experiments under diverse low-light scenes show that\nthe enhanced results achieve a 15.97% improvement in mIoU for semantic\nsegmentation.\n","authors":["Kanghao Chen","Guoqiang Liang","Hangyu Li","Yunfan Lu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16254v1.pdf","comment":"Journal extension based on EvLight (arXiv:2404.00834)"},{"id":"http://arxiv.org/abs/2408.16247v1","updated":"2024-08-29T03:58:21Z","published":"2024-08-29T03:58:21Z","title":"Anno-incomplete Multi-dataset Detection","summary":" Object detectors have shown outstanding performance on various public\ndatasets. However, annotating a new dataset for a new task is usually\nunavoidable in real, since 1) a single existing dataset usually does not\ncontain all object categories needed; 2) using multiple datasets usually\nsuffers from annotation incompletion and heterogeneous features. We propose a\nnovel problem as \"Annotation-incomplete Multi-dataset Detection\", and develop\nan end-to-end multi-task learning architecture which can accurately detect all\nthe object categories with multiple partially annotated datasets. 
Specifically,\nwe propose an attention feature extractor which helps to mine the relations\namong different datasets. Besides, a knowledge amalgamation training strategy\nis incorporated to accommodate heterogeneous features from different sources.\nExtensive experiments on different object detection datasets demonstrate the\neffectiveness of our methods and an improvement of 2.17%, 2.10% in mAP can be\nachieved on COCO and VOC respectively.\n","authors":["Yiran Xu","Haoxiang Zhong","Kai Wu","Jialin Li","Yong Liu","Chengjie Wang","Shu-Tao Xia","Hongen Liao"],"pdf_url":"https://arxiv.org/pdf/2408.16247v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.15643v2","updated":"2024-08-29T03:47:04Z","published":"2024-08-28T08:53:33Z","title":"RIDE: Boosting 3D Object Detection for LiDAR Point Clouds via\n Rotation-Invariant Analysis","summary":" The rotation robustness property has drawn much attention to point cloud\nanalysis, whereas it still poses a critical challenge in 3D object detection.\nWhen subjected to arbitrary rotation, most existing detectors fail to produce\nexpected outputs due to the poor rotation robustness. In this paper, we present\nRIDE, a pioneering exploration of Rotation-Invariance for the 3D\nLiDAR-point-based object DEtector, with the key idea of designing\nrotation-invariant features from LiDAR scenes and then effectively\nincorporating them into existing 3D detectors. Specifically, we design a\nbi-feature extractor that extracts (i) object-aware features though sensitive\nto rotation but preserve geometry well, and (ii) rotation-invariant features,\nwhich lose geometric information to a certain extent but are robust to\nrotation. These two kinds of features complement each other to decode 3D\nproposals that are robust to arbitrary rotations. Particularly, our RIDE is\ncompatible and easy to plug into the existing one-stage and two-stage 3D\ndetectors, and boosts both detection performance and rotation robustness.\nExtensive experiments on the standard benchmarks showcase that the mean average\nprecision (mAP) and rotation robustness can be significantly boosted by\nintegrating with our RIDE, with +5.6% mAP and 53% rotation robustness\nimprovement on KITTI, +5.1% and 28% improvement correspondingly on nuScenes.\nThe code will be available soon.\n","authors":["Zhaoxuan Wang","Xu Han","Hongxin Liu","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2408.15643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16236v1","updated":"2024-08-29T03:26:14Z","published":"2024-08-29T03:26:14Z","title":"Neural Spectral Decomposition for Dataset Distillation","summary":" In this paper, we propose Neural Spectrum Decomposition, a generic\ndecomposition framework for dataset distillation. Unlike previous methods, we\nconsider the entire dataset as a high-dimensional observation that is low-rank\nacross all dimensions. We aim to discover the low-rank representation of the\nentire dataset and perform distillation efficiently. Toward this end, we learn\na set of spectrum tensors and transformation matrices, which, through simple\nmatrix multiplication, reconstruct the data distribution. Specifically, a\nspectrum tensor can be mapped back to the image space by a transformation\nmatrix, and efficient information sharing during the distillation learning\nprocess is achieved through pairwise combinations of different spectrum vectors\nand transformation matrices. Furthermore, we integrate a trajectory matching\noptimization method guided by a real distribution. 
Our experimental results\ndemonstrate that our approach achieves state-of-the-art performance on\nbenchmarks, including CIFAR10, CIFAR100, Tiny Imagenet, and ImageNet Subset.\nOur code is available at \\url{https://github.com/slyang2021/NSD}.\n","authors":["Shaolei Yang","Shen Cheng","Mingbo Hong","Haoqiang Fan","Xing Wei","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16236v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2311.01673v2","updated":"2024-08-29T03:25:04Z","published":"2023-11-03T02:43:51Z","title":"Content Significance Distribution of Sub-Text Blocks in Articles and Its\n Application to Article-Organization Assessment","summary":" We explore how to capture the significance of a sub-text block in an article\nand how it may be used for text mining tasks. A sub-text block is a\nsub-sequence of sentences in the article. We formulate the notion of content\nsignificance distribution (CSD) of sub-text blocks, referred to as CSD of the\nfirst kind and denoted by CSD-1. In particular, we leverage Hugging Face's\nSentenceTransformer to generate contextual sentence embeddings, and use\nMoverScore over text embeddings to measure how similar a sub-text block is to\nthe entire text. To overcome the exponential blowup in the number of sub-text\nblocks, we present an approximation algorithm and show that the approximated\nCSD-1 is almost identical to the exact CSD-1. Under this approximation, we show\nthat the average and median CSD-1's for news, scholarly research, argument, and\nnarrative articles share the same pattern. We also show that under a certain\nlinear transformation, the complement of the cumulative distribution function\nof the beta distribution with certain values of $\\alpha$ and $\\beta$ resembles\na CSD-1 curve. We then use CSD-1's to extract linguistic features to train an\nSVC classifier for assessing how well an article is organized. Through\nexperiments, we show that this method achieves high accuracy for assessing\nstudent essays. Moreover, we study CSD of sentence locations, referred to as\nCSD of the second kind and denoted by CSD-2, and show that average CSD-2's for\ndifferent types of articles possess distinctive patterns, which either conform\nto common perceptions of article structures or provide rectification with minor\ndeviation.\n","authors":["You Zhou","Jie Wang"],"pdf_url":"https://arxiv.org/pdf/2311.01673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16235v1","updated":"2024-08-29T03:23:51Z","published":"2024-08-29T03:23:51Z","title":"LMT-GP: Combined Latent Mean-Teacher and Gaussian Process for\n Semi-supervised Low-light Image Enhancement","summary":" While recent low-light image enhancement (LLIE) methods have made significant\nadvancements, they still face challenges in terms of low visual quality and\nweak generalization ability when applied to complex scenarios. To address these\nissues, we propose a semi-supervised method based on latent mean-teacher and\nGaussian process, named LMT-GP. We first design a latent mean-teacher framework\nthat integrates both labeled and unlabeled data, as well as their latent\nvectors, into model training. Meanwhile, we use a mean-teacher-assisted\nGaussian process learning strategy to establish a connection between the latent\nand pseudo-latent vectors obtained from the labeled and unlabeled data. To\nguide the learning process, we utilize an assisted Gaussian process regression\n(GPR) loss function. 
Furthermore, we design a pseudo-label adaptation module\n(PAM) to ensure the reliability of the network learning. To demonstrate our\nmethod's generalization ability and effectiveness, we apply it to multiple LLIE\ndatasets and high-level vision tasks. Experiment results demonstrate that our\nmethod achieves high generalization performance and image quality. The code is\navailable at https://github.com/HFUT-CV/LMT-GP.\n","authors":["Ye Yu","Fengxin Chen","Jun Yu","Zhen Kan"],"pdf_url":"https://arxiv.org/pdf/2408.16235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16233v1","updated":"2024-08-29T03:20:43Z","published":"2024-08-29T03:20:43Z","title":"PSE-Net: Channel Pruning for Convolutional Neural Networks with\n Parallel-subnets Estimator","summary":" Channel Pruning is one of the most widespread techniques used to compress\ndeep neural networks while maintaining their performances. Currently, a typical\npruning algorithm leverages neural architecture search to directly find\nnetworks with a configurable width, the key step of which is to identify\nrepresentative subnet for various pruning ratios by training a supernet.\nHowever, current methods mainly follow a serial training strategy to optimize\nsupernet, which is very time-consuming. In this work, we introduce PSE-Net, a\nnovel parallel-subnets estimator for efficient channel pruning. Specifically,\nwe propose a parallel-subnets training algorithm that simulate the\nforward-backward pass of multiple subnets by droping extraneous features on\nbatch dimension, thus various subnets could be trained in one round. Our\nproposed algorithm facilitates the efficiency of supernet training and equips\nthe network with the ability to interpolate the accuracy of unsampled subnets,\nenabling PSE-Net to effectively evaluate and rank the subnets. Over the trained\nsupernet, we develop a prior-distributed-based sampling algorithm to boost the\nperformance of classical evolutionary search. Such algorithm utilizes the prior\ninformation of supernet training phase to assist in the search of optimal\nsubnets while tackling the challenge of discovering samples that satisfy\nresource constraints due to the long-tail distribution of network\nconfiguration. Extensive experiments demonstrate PSE-Net outperforms previous\nstate-of-the-art channel pruning methods on the ImageNet dataset while\nretaining superior supernet training efficiency. For example, under 300M FLOPs\nconstraint, our pruned MobileNetV2 achieves 75.2% Top-1 accuracy on ImageNet\ndataset, exceeding the original MobileNetV2 by 2.6 units while only cost\n30%/16% times than BCNet/AutoAlim.\n","authors":["Shiguang Wang","Tao Xie","Haijun Liu","Xingcheng Zhang","Jian Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.16233v1.pdf","comment":"10pages, Neural Networks"},{"id":"http://arxiv.org/abs/2408.16232v1","updated":"2024-08-29T03:12:04Z","published":"2024-08-29T03:12:04Z","title":"Enhancing Conditional Image Generation with Explainable Latent Space\n Manipulation","summary":" In the realm of image synthesis, achieving fidelity to a reference image\nwhile adhering to conditional prompts remains a significant challenge. This\npaper proposes a novel approach that integrates a diffusion model with latent\nspace manipulation and gradient-based selective attention mechanisms to address\nthis issue. 
Leveraging Grad-SAM (Gradient-based Selective Attention\nManipulation), we analyze the cross attention maps of the cross attention\nlayers and gradients for the denoised latent vector, deriving importance scores\nof elements of denoised latent vector related to the subject of interest. Using\nthis information, we create masks at specific timesteps during denoising to\npreserve subjects while seamlessly integrating the reference image features.\nThis approach ensures the faithful formation of subjects based on conditional\nprompts, while concurrently refining the background for a more coherent\ncomposition. Our experiments on places365 dataset demonstrate promising\nresults, with our proposed model achieving the lowest mean and median Frechet\nInception Distance (FID) scores compared to baseline models, indicating\nsuperior fidelity preservation. Furthermore, our model exhibits competitive\nperformance in aligning the generated images with provided textual\ndescriptions, as evidenced by high CLIP scores. These results highlight the\neffectiveness of our approach in both fidelity preservation and textual context\npreservation, offering a significant advancement in text-to-image synthesis\ntasks.\n","authors":["Kshitij Pathania"],"pdf_url":"https://arxiv.org/pdf/2408.16232v1.pdf","comment":"7 pages , 5 figures"},{"id":"http://arxiv.org/abs/2311.13385v4","updated":"2024-08-29T03:11:14Z","published":"2023-11-22T13:27:36Z","title":"SegVol: Universal and Interactive Volumetric Medical Image Segmentation","summary":" Precise image segmentation provides clinical study with instructive\ninformation. Despite the remarkable progress achieved in medical image\nsegmentation, there is still an absence of a 3D foundation segmentation model\nthat can segment a wide range of anatomical categories with easy user\ninteraction. In this paper, we propose a 3D foundation segmentation model,\nnamed SegVol, supporting universal and interactive volumetric medical image\nsegmentation. By scaling up training data to 90K unlabeled Computed Tomography\n(CT) volumes and 6K labeled CT volumes, this foundation model supports the\nsegmentation of over 200 anatomical categories using semantic and spatial\nprompts. To facilitate efficient and precise inference on volumetric images, we\ndesign a zoom-out-zoom-in mechanism. Extensive experiments on 22 anatomical\nsegmentation tasks verify that SegVol outperforms the competitors in 19 tasks,\nwith improvements up to 37.24% compared to the runner-up methods. We\ndemonstrate the effectiveness and importance of specific designs by ablation\nstudy. We expect this foundation model can promote the development of\nvolumetric medical image analysis. The model and code are publicly available\nat: https://github.com/BAAI-DCAI/SegVol.\n","authors":["Yuxin Du","Fan Bai","Tiejun Huang","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.13385v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02139v3","updated":"2024-08-29T03:09:40Z","published":"2023-12-04T18:57:01Z","title":"DiffiT: Diffusion Vision Transformers for Image Generation","summary":" Diffusion models with their powerful expressivity and high sample quality\nhave achieved State-Of-The-Art (SOTA) performance in the generative domain. The\npioneering Vision Transformer (ViT) has also demonstrated strong modeling\ncapabilities and scalability, especially for recognition tasks. 
In this paper,\nwe study the effectiveness of ViTs in diffusion-based generative learning and\npropose a new model denoted as Diffusion Vision Transformers (DiffiT).\nSpecifically, we propose a methodology for finegrained control of the denoising\nprocess and introduce the Time-dependant Multihead Self Attention (TMSA)\nmechanism. DiffiT is surprisingly effective in generating high-fidelity images\nwith significantly better parameter efficiency. We also propose latent and\nimage space DiffiT models and show SOTA performance on a variety of\nclass-conditional and unconditional synthesis tasks at different resolutions.\nThe Latent DiffiT model achieves a new SOTA FID score of 1.73 on ImageNet256\ndataset while having 19.85%, 16.88% less parameters than other\nTransformer-based diffusion models such as MDT and DiT,respectively. Code:\nhttps://github.com/NVlabs/DiffiT\n","authors":["Ali Hatamizadeh","Jiaming Song","Guilin Liu","Jan Kautz","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2312.02139v3.pdf","comment":"Accepted to ECCV'24"},{"id":"http://arxiv.org/abs/2408.16227v1","updated":"2024-08-29T02:58:35Z","published":"2024-08-29T02:58:35Z","title":"Revisiting 360 Depth Estimation with PanoGabor: A New Fusion Perspective","summary":" Depth estimation from a monocular 360 image is important to the perception of\nthe entire 3D environment. However, the inherent distortion and large field of\nview (FoV) in 360 images pose great challenges for this task. To this end,\nexisting mainstream solutions typically introduce additional perspective-based\n360 representations (\\textit{e.g.}, Cubemap) to achieve effective feature\nextraction. Nevertheless, regardless of the introduced representations, they\neventually need to be unified into the equirectangular projection (ERP) format\nfor the subsequent depth estimation, which inevitably reintroduces the\ntroublesome distortions. In this work, we propose an oriented distortion-aware\nGabor Fusion framework (PGFuse) to address the above challenges. First, we\nintroduce Gabor filters that analyze texture in the frequency domain, thereby\nextending the receptive fields and enhancing depth cues. To address the\nreintroduced distortions, we design a linear latitude-aware distortion\nrepresentation method to generate customized, distortion-aware Gabor filters\n(PanoGabor filters). Furthermore, we design a channel-wise and spatial-wise\nunidirectional fusion module (CS-UFM) that integrates the proposed PanoGabor\nfilters to unify other representations into the ERP format, delivering\neffective and distortion-free features. Considering the orientation sensitivity\nof the Gabor transform, we introduce a spherical gradient constraint to\nstabilize this sensitivity. Experimental results on three popular indoor 360\nbenchmarks demonstrate the superiority of the proposed PGFuse to existing\nstate-of-the-art solutions. Code can be available upon acceptance.\n","authors":["Zhijie Shen","Chunyu Lin","Lang Nie","Kang Liao"],"pdf_url":"https://arxiv.org/pdf/2408.16227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13520v2","updated":"2024-08-29T02:52:46Z","published":"2024-07-18T13:55:54Z","title":"EaDeblur-GS: Event assisted 3D Deblur Reconstruction with Gaussian\n Splatting","summary":" 3D deblurring reconstruction techniques have recently seen significant\nadvancements with the development of Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS). 
Although these techniques can recover relatively\nclear 3D reconstructions from blurry image inputs, they still face limitations\nin handling severe blurring and complex camera motion. To address these issues,\nwe propose Event-assisted 3D Deblur Reconstruction with Gaussian Splatting\n(EaDeblur-GS), which integrates event camera data to enhance the robustness of\n3DGS against motion blur. By employing an Adaptive Deviation Estimator (ADE)\nnetwork to estimate Gaussian center deviations and using novel loss functions,\nEaDeblur-GS achieves sharp 3D reconstructions in real-time, demonstrating\nperformance comparable to state-of-the-art methods.\n","authors":["Yuchen Weng","Zhengwen Shen","Ruofan Chen","Qi Wang","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.13520v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16224v1","updated":"2024-08-29T02:43:20Z","published":"2024-08-29T02:43:20Z","title":"LLaVA-SG: Leveraging Scene Graphs as Visual Semantic Expression in\n Vision-Language Models","summary":" Recent advances in large vision-language models (VLMs) typically employ\nvision encoders based on the Vision Transformer (ViT) architecture. The\ndivision of the images into patches by ViT results in a fragmented perception,\nthereby hindering the visual understanding capabilities of VLMs. In this paper,\nwe propose an innovative enhancement to address this limitation by introducing\na Scene Graph Expression (SGE) module in VLMs. This module extracts and\nstructurally expresses the complex semantic information within images, thereby\nimproving the foundational perception and understanding abilities of VLMs.\nExtensive experiments demonstrate that integrating our SGE module significantly\nenhances the VLM's performance in vision-language tasks, indicating its\neffectiveness in preserving intricate semantic details and facilitating better\nvisual understanding. Code and data would be available.\n","authors":["Jingyi Wang","Jianzhong Ju","Jian Luan","Zhidong Deng"],"pdf_url":"https://arxiv.org/pdf/2408.16224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10121v3","updated":"2024-08-29T02:35:21Z","published":"2023-09-18T19:49:22Z","title":"Pre-training on Synthetic Driving Data for Trajectory Prediction","summary":" Accumulating substantial volumes of real-world driving data proves pivotal in\nthe realm of trajectory forecasting for autonomous driving. Given the heavy\nreliance of current trajectory forecasting models on data-driven methodologies,\nwe aim to tackle the challenge of learning general trajectory forecasting\nrepresentations under limited data availability. We propose a pipeline-level\nsolution to mitigate the issue of data scarcity in trajectory forecasting. The\nsolution is composed of two parts: firstly, we adopt HD map augmentation and\ntrajectory synthesis for generating driving data, and then we learn\nrepresentations by pre-training on them. Specifically, we apply vector\ntransformations to reshape the maps, and then employ a rule-based model to\ngenerate trajectories on both original and augmented scenes; thus enlarging the\ndriving data without collecting additional real ones. To foster the learning of\ngeneral representations within this augmented dataset, we comprehensively\nexplore the different pre-training strategies, including extending the concept\nof a Masked AutoEncoder (MAE) for trajectory forecasting. 
Without bells and\nwhistles, our proposed pipeline-level solution is general, simple, yet\neffective: we conduct extensive experiments to demonstrate the effectiveness of\nour data expansion and pre-training strategies, which outperform the baseline\nprediction model by large margins, e.g. 5.04%, 3.84% and 8.30% in terms of\n$MR_6$, $minADE_6$ and $minFDE_6$. The pre-training dataset and the codes for\npre-training and fine-tuning are released at\nhttps://github.com/yhli123/Pretraining_on_Synthetic_Driving_Data_for_Trajectory_Prediction.\n","authors":["Yiheng Li","Seth Z. Zhao","Chenfeng Xu","Chen Tang","Chenran Li","Mingyu Ding","Masayoshi Tomizuka","Wei Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.10121v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16219v1","updated":"2024-08-29T02:25:12Z","published":"2024-08-29T02:25:12Z","title":"Training-free Video Temporal Grounding using Large-scale Pre-trained\n Models","summary":" Video temporal grounding aims to identify video segments within untrimmed\nvideos that are most relevant to a given natural language query. Existing video\ntemporal localization models rely on specific datasets for training and have\nhigh data collection costs, but they exhibit poor generalization capability\nunder the across-dataset and out-of-distribution (OOD) settings. In this paper,\nwe propose a Training-Free Video Temporal Grounding (TFVTG) approach that\nleverages the ability of pre-trained large models. A naive baseline is to\nenumerate proposals in the video and use the pre-trained visual language models\n(VLMs) to select the best proposal according to the vision-language alignment.\nHowever, most existing VLMs are trained on image-text pairs or trimmed video\nclip-text pairs, making it struggle to (1) grasp the relationship and\ndistinguish the temporal boundaries of multiple events within the same video;\n(2) comprehend and be sensitive to the dynamic transition of events (the\ntransition from one event to another) in the video. To address these issues, we\npropose leveraging large language models (LLMs) to analyze multiple sub-events\ncontained in the query text and analyze the temporal order and relationships\nbetween these events. Secondly, we split a sub-event into dynamic transition\nand static status parts and propose the dynamic and static scoring functions\nusing VLMs to better evaluate the relevance between the event and the\ndescription. Finally, for each sub-event description, we use VLMs to locate the\ntop-k proposals and leverage the order and relationships between sub-events\nprovided by LLMs to filter and integrate these proposals. Our method achieves\nthe best performance on zero-shot video temporal grounding on Charades-STA and\nActivityNet Captions datasets without any training and demonstrates better\ngeneralization capabilities in cross-dataset and OOD settings.\n","authors":["Minghang Zheng","Xinhao Cai","Qingchao Chen","Yuxin Peng","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16219v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2305.18680v2","updated":"2024-08-29T02:22:08Z","published":"2023-05-30T01:38:54Z","title":"Improving Deep Representation Learning via Auxiliary Learnable Target\n Coding","summary":" Deep representation learning is a subfield of machine learning that focuses\non learning meaningful and useful representations of data through deep neural\nnetworks. 
However, existing methods for semantic classification typically\nemploy pre-defined target codes such as the one-hot and the Hadamard codes,\nwhich can either fail or be less flexible to model inter-class correlation. In\nlight of this, this paper introduces a novel learnable target coding as an\nauxiliary regularization of deep representation learning, which can not only\nincorporate latent dependency across classes but also impose geometric\nproperties of target codes into representation space. Specifically, a\nmargin-based triplet loss and a correlation consistency loss on the proposed\ntarget codes are designed to encourage more discriminative representations\nowing to enlarging between-class margins in representation space and favoring\nequal semantic correlation of learnable target codes respectively. Experimental\nresults on several popular visual classification and retrieval benchmarks can\ndemonstrate the effectiveness of our method on improving representation\nlearning, especially for imbalanced data. Source codes are made publicly\navailable at\n\\href{https://github.com/AkonLau/LTC}{https://github.com/AkonLau/LTC}.\n","authors":["Kangjun Liu","Ke Chen","Kui Jia","Yaowei Wang"],"pdf_url":"https://arxiv.org/pdf/2305.18680v2.pdf","comment":"Accepted by Pattern Recognition, 33 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2408.15829v2","updated":"2024-08-29T02:16:02Z","published":"2024-08-28T14:44:42Z","title":"SITransformer: Shared Information-Guided Transformer for Extreme\n Multimodal Summarization","summary":" Extreme Multimodal Summarization with Multimodal Output (XMSMO) becomes an\nattractive summarization approach by integrating various types of information\nto create extremely concise yet informative summaries for individual\nmodalities. Existing methods overlook the issue that multimodal data often\ncontains more topic irrelevant information, which can mislead the model into\nproducing inaccurate summaries especially for extremely short ones. In this\npaper, we propose SITransformer, a Shared Information-guided Transformer for\nextreme multimodal summarization. It has a shared information guided pipeline\nwhich involves a cross-modal shared information extractor and a cross-modal\ninteraction module. The extractor formulates semantically shared salient\ninformation from different modalities by devising a novel filtering process\nconsisting of a differentiable top-k selector and a shared-information guided\ngating unit. As a result, the common, salient, and relevant contents across\nmodalities are identified. Next, a transformer with cross-modal attentions is\ndeveloped for intra- and inter-modality learning with the shared information\nguidance to produce the extreme summary. Comprehensive experiments demonstrate\nthat SITransformer significantly enhances the summarization quality for both\nvideo and text summaries for XMSMO. 
Our code will be publicly available at\nhttps://github.com/SichengLeoLiu/MMAsia24-XMSMO.\n","authors":["Sicheng Liu","Lintao Wang","Xiaogan Zhu","Xuequan Lu","Zhiyong Wang","Kun Hu"],"pdf_url":"https://arxiv.org/pdf/2408.15829v2.pdf","comment":"8 pages, 5 figures, submitted to ACM Multimedia Asia 2024"},{"id":"http://arxiv.org/abs/2403.11541v2","updated":"2024-08-29T02:13:09Z","published":"2024-03-18T07:51:22Z","title":"Hierarchical Spatial Proximity Reasoning for Vision-and-Language\n Navigation","summary":" Most Vision-and-Language Navigation (VLN) algorithms are prone to making\ndecision due to a lack of visual common sense and insufficient reasoning\ncapabilities. To address this issue, we propose a Hierarchical Spatial\nProximity Reasoning (HSPR) method. First, we introduce a scene understanding\nauxiliary task to help the agent build a knowledge base of hierarchical spatial\nproximity. This task utilizes panoramic views and object features to identify\ntypes of nodes and uncover the adjacency relationships between nodes, objects,\nand between nodes and objects. Second, we propose a multi-step reasoning\nnavigation algorithm based on hierarchical spatial proximity knowledge base,\nwhich continuously plans feasible paths to enhance exploration efficiency.\nThird, we introduce a residual fusion method to improve navigation decision\naccuracy. Finally, we validate our approach with experiments on publicly\navailable datasets including REVERIE, SOON, R2R, and R4R. Our code is available\nat https://github.com/iCityLab/HSPR.\n","authors":["Ming Xu","Zilong Xie"],"pdf_url":"https://arxiv.org/pdf/2403.11541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16213v1","updated":"2024-08-29T02:12:58Z","published":"2024-08-29T02:12:58Z","title":"M4CXR: Exploring Multi-task Potentials of Multi-modal Large Language\n Models for Chest X-ray Interpretation","summary":" The rapid evolution of artificial intelligence, especially in large language\nmodels (LLMs), has significantly impacted various domains, including\nhealthcare. In chest X-ray (CXR) analysis, previous studies have employed LLMs,\nbut with limitations: either underutilizing the multi-tasking capabilities of\nLLMs or lacking clinical accuracy. This paper presents M4CXR, a multi-modal LLM\ndesigned to enhance CXR interpretation. The model is trained on a visual\ninstruction-following dataset that integrates various task-specific datasets in\na conversational format. As a result, the model supports multiple tasks such as\nmedical report generation (MRG), visual grounding, and visual question\nanswering (VQA). M4CXR achieves state-of-the-art clinical accuracy in MRG by\nemploying a chain-of-thought prompting strategy, in which it identifies\nfindings in CXR images and subsequently generates corresponding reports. The\nmodel is adaptable to various MRG scenarios depending on the available inputs,\nsuch as single-image, multi-image, and multi-study contexts. In addition to\nMRG, M4CXR performs visual grounding at a level comparable to specialized\nmodels and also demonstrates outstanding performance in VQA. 
Both quantitative\nand qualitative assessments reveal M4CXR's versatility in MRG, visual\ngrounding, and VQA, while consistently maintaining clinical accuracy.\n","authors":["Jonggwon Park","Soobum Kim","Byungmu Yoon","Jihun Hyun","Kyoyun Choi"],"pdf_url":"https://arxiv.org/pdf/2408.16213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10577v2","updated":"2024-08-29T02:09:11Z","published":"2024-05-17T07:04:29Z","title":"DuoSpaceNet: Leveraging Both Bird's-Eye-View and Perspective View\n Representations for 3D Object Detection","summary":" Recent advances in multi-view camera-only 3D object detection either rely on\nan accurate reconstruction of bird's-eye-view (BEV) 3D features or on\ntraditional 2D perspective view (PV) image features. While both have their own\npros and cons, few have found a way to stitch them together in order to benefit\nfrom \"the best of both worlds\". To this end, we explore a duo space (i.e., BEV\nand PV) 3D perception framework, in conjunction with some useful duo space\nfusion strategies that allow effective aggregation of the two feature\nrepresentations. To the best of our knowledge, our proposed method,\nDuoSpaceNet, is the first to leverage two distinct feature spaces and achieves\nthe state-of-the-art 3D object detection and BEV map segmentation results on\nnuScenes dataset.\n","authors":["Zhe Huang","Yizhe Zhao","Hao Xiao","Chenyan Wu","Lingting Ge"],"pdf_url":"https://arxiv.org/pdf/2405.10577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18038v2","updated":"2024-08-29T02:05:04Z","published":"2024-07-25T13:31:55Z","title":"TiCoSS: Tightening the Coupling between Semantic Segmentation and Stereo\n Matching within A Joint Learning Framework","summary":" Semantic segmentation and stereo matching, respectively analogous to the\nventral and dorsal streams in our human brain, are two key components of\nautonomous driving perception systems. Addressing these two tasks with separate\nnetworks is no longer the mainstream direction in developing computer vision\nalgorithms, particularly with the recent advances in large vision models and\nembodied artificial intelligence. The trend is shifting towards combining them\nwithin a joint learning framework, especially emphasizing feature sharing\nbetween the two tasks. The major contributions of this study lie in\ncomprehensively tightening the coupling between semantic segmentation and\nstereo matching. Specifically, this study introduces three novelties: (1) a\ntightly coupled, gated feature fusion strategy, (2) a hierarchical deep\nsupervision strategy, and (3) a coupling tightening loss function. The combined\nuse of these technical contributions results in TiCoSS, a state-of-the-art\njoint learning framework that simultaneously tackles semantic segmentation and\nstereo matching. Through extensive experiments on the KITTI and vKITTI2\ndatasets, along with qualitative and quantitative analyses, we validate the\neffectiveness of our developed strategies and loss function, and demonstrate\nits superior performance compared to prior arts, with a notable increase in\nmIoU by over 9%. 
Our source code will be publicly available at\nmias.group/TiCoSS upon publication.\n","authors":["Guanfeng Tang","Zhiyuan Wu","Jiahang Li","Ping Zhong","Xieyuanli Chen","Huiming Liu","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2407.18038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16201v1","updated":"2024-08-29T01:46:37Z","published":"2024-08-29T01:46:37Z","title":"Uni-3DAD: GAN-Inversion Aided Universal 3D Anomaly Detection on\n Model-free Products","summary":" Anomaly detection is a long-standing challenge in manufacturing systems.\nTraditionally, anomaly detection has relied on human inspectors. However, 3D\npoint clouds have gained attention due to their robustness to environmental\nfactors and their ability to represent geometric data. Existing 3D anomaly\ndetection methods generally fall into two categories. One compares scanned 3D\npoint clouds with design files, assuming these files are always available.\nHowever, such assumptions are often violated in many real-world applications\nwhere model-free products exist, such as fresh produce (i.e., ``Cookie\",\n``Potato\", etc.), dentures, bone, etc. The other category compares patches of\nscanned 3D point clouds with a library of normal patches named memory bank.\nHowever, those methods usually fail to detect incomplete shapes, which is a\nfairly common defect type (i.e., missing pieces of different products). The\nmain challenge is that missing areas in 3D point clouds represent the absence\nof scanned points. This makes it infeasible to compare the missing region with\nexisting point cloud patches in the memory bank. To address these two\nchallenges, we proposed a unified, unsupervised 3D anomaly detection framework\ncapable of identifying all types of defects on model-free products. Our method\nintegrates two detection modules: a feature-based detection module and a\nreconstruction-based detection module. Feature-based detection covers geometric\ndefects, such as dents, holes, and cracks, while the reconstruction-based\nmethod detects missing regions. Additionally, we employ a One-class Support\nVector Machine (OCSVM) to fuse the detection results from both modules. The\nresults demonstrate that (1) our proposed method outperforms the\nstate-of-the-art methods in identifying incomplete shapes and (2) it still\nmaintains comparable performance with the SOTA methods in detecting all other\ntypes of anomalies.\n","authors":["Jiayu Liu","Shancong Mou","Nathan Gaw","Yinan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16268v2","updated":"2024-08-29T01:44:27Z","published":"2023-12-26T12:16:03Z","title":"360 Layout Estimation via Orthogonal Planes Disentanglement and\n Multi-view Geometric Consistency Perception","summary":" Existing panoramic layout estimation solutions tend to recover room\nboundaries from a vertically compressed sequence, yielding imprecise results as\nthe compression process often muddles the semantics between various planes.\nBesides, these data-driven approaches impose an urgent demand for massive data\nannotations, which are laborious and time-consuming. For the first problem, we\npropose an orthogonal plane disentanglement network (termed DOPNet) to\ndistinguish ambiguous semantics. DOPNet consists of three modules that are\nintegrated to deliver distortion-free, semantics-clean, and detail-sharp\ndisentangled representations, which benefit the subsequent layout recovery. 
For\nthe second problem, we present an unsupervised adaptation technique tailored\nfor horizon-depth and ratio representations. Concretely, we introduce an\noptimization strategy for decision-level layout analysis and a 1D cost volume\nconstruction method for feature-level multi-view aggregation, both of which are\ndesigned to fully exploit the geometric consistency across multiple\nperspectives. The optimizer provides a reliable set of pseudo-labels for\nnetwork training, while the 1D cost volume enriches each view with\ncomprehensive scene information derived from other perspectives. Extensive\nexperiments demonstrate that our solution outperforms other SoTA models on both\nmonocular layout estimation and multi-view layout estimation tasks. Cobe can be\navailable at https://github.com/zhijieshen-bjtu/MV-DOPNet.\n","authors":["Zhijie Shen","Chunyu Lin","Junsong Zhang","Lang Nie","Kang Liao","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.16268v2.pdf","comment":"Accept to TPAMI2024. arXiv admin note: substantial text overlap with\n arXiv:2303.00971"},{"id":"http://arxiv.org/abs/2408.16200v1","updated":"2024-08-29T01:42:38Z","published":"2024-08-29T01:42:38Z","title":"PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object\n Detection in Bird's-Eye-View","summary":" Recently, LSS-based multi-view 3D object detection provides an economical and\ndeployment-friendly solution for autonomous driving. However, all the existing\nLSS-based methods transform multi-view image features into a Cartesian\nBird's-Eye-View(BEV) representation, which does not take into account the\nnon-uniform image information distribution and hardly exploits the view\nsymmetry. In this paper, in order to adapt the image information distribution\nand preserve the view symmetry by regular convolution, we propose to employ the\npolar BEV representation to substitute the Cartesian BEV representation. To\nachieve this, we elaborately tailor three modules: a polar view transformer to\ngenerate the polar BEV representation, a polar temporal fusion module for\nfusing historical polar BEV features and a polar detection head to predict the\npolar-parameterized representation of the object. In addition, we design a 2D\nauxiliary detection head and a spatial attention enhancement module to improve\nthe quality of feature extraction in perspective view and BEV, respectively.\nFinally, we integrate the above improvements into a novel multi-view 3D object\ndetector, PolarBEVDet. Experiments on nuScenes show that PolarBEVDet achieves\nthe superior performance. The code is available at\nhttps://github.com/Yzichen/PolarBEVDet.git.\n","authors":["Zichen Yu","Quanli Liu","Wei Wang","Liyong Zhang","Xiaoguang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.16200v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.01565v2","updated":"2024-08-29T01:32:17Z","published":"2024-08-02T20:40:19Z","title":"Embodiment: Self-Supervised Depth Estimation Based on Camera Models","summary":" Depth estimation is a critical topic for robotics and vision-related tasks.\nIn monocular depth estimation, in comparison with supervised learning that\nrequires expensive ground truth labeling, self-supervised methods possess great\npotential due to no labeling cost. However, self-supervised learning still has\na large gap with supervised learning in 3D reconstruction and depth estimation\nperformance. 
Meanwhile, scaling is also a major issue for monocular\nunsupervised depth estimation, which commonly still needs ground truth scale\nfrom GPS, LiDAR, or existing maps to correct. In the era of deep learning,\nexisting methods primarily rely on exploring image relationships to train\nunsupervised neural networks, while the physical properties of the camera\nitself such as intrinsics and extrinsics are often overlooked. These physical\nproperties are not just mathematical parameters; they are embodiments of the\ncamera's interaction with the physical world. By embedding these physical\nproperties into the deep learning model, we can calculate depth priors for\nground regions and regions connected to the ground based on physical\nprinciples, providing free supervision signals without the need for additional\nsensors. This approach is not only easy to implement but also enhances the\neffects of all unsupervised methods by embedding the camera's physical\nproperties into the model, thereby achieving an embodied understanding of the\nreal world.\n","authors":["Jinchang Zhang","Praveen Kumar Reddy","Xue-Iuan Wong","Yiannis Aloimonos","Guoyu Lu"],"pdf_url":"https://arxiv.org/pdf/2408.01565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16195v1","updated":"2024-08-29T01:25:36Z","published":"2024-08-29T01:25:36Z","title":"DLM-VMTL:A Double Layer Mapper for heterogeneous data video Multi-task\n prompt learning","summary":" In recent years, the parameters of backbones of Video Understanding tasks\ncontinue to increase and even reach billion-level. Whether fine-tuning a\nspecific task on the Video Foundation Model or pre-training the model designed\nfor the specific task, incurs a lot of overhead. How to make these models play\nother values than their own tasks becomes a worthy question. Multi-Task\nLearning(MTL) makes the visual task acquire the rich shareable knowledge from\nother tasks while joint training. It is fully explored in Image Recognition\ntasks especially dense predict tasks. Nevertheless, it is rarely used in video\ndomain due to the lack of multi-labels video data. In this paper, a\nheterogenous data video multi-task prompt learning (VMTL) method is proposed to\naddress above problem. It's different from it in image domain, a Double-Layers\nMapper(DLM) is proposed to extract the shareable knowledge into visual promptS\nand align it with representation of primary task. Extensive experiments prove\nthat our DLM-VMTL performs better than baselines on 6 different video\nunderstanding tasks and 11 datasets.\n","authors":["Zeyi Bo","Wuxi Sun","Ye Jin"],"pdf_url":"https://arxiv.org/pdf/2408.16195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16190v1","updated":"2024-08-29T01:06:51Z","published":"2024-08-29T01:06:51Z","title":"Estimating Dynamic Flow Features in Groups of Tracked Objects","summary":" Interpreting motion captured in image sequences is crucial for a wide range\nof computer vision applications. Typical estimation approaches include optical\nflow (OF), which approximates the apparent motion instantaneously in a scene,\nand multiple object tracking (MOT), which tracks the motion of subjects over\ntime. Often, the motion of objects in a scene is governed by some underlying\ndynamical system which could be inferred by analyzing the motion of groups of\nobjects. 
Standard motion analyses, however, are not designed to intuit flow\ndynamics from trajectory data, making such measurements difficult in practice.\nThe goal of this work is to extend gradient-based dynamical systems analyses to\nreal-world applications characterized by complex, feature-rich image sequences\nwith imperfect tracers. The tracer trajectories are tracked using deep vision\nnetworks and gradients are approximated using Lagrangian gradient regression\n(LGR), a tool designed to estimate spatial gradients from sparse data. From\ngradients, dynamical features such as regions of coherent rotation and\ntransport barriers are identified. The proposed approach is affordably\nimplemented and enables advanced studies including the motion analysis of two\ndistinct object classes in a single image sequence. Two examples of the method\nare presented on data sets for which standard gradient-based analyses do not\napply.\n","authors":["Tanner D. Harms","Steven L. Brunton","Beverley J. McKeon"],"pdf_url":"https://arxiv.org/pdf/2408.16190v1.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.16930v1","updated":"2024-08-29T22:13:29Z","published":"2024-08-29T22:13:29Z","title":"VLM-KD: Knowledge Distillation from VLM for Long-Tail Visual Recognition","summary":" For visual recognition, knowledge distillation typically involves\ntransferring knowledge from a large, well-trained teacher model to a smaller\nstudent model. In this paper, we introduce an effective method to distill\nknowledge from an off-the-shelf vision-language model (VLM), demonstrating that\nit provides novel supervision in addition to those from a conventional\nvision-only teacher model. Our key technical contribution is the development of\na framework that generates novel text supervision and distills free-form text\ninto a vision encoder. We showcase the effectiveness of our approach, termed\nVLM-KD, across various benchmark datasets, showing that it surpasses several\nstate-of-the-art long-tail visual classifiers. To our knowledge, this work is\nthe first to utilize knowledge distillation with text supervision generated by\nan off-the-shelf VLM and apply it to vanilla randomly initialized vision\nencoders.\n","authors":["Zaiwei Zhang","Gregory P. Meyer","Zhichao Lu","Ashish Shrivastava","Avinash Ravichandran","Eric M. Wolff"],"pdf_url":"https://arxiv.org/pdf/2408.16930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16924v1","updated":"2024-08-29T21:53:01Z","published":"2024-08-29T21:53:01Z","title":"Enhancing Autism Spectrum Disorder Early Detection with the Parent-Child\n Dyads Block-Play Protocol and an Attention-enhanced GCN-xLSTM Hybrid Deep\n Learning Framework","summary":" Autism Spectrum Disorder (ASD) is a rapidly growing neurodevelopmental\ndisorder. Performing a timely intervention is crucial for the growth of young\nchildren with ASD, but traditional clinical screening methods lack objectivity.\nThis study introduces an innovative approach to early detection of ASD. The\ncontributions are threefold. First, this work proposes a novel Parent-Child\nDyads Block-Play (PCB) protocol, grounded in kinesiological and neuroscientific\nresearch, to identify behavioral patterns distinguishing ASD from typically\ndeveloping (TD) toddlers. Second, we have compiled a substantial video dataset,\nfeaturing 40 ASD and 89 TD toddlers engaged in block play with parents. This\ndataset exceeds previous efforts on both the scale of participants and the\nlength of individual sessions. 
Third, our approach to action analysis in videos\nemploys a hybrid deep learning framework, integrating a two-stream graph\nconvolution network with attention-enhanced xLSTM (2sGCN-AxLSTM). This\nframework is adept at capturing dynamic interactions between toddlers and\nparents by extracting spatial features correlated with upper body and head\nmovements and focusing on global contextual information of action sequences\nover time. By learning these global features with spatio-temporal correlations,\nour 2sGCN-AxLSTM effectively analyzes dynamic human behavior patterns and\ndemonstrates an unprecedented accuracy of 89.6\\% in early detection of ASD. Our\napproach shows strong potential for enhancing early ASD diagnosis by accurately\nanalyzing parent-child interactions, providing a critical tool to support\ntimely and informed clinical decision-making.\n","authors":["Xiang Li","Lizhou Fan","Hanbo Wu","Kunping Chen","Xiaoxiao Yu","Chao Che","Zhifeng Cai","Xiuhong Niu","Aihua Cao","Xin Ma"],"pdf_url":"https://arxiv.org/pdf/2408.16924v1.pdf","comment":"18 pages, 8 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2408.16907v1","updated":"2024-08-29T21:08:07Z","published":"2024-08-29T21:08:07Z","title":"Ig3D: Integrating 3D Face Representations in Facial Expression Inference","summary":" Reconstructing 3D faces with facial geometry from single images has allowed\nfor major advances in animation, generative models, and virtual reality.\nHowever, this ability to represent faces with their 3D features is not as fully\nexplored by the facial expression inference (FEI) community. This study\ntherefore aims to investigate the impacts of integrating such 3D\nrepresentations into the FEI task, specifically for facial expression\nclassification and face-based valence-arousal (VA) estimation. To accomplish\nthis, we first assess the performance of two 3D face representations (both\nbased on the 3D morphable model, FLAME) for the FEI tasks. We further explore\ntwo fusion architectures, intermediate fusion and late fusion, for integrating\nthe 3D face representations with existing 2D inference frameworks. To evaluate\nour proposed architecture, we extract the corresponding 3D representations and\nperform extensive tests on the AffectNet and RAF-DB datasets. Our experimental\nresults demonstrate that our proposed method outperforms the state-of-the-art\nAffectNet VA estimation and RAF-DB classification tasks. Moreover, our method\ncan act as a complement to other existing methods to boost performance in many\nemotion inference tasks.\n","authors":["Lu Dong","Xiao Wang","Srirangaraj Setlur","Venu Govindaraju","Ifeoma Nwogu"],"pdf_url":"https://arxiv.org/pdf/2408.16907v1.pdf","comment":"Accepted by ECCVW 2024"},{"id":"http://arxiv.org/abs/2408.16892v1","updated":"2024-08-29T20:26:27Z","published":"2024-08-29T20:26:27Z","title":"Tex-ViT: A Generalizable, Robust, Texture-based dual-branch\n cross-attention deepfake detector","summary":" Deepfakes, which employ GAN to produce highly realistic facial modification,\nare widely regarded as the prevailing method. Traditional CNN have been able to\nidentify bogus media, but they struggle to perform well on different datasets\nand are vulnerable to adversarial attacks due to their lack of robustness.\nVision transformers have demonstrated potential in the realm of image\nclassification problems, but they require enough training data. 
Motivated by\nthese limitations, this publication introduces Tex-ViT (Texture-Vision\nTransformer), which enhances CNN features by combining ResNet with a vision\ntransformer. The model combines traditional ResNet features with a texture\nmodule that operates in parallel on sections of ResNet before each\ndown-sampling operation. The texture module then serves as an input to the dual\nbranch of the cross-attention vision transformer. It specifically focuses on\nimproving the global texture module, which extracts feature map correlation.\nEmpirical analysis reveals that fake images exhibit smooth textures that do not\nremain consistent over long distances in manipulations. Experiments were\nperformed on different categories of FF++, such as DF, f2f, FS, and NT,\ntogether with other types of GAN datasets in cross-domain scenarios.\nFurthermore, experiments also conducted on FF++, DFDCPreview, and Celeb-DF\ndataset underwent several post-processing situations, such as blurring,\ncompression, and noise. The model surpassed the most advanced models in terms\nof generalization, achieving a 98% accuracy in cross-domain scenarios. This\ndemonstrates its ability to learn the shared distinguishing textural\ncharacteristics in the manipulated samples. These experiments provide evidence\nthat the proposed model is capable of being applied to various situations and\nis resistant to many post-processing procedures.\n","authors":["Deepak Dagar","Dinesh Kumar Vishwakarma"],"pdf_url":"https://arxiv.org/pdf/2408.16892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16886v1","updated":"2024-08-29T20:19:10Z","published":"2024-08-29T20:19:10Z","title":"LV-UNet: A Lightweight and Vanilla Model for Medical Image Segmentation","summary":" Although the progress made by large models in computer vision, optimization\nchallenges, the complexity of transformer models, computational limitations,\nand the requirements of practical applications call for simpler designs in\nmodel architecture for medical image segmentation, especially in mobile medical\ndevices that require lightweight and deployable models with real-time\nperformance. However, some of the current lightweight models exhibit poor\nrobustness across different datasets, which hinders their broader adoption.\nThis paper proposes a lightweight and vanilla model called LV-UNet, which\neffectively utilizes pre-trained MobileNetv3-Large models and introduces\nfusible modules. It can be trained using an improved deep training strategy and\nswitched to deployment mode during inference, reducing both parameter count and\ncomputational load. Experiments are conducted on ISIC 2016, BUSI, CVC-\nClinicDB, CVC-ColonDB, and Kvair-SEG datasets, achieving better performance\ncompared to the state-of-the-art and classic models.\n","authors":["Juntao Jiang","Mengmeng Wang","Huizhong Tian","Lingbo Cheng","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16883v1","updated":"2024-08-29T20:12:01Z","published":"2024-08-29T20:12:01Z","title":"Revising Multimodal VAEs with Diffusion Decoders","summary":" Multimodal VAEs often struggle with generating high-quality outputs, a\nchallenge that extends beyond the inherent limitations of the VAE framework.\nThe core issue lies in the restricted joint representation of the latent space,\nparticularly when complex modalities like images are involved. 
Feedforward\ndecoders, commonly used for these intricate modalities, inadvertently constrain\nthe joint latent space, leading to a degradation in the quality of the other\nmodalities as well. Although recent studies have shown improvement by\nintroducing modality-specific representations, the issue remains significant.\nIn this work, we demonstrate that incorporating a flexible diffusion decoder\nspecifically for the image modality not only enhances the generation quality of\nthe images but also positively impacts the performance of the other modalities\nthat rely on feedforward decoders. This approach addresses the limitations\nimposed by conventional joint representations and opens up new possibilities\nfor improving multimodal generation tasks using the multimodal VAE framework.\nOur model provides state-of-the-art results compared to other multimodal VAEs\nin different datasets with higher coherence and superior quality in the\ngenerated modalities\n","authors":["Daniel Wesego","Amirmohammad Rooshenas"],"pdf_url":"https://arxiv.org/pdf/2408.16883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16881v1","updated":"2024-08-29T20:08:22Z","published":"2024-08-29T20:08:22Z","title":"FineFACE: Fair Facial Attribute Classification Leveraging Fine-grained\n Features","summary":" Published research highlights the presence of demographic bias in automated\nfacial attribute classification algorithms, particularly impacting women and\nindividuals with darker skin tones. Existing bias mitigation techniques\ntypically require demographic annotations and often obtain a trade-off between\nfairness and accuracy, i.e., Pareto inefficiency. Facial attributes, whether\ncommon ones like gender or others such as \"chubby\" or \"high cheekbones\",\nexhibit high interclass similarity and intraclass variation across demographics\nleading to unequal accuracy. This requires the use of local and subtle cues\nusing fine-grained analysis for differentiation. This paper proposes a novel\napproach to fair facial attribute classification by framing it as a\nfine-grained classification problem. Our approach effectively integrates both\nlow-level local features (like edges and color) and high-level semantic\nfeatures (like shapes and structures) through cross-layer mutual attention\nlearning. Here, shallow to deep CNN layers function as experts, offering\ncategory predictions and attention regions. An exhaustive evaluation on facial\nattribute annotated datasets demonstrates that our FineFACE model improves\naccuracy by 1.32% to 1.74% and fairness by 67% to 83.6%, over the SOTA bias\nmitigation techniques. Importantly, our approach obtains a Pareto-efficient\nbalance between accuracy and fairness between demographic groups. In addition,\nour approach does not require demographic annotations and is applicable to\ndiverse downstream classification tasks. To facilitate reproducibility, the\ncode and dataset information is available at\nhttps://github.com/VCBSL-Fairness/FineFACE.\n","authors":["Ayesha Manzoor","Ajita Rattani"],"pdf_url":"https://arxiv.org/pdf/2408.16881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16879v1","updated":"2024-08-29T20:05:02Z","published":"2024-08-29T20:05:02Z","title":"MSLIQA: Enhancing Learning Representations for Image Quality Assessment\n through Multi-Scale Learning","summary":" No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due\nto the diversity of distortions and the lack of large annotated datasets. 
Many\nstudies have attempted to tackle these challenges by developing more accurate\nNR-IQA models, often employing complex and computationally expensive networks,\nor by bridging the domain gap between various distortions to enhance\nperformance on test datasets. In our work, we improve the performance of a\ngeneric lightweight NR-IQA model by introducing a novel augmentation strategy\nthat boosts its performance by almost 28\\%. This augmentation strategy enables\nthe network to better discriminate between different distortions in various\nparts of the image by zooming in and out. Additionally, the inclusion of\ntest-time augmentation further enhances performance, making our lightweight\nnetwork's results comparable to the current state-of-the-art models, simply\nthrough the use of augmentations.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildiyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.16879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00264v2","updated":"2024-08-29T19:38:54Z","published":"2024-05-01T00:48:55Z","title":"Using Texture to Classify Forests Separately from Vegetation","summary":" Identifying terrain within satellite image data is a key issue in\ngeographical information sciences, with numerous environmental and safety\nimplications. Many techniques exist to derive classifications from spectral\ndata captured by satellites. However, the ability to reliably classify\nvegetation remains a challenge. In particular, no precise methods exist for\nclassifying forest vs. non-forest vegetation in high-level satellite images.\nThis paper provides an initial proposal for a static, algorithmic process to\nidentify forest regions in satellite image data through texture features\ncreated from detected edges and the NDVI ratio captured by Sentinel-2 satellite\nimages. With strong initial results, this paper also identifies the next steps\nto improve the accuracy of the classification and verification processes.\n","authors":["David R. Treadwell IV","Derek Jacoby","Will Parkinson","Bruce Maxwell","Yvonne Coady"],"pdf_url":"https://arxiv.org/pdf/2405.00264v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16866v1","updated":"2024-08-29T19:11:46Z","published":"2024-08-29T19:11:46Z","title":"GameIR: A Large-Scale Synthesized Ground-Truth Dataset for Image\n Restoration over Gaming Content","summary":" Image restoration methods like super-resolution and image synthesis have been\nsuccessfully used in commercial cloud gaming products like NVIDIA's DLSS.\nHowever, restoration over gaming content is not well studied by the general\npublic. The discrepancy is mainly caused by the lack of ground-truth gaming\ntraining data that match the test cases. Due to the unique characteristics of\ngaming content, the common approach of generating pseudo training data by\ndegrading the original HR images results in inferior restoration performance.\nIn this work, we develop GameIR, a large-scale high-quality\ncomputer-synthesized ground-truth dataset to fill in the blanks, targeting at\ntwo different applications. The first is super-resolution with deferred\nrendering, to support the gaming solution of rendering and transferring LR\nimages only and restoring HR images on the client side. We provide 19200 LR-HR\npaired ground-truth frames coming from 640 videos rendered at 720p and 1440p\nfor this task. 
The second is novel view synthesis (NVS), to support the\nmultiview gaming solution of rendering and transferring part of the multiview\nframes and generating the remaining frames on the client side. This task has\n57,600 HR frames from 960 videos of 160 scenes with 6 camera views. In addition\nto the RGB frames, the GBuffers during the deferred rendering stage are also\nprovided, which can be used to help restoration. Furthermore, we evaluate\nseveral SOTA super-resolution algorithms and NeRF-based NVS algorithms over our\ndataset, which demonstrates the effectiveness of our ground-truth GameIR data\nin improving restoration performance for gaming content. Also, we test the\nmethod of incorporating the GBuffers as additional input information for\nhelping super-resolution and NVS. We release our dataset and models to the\ngeneral public to facilitate research on restoration methods over gaming\ncontent.\n","authors":["Lebin Zhou","Kun Han","Nam Ling","Wei Wang","Wei Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.16866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16859v1","updated":"2024-08-29T18:49:32Z","published":"2024-08-29T18:49:32Z","title":"Comparative Analysis of Transfer Learning Models for Breast Cancer\n Classification","summary":" The classification of histopathological images is crucial for the early and\nprecise detection of breast cancer. This study investigates the efficiency of\ndeep learning models in distinguishing between Invasive Ductal Carcinoma (IDC)\nand non-IDC in histopathology slides. We conducted a thorough comparison\nexamination of eight sophisticated models: ResNet-50, DenseNet-121, ResNeXt-50,\nVision Transformer (ViT), GoogLeNet (Inception v3), EfficientNet, MobileNet,\nand SqueezeNet. This analysis was carried out using a large dataset of 277,524\nimage patches. Our research makes a substantial contribution to the field by\noffering a comprehensive assessment of the performance of each model. We\nparticularly highlight the exceptional efficacy of attention-based mechanisms\nin the ViT model, which achieved a remarkable validation accuracy of 93\\%,\nsurpassing conventional convolutional networks. This study highlights the\npromise of advanced machine learning approaches in clinical settings, offering\nimproved precision as well as efficiency in breast cancer diagnosis.\n","authors":["Sania Eskandari","Ali Eslamian","Qiang Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.16859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16845v1","updated":"2024-08-29T18:21:50Z","published":"2024-08-29T18:21:50Z","title":"Enabling Local Editing in Diffusion Models by Joint and Individual\n Component Analysis","summary":" Recent advances in Diffusion Models (DMs) have led to significant progress in\nvisual synthesis and editing tasks, establishing them as a strong competitor to\nGenerative Adversarial Networks (GANs). However, the latent space of DMs is not\nas well understood as that of GANs. Recent research has focused on unsupervised\nsemantic discovery in the latent space of DMs by leveraging the bottleneck\nlayer of the denoising network, which has been shown to exhibit properties of a\nsemantic latent space. However, these approaches are limited to discovering\nglobal attributes. In this paper we address, the challenge of local image\nmanipulation in DMs and introduce an unsupervised method to factorize the\nlatent semantics learned by the denoising network of pre-trained DMs. 
Given an\narbitrary image and defined regions of interest, we utilize the Jacobian of the\ndenoising network to establish a relation between the regions of interest and\ntheir corresponding subspaces in the latent space. Furthermore, we disentangle\nthe joint and individual components of these subspaces to identify latent\ndirections that enable local image manipulation. Once discovered, these\ndirections can be applied to different images to produce semantically\nconsistent edits, making our method suitable for practical applications.\nExperimental results on various datasets demonstrate that our method can\nproduce semantic edits that are more localized and have better fidelity\ncompared to the state-of-the-art.\n","authors":["Theodoros Kouzelis","Manos Plitsis","Mihalis A. Nikolaou","Yannis Panagakis"],"pdf_url":"https://arxiv.org/pdf/2408.16845v1.pdf","comment":"Code available here: https://zelaki.github.io/localdiff/"},{"id":"http://arxiv.org/abs/2408.16827v1","updated":"2024-08-29T18:00:03Z","published":"2024-08-29T18:00:03Z","title":"Fluent and Accurate Image Captioning with a Self-Trained Reward Model","summary":" Fine-tuning image captioning models with hand-crafted rewards like the CIDEr\nmetric has been a classical strategy for promoting caption quality at the\nsequence level. This approach, however, is known to limit descriptiveness and\nsemantic richness and tends to drive the model towards the style of\nground-truth sentences, thus losing detail and specificity. On the contrary,\nrecent attempts to employ image-text models like CLIP as reward have led to\ngrammatically incorrect and repetitive captions. In this paper, we propose\nSelf-Cap, a captioning approach that relies on a learnable reward model based\non self-generated negatives that can discriminate captions based on their\nconsistency with the image. Specifically, our discriminator is a fine-tuned\ncontrastive image-text model trained to promote caption correctness while\navoiding the aberrations that typically happen when training with a CLIP-based\nreward. To this end, our discriminator directly incorporates negative samples\nfrom a frozen captioner, which significantly improves the quality and richness\nof the generated captions but also reduces the fine-tuning time in comparison\nto using the CIDEr score as the sole metric for optimization. Experimental\nresults demonstrate the effectiveness of our training strategy on both standard\nand zero-shot image captioning datasets.\n","authors":["Nicholas Moratelli","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2408.16827v1.pdf","comment":"ICPR 2024"},{"id":"http://arxiv.org/abs/2408.16809v1","updated":"2024-08-29T17:59:57Z","published":"2024-08-29T17:59:57Z","title":"See or Guess: Counterfactually Regularized Image Captioning","summary":" Image captioning, which generates natural language descriptions of the visual\ninformation in an image, is a crucial task in vision-language research.\nPrevious models have typically addressed this task by aligning the generative\ncapabilities of machines with human intelligence through statistical fitting of\nexisting datasets. While effective for normal images, they may struggle to\naccurately describe those where certain parts of the image are obscured or\nedited, unlike humans who excel in such cases. These weaknesses they exhibit,\nincluding hallucinations and limited interpretability, often hinder performance\nin scenarios with shifted association patterns. 
In this paper, we present a\ngeneric image captioning framework that employs causal inference to make\nexisting models more capable of interventional tasks, and counterfactually\nexplainable. Our approach includes two variants leveraging either total effect\nor natural direct effect. Integrating them into the training process enables\nmodels to handle counterfactual scenarios, increasing their generalizability.\nExtensive experiments on various datasets show that our method effectively\nreduces hallucinations and improves the model's faithfulness to images,\ndemonstrating high portability across both small-scale and large-scale\nimage-to-text models. The code is available at\nhttps://github.com/Aman-4-Real/See-or-Guess.\n","authors":["Qian Cao","Xu Chen","Ruihua Song","Xiting Wang","Xinting Huang","Yuchen Ren"],"pdf_url":"https://arxiv.org/pdf/2408.16809v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.16807v1","updated":"2024-08-29T17:29:26Z","published":"2024-08-29T17:29:26Z","title":"STEREO: Towards Adversarially Robust Concept Erasing from Text-to-Image\n Generation Models","summary":" The rapid proliferation of large-scale text-to-image generation (T2IG) models\nhas led to concerns about their potential misuse in generating harmful content.\nThough many methods have been proposed for erasing undesired concepts from T2IG\nmodels, they only provide a false sense of security, as recent works\ndemonstrate that concept-erased models (CEMs) can be easily deceived to\ngenerate the erased concept through adversarial attacks. The problem of\nadversarially robust concept erasing without significant degradation to model\nutility (ability to generate benign concepts) remains an unresolved challenge,\nespecially in the white-box setting where the adversary has access to the CEM.\nTo address this gap, we propose an approach called STEREO that involves two\ndistinct stages. The first stage searches thoroughly enough for strong and\ndiverse adversarial prompts that can regenerate an erased concept from a CEM,\nby leveraging robust optimization principles from adversarial training. In the\nsecond robustly erase once stage, we introduce an anchor-concept-based\ncompositional objective to robustly erase the target concept at one go, while\nattempting to minimize the degradation on model utility. By benchmarking the\nproposed STEREO approach against four state-of-the-art concept erasure methods\nunder three adversarial attacks, we demonstrate its ability to achieve a better\nrobustness vs. utility trade-off. Our code and models are available at\nhttps://github.com/koushiksrivats/robust-concept-erasing.\n","authors":["Koushik Srivatsan","Fahad Shamshad","Muzammal Naseer","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2408.16807v1.pdf","comment":"Project Page:\n https://koushiksrivats.github.io/robust-concept-erasing/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.16672v1","updated":"2024-08-29T16:21:00Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. 
In\nthis paper, we introduce several improvements to the ColBERT model architecture\nand training pipeline, leveraging techniques successful in the more established\nsingle-vector embedding model paradigm, particularly those suited for\nheterogeneous multilingual data. Our new model, Jina-ColBERT-v2, demonstrates\nstrong performance across a range of English and multilingual retrieval tasks,\nwhile also cutting storage requirements by up to 50% compared to previous\nmodels.\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Saba Sturua","Mohammad Kalim Akram","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14698v2","updated":"2024-08-29T15:14:48Z","published":"2024-08-26T23:52:27Z","title":"Smart Multi-Modal Search: Contextual Sparse and Dense Embedding\n Integration in Adobe Express","summary":" As user content and queries become increasingly multi-modal, the need for\neffective multi-modal search systems has grown. Traditional search systems\noften rely on textual and metadata annotations for indexed images, while\nmulti-modal embeddings like CLIP enable direct search using text and image\nembeddings. However, embedding-based approaches face challenges in integrating\ncontextual features such as user locale and recency. Building a scalable\nmulti-modal search system requires fine-tuning several components. This paper\npresents a multi-modal search architecture and a series of AB tests that\noptimize embeddings and multi-modal technologies in Adobe Express template\nsearch. We address considerations such as embedding model selection, the roles\nof embeddings in matching and ranking, and the balance between dense and sparse\nembeddings. Our iterative approach demonstrates how utilizing sparse, dense,\nand contextual features enhances short and long query search, significantly\nreduces null rates (over 70\\%), and increases click-through rates (CTR). Our\nfindings provide insights into developing robust multi-modal search systems,\nthereby enhancing relevance for complex queries.\n","authors":["Cherag Aroraa","Tracy Holloway King","Jayant Kumar","Yi Lu","Sanat Sharma","Arvind Srikantan","David Uvalle","Josep Valls-Vargas","Harsha Vardhan"],"pdf_url":"https://arxiv.org/pdf/2408.14698v2.pdf","comment":"CIKM 2024 (International Conference on Information and Knowledge\n Management), Multimodal Search and Recommendations Workshop"},{"id":"http://arxiv.org/abs/2408.16578v1","updated":"2024-08-29T14:44:12Z","published":"2024-08-29T14:44:12Z","title":"Transformers Meet ACT-R: Repeat-Aware and Sequential Listening Session\n Recommendation","summary":" Music streaming services often leverage sequential recommender systems to\npredict the best music to showcase to users based on past sequences of\nlistening sessions. Nonetheless, most sequential recommendation methods ignore\nor insufficiently account for repetitive behaviors. This is a crucial\nlimitation for music recommendation, as repeatedly listening to the same song\nover time is a common phenomenon that can even change the way users perceive\nthis song. In this paper, we introduce PISA (Psychology-Informed Session\nembedding using ACT-R), a session-level sequential recommender system that\novercomes this limitation. 
PISA employs a Transformer architecture learning\nembedding representations of listening sessions and users using attention\nmechanisms inspired by Anderson's ACT-R (Adaptive Control of Thought-Rational),\na cognitive architecture modeling human information access and memory dynamics.\nThis approach enables us to capture dynamic and repetitive patterns from user\nbehaviors, allowing us to effectively predict the songs they will listen to in\nsubsequent sessions, whether they are repeated or new ones. We demonstrate the\nempirical relevance of PISA using both publicly available listening data from\nLast.fm and proprietary data from Deezer, a global music streaming service,\nconfirming the critical importance of repetition modeling for sequential\nlistening session recommendation. Along with this paper, we publicly release\nour proprietary dataset to foster future research in this field, as well as the\nsource code of PISA to facilitate its future use.\n","authors":["Viet-Anh Tran","Guillaume Salha-Galvan","Bruno Sguerra","Romain Hennequin"],"pdf_url":"https://arxiv.org/pdf/2408.16578v1.pdf","comment":"11 pages. Accepted by RecSys'2024, full paper"},{"id":"http://arxiv.org/abs/2408.16446v1","updated":"2024-08-29T11:19:57Z","published":"2024-08-29T11:19:57Z","title":"Is text normalization relevant for classifying medieval charters?","summary":" This study examines the impact of historical text normalization on the\nclassification of medieval charters, specifically focusing on document dating\nand locating. Using a data set of Middle High German charters from a digital\narchive, we evaluate various classifiers, including traditional and\ntransformer-based models, with and without normalization. Our results indicate\nthat the given normalization minimally improves locating tasks but reduces\naccuracy for dating, implying that original texts contain crucial features that\nnormalization may obscure. We find that support vector machines and gradient\nboosting outperform other models, questioning the efficiency of transformers\nfor this use case. Results suggest a selective approach to historical text\nnormalization, emphasizing the significance of preserving some textual\ncharacteristics that are critical for classification tasks in document\nanalysis.\n","authors":["Florian Atzenhofer-Baumgartner","Tamás Kovács"],"pdf_url":"https://arxiv.org/pdf/2408.16446v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections"},{"id":"http://arxiv.org/abs/2408.16430v1","updated":"2024-08-29T10:44:59Z","published":"2024-08-29T10:44:59Z","title":"Do Recommender Systems Promote Local Music? A Reproducibility Study\n Using Music Streaming Data","summary":" This paper examines the influence of recommender systems on local music\nrepresentation, discussing prior findings from an empirical study on the LFM-2b\npublic dataset. This prior study argued that different recommender systems\nexhibit algorithmic biases shifting music consumption either towards or against\nlocal content. However, LFM-2b users do not reflect the diverse audience of\nmusic streaming services. 
To assess the robustness of this study's conclusions,\nwe conduct a comparative analysis using proprietary listening data from a\nglobal music streaming service, which we publicly release alongside this paper.\nWe observe significant differences in local music consumption patterns between\nour dataset and LFM-2b, suggesting that caution should be exercised when\ndrawing conclusions on local music based solely on LFM-2b. Moreover, we show\nthat the algorithmic biases exhibited in the original work vary in our dataset,\nand that several unexplored model parameters can significantly influence these\nbiases and affect the study's conclusion on both datasets. Finally, we discuss\nthe complexity of accurately labeling local music, emphasizing the risk of\nmisleading conclusions due to unreliable, biased, or incomplete labels. To\nencourage further research and ensure reproducibility, we have publicly shared\nour dataset and code.\n","authors":["Kristina Matrosova","Lilian Marey","Guillaume Salha-Galvan","Thomas Louail","Olivier Bodini","Manuel Moussallam"],"pdf_url":"https://arxiv.org/pdf/2408.16430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16312v1","updated":"2024-08-29T07:20:56Z","published":"2024-08-29T07:20:56Z","title":"SynDL: A Large-Scale Synthetic Test Collection","summary":" Large-scale test collections play a crucial role in Information Retrieval\n(IR) research. However, according to the Cranfield paradigm and the research\ninto publicly available datasets, the existing information retrieval research\nstudies are commonly developed on small-scale datasets that rely on human\nassessors for relevance judgments - a time-intensive and expensive process.\nRecent studies have shown the strong capability of Large Language Models (LLMs)\nin producing reliable relevance judgments with human accuracy but at a greatly\nreduced cost. In this paper, to address the missing large-scale ad-hoc document\nretrieval dataset, we extend the TREC Deep Learning Track (DL) test collection\nvia additional language model synthetic labels to enable researchers to test\nand evaluate their search systems at a large scale. Specifically, such a test\ncollection includes more than 1,900 test queries from the previous years of\ntracks. We compare system evaluation with past human labels from past years and\nfind that our synthetically created large-scale test collection can lead to\nhighly correlated system rankings.\n","authors":["Hossein A. Rahmani","Xi Wang","Emine Yilmaz","Nick Craswell","Bhaskar Mitra","Paul Thomas"],"pdf_url":"https://arxiv.org/pdf/2408.16312v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2408.16296v1","updated":"2024-08-29T06:54:03Z","published":"2024-08-29T06:54:03Z","title":"Rethinking Sparse Lexical Representations for Image Retrieval in the Age\n of Rising Multi-Modal Large Language Models","summary":" In this paper, we rethink sparse lexical representations for image retrieval.\nBy utilizing multi-modal large language models (M-LLMs) that support visual\nprompting, we can extract image features and convert them into textual data,\nenabling us to utilize efficient sparse retrieval algorithms employed in\nnatural language processing for image retrieval tasks. To assist the LLM in\nextracting image features, we apply data augmentation techniques for key\nexpansion and analyze the impact with a metric for relevance between images and\ntextual data. 
We empirically show the superior precision and recall performance\nof our image retrieval method compared to conventional vision-language\nmodel-based methods on the MS-COCO, PASCAL VOC, and NUS-WIDE datasets in a\nkeyword-based image retrieval scenario, where keywords serve as search queries.\nWe also demonstrate that the retrieval performance can be improved by\niteratively incorporating keywords into search queries.\n","authors":["Kengo Nakata","Daisuke Miyashita","Youyang Ng","Yasuto Hoshi","Jun Deguchi"],"pdf_url":"https://arxiv.org/pdf/2408.16296v1.pdf","comment":"Accepted to ECCV 2024 Workshops: 2nd Workshop on Traditional Computer\n Vision in the Age of Deep Learning (TradiCV)"},{"id":"http://arxiv.org/abs/2408.16238v1","updated":"2024-08-29T03:34:39Z","published":"2024-08-29T03:34:39Z","title":"Efficient Transfer Learning Framework for Cross-Domain Click-Through\n Rate Prediction","summary":" Natural content and advertisement coexist in industrial recommendation\nsystems but differ in data distribution. Concretely, traffic related to the\nadvertisement is considerably sparser compared to that of natural content,\nwhich motivates the development of transferring knowledge from the richer\nsource natural content domain to the sparser advertising domain. The challenges\ninclude the inefficiencies arising from the management of extensive source data\nand the problem of 'catastrophic forgetting' that results from the CTR model's\ndaily updating. To this end, we propose a novel tri-level asynchronous\nframework, i.e., Efficient Transfer Learning Framework for Cross-Domain\nClick-Through Rate Prediction (E-CDCTR), to transfer comprehensive knowledge of\nnatural content to advertisement CTR models. This framework consists of three\nkey components: Tiny Pre-training Model ((TPM), which trains a tiny CTR model\nwith several basic features on long-term natural data; Complete Pre-training\nModel (CPM), which trains a CTR model holding network structure and input\nfeatures the same as target advertisement on short-term natural data;\nAdvertisement CTR model (A-CTR), which derives its parameter initialization\nfrom CPM together with multiple historical embeddings from TPM as extra feature\nand then fine-tunes on advertisement data. TPM provides richer representations\nof user and item for both the CPM and A-CTR, effectively alleviating the\nforgetting problem inherent in the daily updates. CPM further enhances the\nadvertisement model by providing knowledgeable initialization, thereby\nalleviating the data sparsity challenges typically encountered by advertising\nCTR models. Such a tri-level cross-domain transfer learning framework offers an\nefficient solution to address both data sparsity and `catastrophic forgetting',\nyielding remarkable improvements.\n","authors":["Qi Liu","Xingyuan Tang","Jianqiang Huang","Xiangqian Yu","Haoran Jin","Jin Chen","Yuanhao Pu","Defu Lian","Tan Qu","Zhe Wang","Jia Cheng","Jun Lei"],"pdf_url":"https://arxiv.org/pdf/2408.16238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21191v2","updated":"2024-08-29T02:27:19Z","published":"2024-07-30T20:58:36Z","title":"GenRec: Generative Sequential Recommendation with Large Language Models","summary":" Sequential recommendation is a task to capture hidden user preferences from\nhistorical user item interaction data and recommend next items for the user.\nSignificant progress has been made in this domain by leveraging classification\nbased learning methods. 
Inspired by the recent paradigm of 'pretrain, prompt\nand predict' in NLP, we consider sequential recommendation as a sequence to\nsequence generation task and propose a novel model named Generative\nRecommendation (GenRec). Unlike classification based models that learn explicit\nuser and item representations, GenRec utilizes the sequence modeling capability\nof Transformer and adopts the masked item prediction objective to effectively\nlearn the hidden bidirectional sequential patterns. Different from existing\ngenerative sequential recommendation models, GenRec does not rely on manually\ndesigned hard prompts. The input to GenRec is textual user item sequence and\nthe output is top ranked next items. Moreover, GenRec is lightweight and\nrequires only a few hours to train effectively in low-resource settings, making\nit highly applicable to real-world scenarios and helping to democratize large\nlanguage models in the sequential recommendation domain. Our extensive\nexperiments have demonstrated that GenRec generalizes on various public\nreal-world datasets and achieves state-of-the-art results. Our experiments also\nvalidate the effectiveness of the the proposed masked item prediction objective\nthat improves the model performance by a large margin.\n","authors":["Panfeng Cao","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2407.21191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15793v2","updated":"2024-08-29T00:32:15Z","published":"2023-07-28T20:25:11Z","title":"Summaries, Highlights, and Action items: Design, implementation and\n evaluation of an LLM-powered meeting recap system","summary":" Meetings play a critical infrastructural role in the coordination of work. In\nrecent years, due to shift to hybrid and remote work, more meetings are moving\nto online Computer Mediated Spaces. This has led to new problems (e.g. more\ntime spent in less engaging meetings) and new opportunities (e.g. automated\ntranscription/captioning and recap support). Recent advances in large language\nmodels (LLMs) for dialog summarization have the potential to improve the\nexperience of meetings by reducing individuals' meeting load and increasing the\nclarity and alignment of meeting outputs. Despite this potential, they face\ntechnological limitation due to long transcripts and inability to capture\ndiverse recap needs based on user's context. To address these gaps, we design,\nimplement and evaluate in-context a meeting recap system. We first\nconceptualize two salient recap representations -- important highlights, and a\nstructured, hierarchical minutes view. We develop a system to operationalize\nthe representations with dialogue summarization as its building blocks.\nFinally, we evaluate the effectiveness of the system with seven users in the\ncontext of their work meetings. Our findings show promise in using LLM-based\ndialogue summarization for meeting recap and the need for both representations\nin different contexts. However, we find that LLM-based recap still lacks an\nunderstanding of whats personally relevant to participants, can miss important\ndetails, and mis-attributions can be detrimental to group dynamics. We identify\ncollaboration opportunities such as a shared recap document that a high quality\nrecap enables. 
We report on implications for designing AI systems to partner\nwith users to learn and improve from natural interactions to overcome the\nlimitations related to personal relevance and summarization quality.\n","authors":["Sumit Asthana","Sagih Hilleli","Pengcheng He","Aaron Halfaker"],"pdf_url":"https://arxiv.org/pdf/2307.15793v2.pdf","comment":"in review for CSCW 24"},{"id":"http://arxiv.org/abs/2404.05893v4","updated":"2024-08-29T21:34:22Z","published":"2024-04-08T22:29:53Z","title":"Use of a Structured Knowledge Base Enhances Metadata Curation by Large\n Language Models","summary":" Metadata play a crucial role in ensuring the findability, accessibility,\ninteroperability, and reusability of datasets. This paper investigates the\npotential of large language models (LLMs), specifically GPT-4, to improve\nadherence to metadata standards. We conducted experiments on 200 random data\nrecords describing human samples relating to lung cancer from the NCBI\nBioSample repository, evaluating GPT-4's ability to suggest edits for adherence\nto metadata standards. We computed the adherence accuracy of field name-field\nvalue pairs through a peer review process, and we observed a marginal average\nimprovement in adherence to the standard data dictionary from 79% to 80%\n(p<0.5). We then prompted GPT-4 with domain information in the form of the\ntextual descriptions of CEDAR templates and recorded a significant improvement\nto 97% from 79% (p<0.01). These results indicate that, while LLMs may not be\nable to correct legacy metadata to ensure satisfactory adherence to standards\nwhen unaided, they do show promise for use in automated metadata curation when\nintegrated with a structured knowledge base\n","authors":["Sowmya S. Sundaram","Benjamin Solomon","Avani Khatri","Anisha Laumas","Purvesh Khatri","Mark A. Musen"],"pdf_url":"https://arxiv.org/pdf/2404.05893v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16885v1","updated":"2024-08-29T20:18:00Z","published":"2024-08-29T20:18:00Z","title":"A Prototype Model of Zero-Trust Architecture Blockchain with\n EigenTrust-Based Practical Byzantine Fault Tolerance Protocol to Manage\n Decentralized Clinical Trials","summary":" The COVID-19 pandemic necessitated the emergence of decentralized Clinical\nTrials (DCTs) due to patient retention, accelerate trials, improve data\naccessibility, enable virtual care, and facilitate seamless communication\nthrough integrated systems. However, integrating systems in DCTs exposes\nclinical data to potential security threats, making them susceptible to theft\nat any stage, a high risk of protocol deviations, and monitoring issues. To\nmitigate these challenges, blockchain technology serves as a secure framework,\nacting as a decentralized ledger, creating an immutable environment by\nestablishing a zero-trust architecture, where data are deemed untrusted until\nverified. In combination with Internet of Things (IoT)-enabled wearable\ndevices, blockchain secures the transfer of clinical trial data on private\nblockchains during DCT automation and operations. This paper proposes a\nprototype model of the Zero-Trust Architecture Blockchain (z-TAB) to integrate\npatient-generated clinical trial data during DCT operation management. 
The\nEigenTrust-based Practical Byzantine Fault Tolerance (T-PBFT) algorithm has\nbeen incorporated as a consensus protocol, leveraging Hyperledger Fabric.\nFurthermore, the Internet of Things (IoT) has been integrated to streamline\ndata processing among stakeholders within the blockchain platforms. Rigorous\nevaluation has been done to evaluate the quality of the system.\n","authors":["Ashok Kumar Peepliwall","Hari Mohan Pandey","Surya Prakash","Anand A Mahajan","Sudhinder Singh Chowhan","Vinesh Kumar","Rahul Sharma"],"pdf_url":"https://arxiv.org/pdf/2408.16885v1.pdf","comment":"NA"},{"id":"http://arxiv.org/abs/2408.16877v1","updated":"2024-08-29T19:58:46Z","published":"2024-08-29T19:58:46Z","title":"Longitudinal Modularity, a Modularity for Link Streams","summary":" Temporal networks are commonly used to model real-life phenomena. When these\nphenomena represent interactions and are captured at a fine-grained temporal\nresolution, they are modeled as link streams. Community detection is an\nessential network analysis task. Although many methods exist for static\nnetworks, and some methods have been developed for temporal networks\nrepresented as sequences of snapshots, few works can handle link streams. This\narticle introduces the first adaptation of the well-known Modularity quality\nfunction to link streams. Unlike existing methods, it is independent of the\ntime scale of analysis. After introducing the quality function, and its\nrelation to existing static and dynamic definitions of Modularity, we show\nexperimentally its relevance for dynamic community evaluation.\n","authors":["Victor Brabant","Yasaman Asgari","Pierre Borgnat","Angela Bonifati","Remy Cazabet"],"pdf_url":"https://arxiv.org/pdf/2408.16877v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.16765v1","updated":"2024-08-29T17:59:07Z","published":"2024-08-29T17:59:07Z","title":"A Score-Based Density Formula, with Applications in Diffusion Generative\n Models","summary":" Score-based generative models (SGMs) have revolutionized the field of\ngenerative modeling, achieving unprecedented success in generating realistic\nand diverse content. Despite empirical advances, the theoretical basis for why\noptimizing the evidence lower bound (ELBO) on the log-likelihood is effective\nfor training diffusion generative models, such as DDPMs, remains largely\nunexplored. In this paper, we address this question by establishing a density\nformula for a continuous-time diffusion process, which can be viewed as the\ncontinuous-time limit of the forward process in an SGM. This formula reveals\nthe connection between the target density and the score function associated\nwith each step of the forward process. Building on this, we demonstrate that\nthe minimizer of the optimization objective for training DDPMs nearly coincides\nwith that of the true objective, providing a theoretical foundation for\noptimizing DDPMs using the ELBO. Furthermore, we offer new insights into the\nrole of score-matching regularization in training GANs, the use of ELBO in\ndiffusion classifiers, and the recently proposed diffusion loss.\n","authors":["Gen Li","Yuling Yan"],"pdf_url":"https://arxiv.org/pdf/2408.16765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05733v2","updated":"2024-08-29T17:58:35Z","published":"2024-05-09T12:50:16Z","title":"Batched Stochastic Bandit for Nondegenerate Functions","summary":" This paper studies batched bandit learning problems for nondegenerate\nfunctions. 
We introduce an algorithm that solves the batched bandit problem for\nnondegenerate functions near-optimally. More specifically, we introduce an\nalgorithm, called Geometric Narrowing (GN), whose regret bound is of order\n$\\widetilde{{\\mathcal{O}}} ( A_{+}^d \\sqrt{T} )$. In addition, GN only needs\n$\\mathcal{O} (\\log \\log T)$ batches to achieve this regret. We also provide\nlower bound analysis for this problem. More specifically, we prove that over\nsome (compact) doubling metric space of doubling dimension $d$: 1. For any\npolicy $\\pi$, there exists a problem instance on which $\\pi$ admits a regret of\norder ${\\Omega} ( A_-^d \\sqrt{T})$; 2. No policy can achieve a regret of order\n$ A_-^d \\sqrt{T} $ over all problem instances, using less than $ \\Omega ( \\log\n\\log T ) $ rounds of communications. Our lower bound analysis shows that the GN\nalgorithm achieves near optimal regret with minimal number of batches.\n","authors":["Yu Liu","Yunlu Shu","Tianyu Wang"],"pdf_url":"https://arxiv.org/pdf/2405.05733v2.pdf","comment":"34 pages, 14 colored figures"},{"id":"http://arxiv.org/abs/2408.16762v1","updated":"2024-08-29T17:57:05Z","published":"2024-08-29T17:57:05Z","title":"UV-free Texture Generation with Denoising and Geodesic Heat Diffusions","summary":" Seams, distortions, wasted UV space, vertex-duplication, and varying\nresolution over the surface are the most prominent issues of the standard\nUV-based texturing of meshes. These issues are particularly acute when\nautomatic UV-unwrapping techniques are used. For this reason, instead of\ngenerating textures in automatically generated UV-planes like most\nstate-of-the-art methods, we propose to represent textures as coloured\npoint-clouds whose colours are generated by a denoising diffusion probabilistic\nmodel constrained to operate on the surface of 3D objects. Our sampling and\nresolution agnostic generative model heavily relies on heat diffusion over the\nsurface of the meshes for spatial communication between points. To enable\nprocessing of arbitrarily sampled point-cloud textures and ensure long-distance\ntexture consistency we introduce a fast re-sampling of the mesh spectral\nproperties used during the heat diffusion and introduce a novel\nheat-diffusion-based self-attention mechanism. Our code and pre-trained models\nare available at github.com/simofoti/UV3-TeD.\n","authors":["Simone Foti","Stefanos Zafeiriou","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2408.16762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10972v2","updated":"2024-08-29T17:55:52Z","published":"2024-07-15T17:59:55Z","title":"VGBench: Evaluating Large Language Models on Vector Graphics\n Understanding and Generation","summary":" In the realm of vision models, the primary mode of representation is using\npixels to rasterize the visual world. Yet this is not always the best or unique\nway to represent visual content, especially for designers and artists who\ndepict the world using geometry primitives such as polygons. Vector graphics\n(VG), on the other hand, offer a textual representation of visual content,\nwhich can be more concise and powerful for content like cartoons, sketches and\nscientific figures. Recent studies have shown promising results on processing\nvector graphics with capable Large Language Models (LLMs). However, such works\nfocus solely on qualitative results, understanding, or a specific type of\nvector graphics. 
We propose VGBench, a comprehensive benchmark for LLMs on\nhandling vector graphics through diverse aspects, including (a) both visual\nunderstanding and generation, (b) evaluation of various vector graphics\nformats, (c) diverse question types, (d) wide range of prompting techniques,\n(e) under multiple LLMs and (f) comparison with VLMs on rasterized\nrepresentations. Evaluating on our collected 4279 understanding and 5845\ngeneration samples, we find that LLMs show strong capability on both aspects\nwhile exhibiting less desirable performance on low-level formats (SVG). Both\ndata and evaluation pipeline will be open-sourced at https://vgbench.github.io.\n","authors":["Bocheng Zou","Mu Cai","Jianrui Zhang","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2407.10972v2.pdf","comment":"Project Page: https://vgbench.github.io"},{"id":"http://arxiv.org/abs/2408.16753v1","updated":"2024-08-29T17:49:18Z","published":"2024-08-29T17:49:18Z","title":"Reinforcement Learning without Human Feedback for Last Mile Fine-Tuning\n of Large Language Models","summary":" Reinforcement learning is used to align language models with human preference\nsignals after first pre-training the model to predict the next token of text\nwithin a large corpus using likelihood maximization. Before being deployed in a\nspecific domain, models are often further fine-tuned on task specific data.\nSince human preferences are often unavailable for the last step, it is\nperformed using likelihood maximization as that is the typical default method.\nHowever, reinforcement learning has other advantages besides facilitating\nalignment to a human derived reward function. For one, whereas likelihood\nmaximization is a form of imitation learning in which the model is trained on\nwhat to do under ideal conditions, reinforcement learning is not limited to\ndemonstrating actions just for optimally reached states and trains a model what\nto do under a range of scenarios as it explores the policy space. In addition,\nit also trains a model what not to do, suppressing competitive but poor\nactions. This work develops a framework for last-mile fine-tuning using\nreinforcement learning and tests whether it garners performance gains. The\nexperiments center on abstractive summarization, but the framework is general\nand broadly applicable. Use of the procedure produced significantly better\nresults than likelihood maximization when comparing raw predictions. For the\nspecific data tested, the gap could be bridged by employing post-processing of\nthe maximum likelihood outputs. Nonetheless, the framework offers a new avenue\nfor model optimization in situations where post-processing may be less\nstraightforward or effective, and it can be extended to include more complex\nclasses of undesirable outputs to penalize and train against, such as\nhallucinations.\n","authors":["Alec Solway"],"pdf_url":"https://arxiv.org/pdf/2408.16753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13154v3","updated":"2024-08-29T17:47:18Z","published":"2024-06-19T02:09:15Z","title":"Conditional score-based diffusion models for solving inverse problems in\n mechanics","summary":" We propose a framework to perform Bayesian inference using conditional\nscore-based diffusion models to solve a class of inverse problems in mechanics\ninvolving the inference of a specimen's spatially varying material properties\nfrom noisy measurements of its mechanical response to loading. 
Conditional\nscore-based diffusion models are generative models that learn to approximate\nthe score function of a conditional distribution using samples from the joint\ndistribution. More specifically, the score functions corresponding to multiple\nrealizations of the measurement are approximated using a single neural network,\nthe so-called score network, which is subsequently used to sample the posterior\ndistribution using an appropriate Markov chain Monte Carlo scheme based on\nLangevin dynamics. Training the score network only requires simulating the\nforward model. Hence, the proposed approach can accommodate black-box forward\nmodels and complex measurement noise. Moreover, once the score network has been\ntrained, it can be re-used to solve the inverse problem for different\nrealizations of the measurements. We demonstrate the efficacy of the proposed\napproach on a suite of high-dimensional inverse problems in mechanics that\ninvolve inferring heterogeneous material properties from noisy measurements.\nSome examples we consider involve synthetic data, while others include data\ncollected from actual elastography experiments. Further, our applications\ndemonstrate that the proposed approach can handle different measurement\nmodalities, complex patterns in the inferred quantities, non-Gaussian and\nnon-additive noise models, and nonlinear black-box forward models. The results\nshow that the proposed framework can solve large-scale physics-based inverse\nproblems efficiently.\n","authors":["Agnimitra Dasgupta","Harisankar Ramaswamy","Javier Murgoitio-Esandi","Ken Foo","Runze Li","Qifa Zhou","Brendan Kennedy","Assad Oberai"],"pdf_url":"https://arxiv.org/pdf/2406.13154v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16751v1","updated":"2024-08-29T17:46:18Z","published":"2024-08-29T17:46:18Z","title":"A Gradient Analysis Framework for Rewarding Good and Penalizing Bad\n Examples in Language Models","summary":" Beyond maximum likelihood estimation (MLE), the standard objective of a\nlanguage model (LM) that optimizes good examples probabilities, many studies\nhave explored ways that also penalize bad examples for enhancing the quality of\noutput distribution, including unlikelihood training, exponential maximizing\naverage treatment effect (ExMATE), and direct preference optimization (DPO). To\nsystematically compare these methods and further provide a unified recipe for\nLM optimization, in this paper, we present a unique angle of gradient analysis\nof loss functions that simultaneously reward good examples and penalize bad\nones in LMs. Through both mathematical results and experiments on\nCausalDialogue and Anthropic HH-RLHF datasets, we identify distinct functional\ncharacteristics among these methods. We find that ExMATE serves as a superior\nsurrogate for MLE, and that combining DPO with ExMATE instead of MLE further\nenhances both the statistical (5-7%) and generative (+18% win rate)\nperformance.\n","authors":["Yi-Lin Tuan","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06599v3","updated":"2024-08-29T17:31:26Z","published":"2023-02-13T18:55:31Z","title":"FilFL: Client Filtering for Optimized Client Participation in Federated\n Learning","summary":" Federated learning, an emerging machine learning paradigm, enables clients to\ncollaboratively train a model without exchanging local data. 
Clients\nparticipating in the training process significantly impact the convergence\nrate, learning efficiency, and model generalization. We propose a novel\napproach, client filtering, to improve model generalization and optimize client\nparticipation and training. The proposed method periodically filters available\nclients to identify a subset that maximizes a combinatorial objective function\nwith an efficient greedy filtering algorithm. Thus, the clients are assessed as\na combination rather than individually. We theoretically analyze the\nconvergence of federated learning with client filtering in heterogeneous\nsettings and evaluate its performance across diverse vision and language tasks,\nincluding realistic scenarios with time-varying client availability. Our\nempirical results demonstrate several benefits of our approach, including\nimproved learning efficiency, faster convergence, and up to 10% higher test\naccuracy than training without client filtering.\n","authors":["Fares Fourati","Salma Kharrat","Vaneet Aggarwal","Mohamed-Slim Alouini","Marco Canini"],"pdf_url":"https://arxiv.org/pdf/2302.06599v3.pdf","comment":"Accepted at ECAI'24"},{"id":"http://arxiv.org/abs/2310.03103v5","updated":"2024-08-29T17:24:20Z","published":"2023-10-04T18:47:34Z","title":"Learning to Prompt Your Domain for Vision-Language Models","summary":" Prompt learning has recently become a very efficient transfer learning\nparadigm for Contrastive Language Image Pretraining (CLIP) models. Compared\nwith fine-tuning the entire encoder, prompt learning can obtain highly\ncompetitive results by optimizing only a small number of parameters, which\npresents considerably exciting benefits for federated learning applications\nthat prioritizes communication efficiency. However, in this work, we identify\nthat directly transferring prompt learning approaches into federated learning\ndoes not yield favorable results since the model often suffers from\nconsiderable domain gaps across different clients. To address this issue, we\npropose ADAPT, a novel domain-aware prompt learning approach that facilitates\nboth intra- and inter-domain prompts across federated participants. The basic\nidea of ADAPT is that the prompted CLIP should detect the input image's domain\ncorrespondence and before making the prediction of its category. Extensive\nexperiments of ADAPT demonstrate its significant efficiency and effectiveness\nin federated learning. For example, by learning and sharing only 0.08M\nparameters, our ADAPT attains a 68.4% average accuracy over six domains in the\nDomainNet dataset, which improves the original CLIP by a large margin of 14.8%.\n","authors":["Guoyizhe Wei","Feng Wang","Anshul Shah","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2310.03103v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09359v4","updated":"2024-08-29T17:21:27Z","published":"2024-04-14T21:14:47Z","title":"Evaluation Framework for Feedback Generation Methods in Skeletal\n Movement Assessment","summary":" The application of machine-learning solutions to movement assessment from\nskeleton videos has attracted significant research attention in recent years.\nThis advancement has made rehabilitation at home more accessible, utilizing\nmovement assessment algorithms that can operate on affordable equipment for\nhuman pose detection and analysis from 2D or 3D videos. 
While the primary\nobjective of automatic assessment tasks is to score movements, the automatic\ngeneration of feedback highlighting key movement issues has the potential to\nsignificantly enhance and accelerate the rehabilitation process. While numerous\nresearch works exist in the field of automatic movement assessment, only a\nhandful address feedback generation. In this study, we propose terminology and\ncriteria for the classification, evaluation, and comparison of feedback\ngeneration solutions. We discuss the challenges associated with each feedback\ngeneration approach and use our proposed criteria to classify existing\nsolutions. To our knowledge, this is the first work that formulates feedback\ngeneration in skeletal movement assessment.\n","authors":["Tal Hakim"],"pdf_url":"https://arxiv.org/pdf/2404.09359v4.pdf","comment":"Accepted to xAI4Biometrics 2024 at ECCV 2024"},{"id":"http://arxiv.org/abs/2303.15477v5","updated":"2024-08-29T17:20:14Z","published":"2023-03-26T18:31:52Z","title":"Adaptive Log-Euclidean Metrics for SPD Matrix Learning","summary":" Symmetric Positive Definite (SPD) matrices have received wide attention in\nmachine learning due to their intrinsic capacity to encode underlying\nstructural correlation in data. Many successful Riemannian metrics have been\nproposed to reflect the non-Euclidean geometry of SPD manifolds. However, most\nexisting metric tensors are fixed, which might lead to sub-optimal performance\nfor SPD matrix learning, especially for deep SPD neural networks. To remedy\nthis limitation, we leverage the commonly encountered pullback techniques and\npropose Adaptive Log-Euclidean Metrics (ALEMs), which extend the widely used\nLog-Euclidean Metric (LEM). Compared with the previous Riemannian metrics, our\nmetrics contain learnable parameters, which can better adapt to the complex\ndynamics of Riemannian neural networks with minor extra computations. We also\npresent a complete theoretical analysis to support our ALEMs, including\nalgebraic and Riemannian properties. The experimental and theoretical results\ndemonstrate the merit of the proposed metrics in improving the performance of\nSPD neural networks. The efficacy of our metrics is further showcased on a set\nof recently developed Riemannian building blocks, including Riemannian batch\nnormalization, Riemannian Residual blocks, and Riemannian classifiers.\n","authors":["Ziheng Chen","Yue Song","Tianyang Xu","Zhiwu Huang","Xiao-Jun Wu","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2303.15477v5.pdf","comment":"Accepted by TIP 2024"},{"id":"http://arxiv.org/abs/2408.16725v1","updated":"2024-08-29T17:18:53Z","published":"2024-08-29T17:18:53Z","title":"Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming","summary":" Recent advances in language models have achieved significant progress.\nGPT-4o, as a new milestone, has enabled real-time conversations with humans,\ndemonstrating near-human natural fluency. Such human-computer interaction\nnecessitates models with the capability to perform reasoning directly with the\naudio modality and generate output in streaming. However, this remains beyond\nthe reach of current academic models, as they typically depend on extra TTS\nsystems for speech synthesis, resulting in undesirable latency. This paper\nintroduces the Mini-Omni, an audio-based end-to-end conversational model,\ncapable of real-time speech interaction. 
To achieve this capability, we propose\na text-instructed speech generation method, along with batch-parallel\nstrategies during inference to further boost the performance. Our method also\nhelps to retain the original model's language capabilities with minimal\ndegradation, enabling other works to establish real-time interaction\ncapabilities. We call this training method \"Any Model Can Talk\". We also\nintroduce the VoiceAssistant-400K dataset to fine-tune models optimized for\nspeech output. To our best knowledge, Mini-Omni is the first fully end-to-end,\nopen-source model for real-time speech interaction, offering valuable potential\nfor future research.\n","authors":["Zhifei Xie","Changqiao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16725v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2408.16717v1","updated":"2024-08-29T17:07:43Z","published":"2024-08-29T17:07:43Z","title":"A GREAT Architecture for Edge-Based Graph Problems Like TSP","summary":" In the last years, many neural network-based approaches have been proposed to\ntackle combinatorial optimization problems such as routing problems. Many of\nthese approaches are based on graph neural networks (GNNs) or related\ntransformers, operating on the Euclidean coordinates representing the routing\nproblems. However, GNNs are inherently not well suited to operate on dense\ngraphs, such as in routing problems. Furthermore, models operating on Euclidean\ncoordinates cannot be applied to non-Euclidean versions of routing problems\nthat are often found in real-world settings. To overcome these limitations, we\npropose a novel GNN-related edge-based neural model called Graph Edge Attention\nNetwork (GREAT). We evaluate the performance of GREAT in the\nedge-classification task to predict optimal edges in the Traveling Salesman\nProblem (TSP). We can use such a trained GREAT model to produce sparse TSP\ngraph instances, keeping only the edges GREAT finds promising. Compared to\nother, non-learning-based methods to sparsify TSP graphs, GREAT can produce\nvery sparse graphs while keeping most of the optimal edges. Furthermore, we\nbuild a reinforcement learning-based GREAT framework which we apply to\nEuclidean and non-Euclidean asymmetric TSP. This framework achieves\nstate-of-the-art results.\n","authors":["Attila Lischka","Jiaming Wu","Morteza Haghir Chehreghani","Balázs Kulcsár"],"pdf_url":"https://arxiv.org/pdf/2408.16717v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.09536v2","updated":"2024-08-29T17:06:52Z","published":"2024-05-15T17:45:59Z","title":"Wasserstein Gradient Boosting: A Framework for Distribution-Valued\n Supervised Learning","summary":" Gradient boosting is a sequential ensemble method that fits a new weaker\nlearner to pseudo residuals at each iteration. We propose Wasserstein gradient\nboosting, a novel extension of gradient boosting that fits a new weak learner\nto alternative pseudo residuals that are Wasserstein gradients of loss\nfunctionals of probability distributions assigned at each input. It solves\ndistribution-valued supervised learning, where the output values of the\ntraining dataset are probability distributions for each input. In\nclassification and regression, a model typically returns, for each input, a\npoint estimate of a parameter of a noise distribution specified for a response\nvariable, such as the class probability parameter of a categorical distribution\nspecified for a response label. 
A main application of Wasserstein gradient\nboosting in this paper is tree-based evidential learning, which returns a\ndistributional estimate of the response parameter for each input. We\nempirically demonstrate the superior performance of the probabilistic\nprediction by Wasserstein gradient boosting in comparison with existing\nuncertainty quantification methods.\n","authors":["Takuo Matsubara"],"pdf_url":"https://arxiv.org/pdf/2405.09536v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16707v1","updated":"2024-08-29T17:00:47Z","published":"2024-08-29T17:00:47Z","title":"Enhanced forecasting of stock prices based on variational mode\n decomposition, PatchTST, and adaptive scale-weighted layer","summary":" The significant fluctuations in stock index prices in recent years highlight\nthe critical need for accurate forecasting to guide investment and financial\nstrategies. This study introduces a novel composite forecasting framework that\nintegrates variational mode decomposition (VMD), PatchTST, and adaptive\nscale-weighted layer (ASWL) to address these challenges. Utilizing datasets of\nfour major stock indices--SP500, DJI, SSEC, and FTSE--from 2000 to 2024, the\nproposed method first decomposes the raw price series into intrinsic mode\nfunctions (IMFs) using VMD. Each IMF is then modeled with PatchTST to capture\ntemporal patterns effectively. The ASWL module is applied to incorporate scale\ninformation, enhancing prediction accuracy. The final forecast is derived by\naggregating predictions from all IMFs. The VMD-PatchTST-ASWL framework\ndemonstrates significant improvements in forecasting accuracy compared to\ntraditional models, showing robust performance across different indices. This\ninnovative approach provides a powerful tool for stock index price forecasting,\nwith potential applications in various financial analysis and investment\ndecision-making contexts.\n","authors":["Xiaorui Xue","Shaofang Li","Xiaonan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05527v3","updated":"2024-08-29T16:48:58Z","published":"2024-03-08T18:48:30Z","title":"GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless\n Generative Inference of LLM","summary":" Key-value (KV) caching has become the de-facto to accelerate generation speed\nfor large language models (LLMs) inference. However, the growing cache demand\nwith increasing sequence length has transformed LLM inference to be a memory\nbound problem, significantly constraining the system throughput. Existing\nmethods rely on dropping unimportant tokens or quantizing all entries\nuniformly. Such methods, however, often incur high approximation errors to\nrepresent the compressed matrices. The autoregressive decoding process further\ncompounds the error of each step, resulting in critical deviation in model\ngeneration and deterioration of performance. To tackle this challenge, we\npropose GEAR, an efficient KV cache compression framework that achieves\nnear-lossless high-ratio compression. GEAR first applies quantization to\nmajority of entries of similar magnitudes to ultra-low precision. It then\nemploys a low rank matrix to approximate the quantization error, and a sparse\nmatrix to remedy individual errors from outlier entries. 
By adeptly integrating\nthree techniques, GEAR is able to fully exploit their synergistic potentials.\nOur experiments demonstrate that compared to alternatives, GEAR achieves\nnear-lossless 4-bit KV cache compression with up to 2.38x throughput\nimprovement, while reducing peak-memory size up to 2.29x. Our code is publicly\navailable at https://github.com/HaoKang-Timmy/GEAR.\n","authors":["Hao Kang","Qingru Zhang","Souvik Kundu","Geonhwa Jeong","Zaoxing Liu","Tushar Krishna","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.05527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16698v1","updated":"2024-08-29T16:47:58Z","published":"2024-08-29T16:47:58Z","title":"SympGNNs: Symplectic Graph Neural Networks for identifiying\n high-dimensional Hamiltonian systems and node classification","summary":" Existing neural network models to learn Hamiltonian systems, such as\nSympNets, although accurate in low-dimensions, struggle to learn the correct\ndynamics for high-dimensional many-body systems. Herein, we introduce\nSymplectic Graph Neural Networks (SympGNNs) that can effectively handle system\nidentification in high-dimensional Hamiltonian systems, as well as node\nclassification. SympGNNs combines symplectic maps with permutation\nequivariance, a property of graph neural networks. Specifically, we propose two\nvariants of SympGNNs: i) G-SympGNN and ii) LA-SympGNN, arising from different\nparameterizations of the kinetic and potential energy. We demonstrate the\ncapabilities of SympGNN on two physical examples: a 40-particle coupled\nHarmonic oscillator, and a 2000-particle molecular dynamics simulation in a\ntwo-dimensional Lennard-Jones potential. Furthermore, we demonstrate the\nperformance of SympGNN in the node classification task, achieving accuracy\ncomparable to the state-of-the-art. We also empirically show that SympGNN can\novercome the oversmoothing and heterophily problems, two key challenges in the\nfield of graph neural networks.\n","authors":["Alan John Varghese","Zhen Zhang","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2408.16698v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.10166v2","updated":"2024-08-29T16:44:17Z","published":"2024-06-14T16:36:35Z","title":"Misam: Using ML in Dataflow Selection of Sparse-Sparse Matrix\n Multiplication","summary":" Sparse matrix-matrix multiplication (SpGEMM) is a critical operation in\nnumerous fields, including scientific computing, graph analytics, and deep\nlearning. These applications exploit the sparsity of matrices to reduce storage\nand computational demands. However, the irregular structure of sparse matrices\nposes significant challenges for performance optimization. Traditional hardware\naccelerators are tailored for specific sparsity patterns with fixed dataflow\nschemes - inner, outer, and row-wise but often perform suboptimally when the\nactual sparsity deviates from these predetermined patterns. As the use of\nSpGEMM expands across various domains, each with distinct sparsity\ncharacteristics, the demand for hardware accelerators that can efficiently\nhandle a range of sparsity patterns is increasing. This paper presents a\nmachine learning based approach for adaptively selecting the most appropriate\ndataflow scheme for SpGEMM tasks with diverse sparsity patterns. By employing\ndecision trees and deep reinforcement learning, we explore the potential of\nthese techniques to surpass heuristic-based methods in identifying optimal\ndataflow schemes. 
We evaluate our models by comparing their performance with\nthat of a heuristic, highlighting the strengths and weaknesses of each\napproach. Our findings suggest that using machine learning for dynamic dataflow\nselection in hardware accelerators can provide upto 28 times gains.\n","authors":["Sanjali Yadav","Bahar Asgari"],"pdf_url":"https://arxiv.org/pdf/2406.10166v2.pdf","comment":"Accepted to ISCA 2024 MLArchSys workshop\n https://openreview.net/forum?id=A1V9FaZRbV"},{"id":"http://arxiv.org/abs/2310.12000v2","updated":"2024-08-29T16:40:44Z","published":"2023-10-18T14:31:16Z","title":"Iterative Methods for Vecchia-Laplace Approximations for Latent Gaussian\n Process Models","summary":" Latent Gaussian process (GP) models are flexible probabilistic non-parametric\nfunction models. Vecchia approximations are accurate approximations for GPs to\novercome computational bottlenecks for large data, and the Laplace\napproximation is a fast method with asymptotic convergence guarantees to\napproximate marginal likelihoods and posterior predictive distributions for\nnon-Gaussian likelihoods. Unfortunately, the computational complexity of\ncombined Vecchia-Laplace approximations grows faster than linearly in the\nsample size when used in combination with direct solver methods such as the\nCholesky decomposition. Computations with Vecchia-Laplace approximations can\nthus become prohibitively slow precisely when the approximations are usually\nthe most accurate, i.e., on large data sets. In this article, we present\niterative methods to overcome this drawback. Among other things, we introduce\nand analyze several preconditioners, derive new convergence results, and\npropose novel methods for accurately approximating predictive variances. We\nanalyze our proposed methods theoretically and in experiments with simulated\nand real-world data. In particular, we obtain a speed-up of an order of\nmagnitude compared to Cholesky-based calculations and a threefold increase in\nprediction accuracy in terms of the continuous ranked probability score\ncompared to a state-of-the-art method on a large satellite data set. All\nmethods are implemented in a free C++ software library with high-level Python\nand R packages.\n","authors":["Pascal Kündig","Fabio Sigrist"],"pdf_url":"https://arxiv.org/pdf/2310.12000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06829v3","updated":"2024-08-29T16:38:24Z","published":"2022-11-13T06:11:38Z","title":"Methods for Recovering Conditional Independence Graphs: A Survey","summary":" Conditional Independence (CI) graphs are a type of probabilistic graphical\nmodels that are primarily used to gain insights about feature relationships.\nEach edge represents the partial correlation between the connected features\nwhich gives information about their direct dependence. In this survey, we list\nout different methods and study the advances in techniques developed to recover\nCI graphs. We cover traditional optimization methods as well as recently\ndeveloped deep learning architectures along with their recommended\nimplementations. 
To facilitate wider adoption, we include preliminaries that\nconsolidate associated operations, for example, techniques to obtain the covariance\nmatrix for mixed datatypes.\n","authors":["Harsh Shrivastava","Urszula Chajewska"],"pdf_url":"https://arxiv.org/pdf/2211.06829v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16686v1","updated":"2024-08-29T16:32:24Z","published":"2024-08-29T16:32:24Z","title":"CW-CNN & CW-AN: Convolutional Networks and Attention Networks for\n CW-Complexes","summary":" We present a novel framework for learning on CW-complex structured data\npoints. Recent advances have discussed CW-complexes as ideal learning\nrepresentations for problems in cheminformatics. However, there is a lack of\navailable machine learning methods suitable for learning on CW-complexes. In\nthis paper, we develop notions of convolution and attention that are well\ndefined for CW-complexes. These notions enable us to create the first neural\nnetwork that can receive a CW-complex as input. We illustrate and interpret\nthis framework in the context of supervised prediction.\n","authors":["Rahul Khorana"],"pdf_url":"https://arxiv.org/pdf/2408.16686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16683v1","updated":"2024-08-29T16:28:43Z","published":"2024-08-29T16:28:43Z","title":"A Catalog of Fairness-Aware Practices in Machine Learning Engineering","summary":" Machine learning's widespread adoption in decision-making processes raises\nconcerns about fairness, particularly regarding the treatment of sensitive\nfeatures and potential discrimination against minorities. The software\nengineering community has responded by developing fairness-oriented metrics,\nempirical studies, and approaches. However, there remains a gap in\nunderstanding and categorizing practices for engineering fairness throughout\nthe machine learning lifecycle. This paper presents a novel catalog of\npractices for addressing fairness in machine learning derived from a systematic\nmapping study. The study identifies and categorizes 28 practices from existing\nliterature, mapping them onto different stages of the machine learning\nlifecycle. From this catalog, the authors extract actionable items and\nimplications for both researchers and practitioners in software engineering.\nThis work aims to provide a comprehensive resource for integrating fairness\nconsiderations into the development and deployment of machine learning systems,\nenhancing their reliability, accountability, and credibility.\n","authors":["Gianmario Voria","Giulia Sellitto","Carmine Ferrara","Francesco Abate","Andrea De Lucia","Filomena Ferrucci","Gemma Catolino","Fabio Palomba"],"pdf_url":"https://arxiv.org/pdf/2408.16683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16673v1","updated":"2024-08-29T16:21:00Z","published":"2024-08-29T16:21:00Z","title":"Entropic Distribution Matching in Supervised Fine-tuning of LLMs: Less\n Overfitting and Better Diversity","summary":" Large language models rely on Supervised Fine-Tuning (SFT) to specialize in\ndownstream tasks. Cross Entropy (CE) loss is the de facto choice in SFT, but it\noften leads to overfitting and limited output diversity due to its aggressive\nupdates to the data distribution. This paper aims to address these issues by\nintroducing the maximum entropy principle, which favors models with flatter\ndistributions that still effectively capture the data. 
Specifically, we develop\na new distribution matching method called GEM, which solves reverse\nKullback-Leibler divergence minimization with an entropy regularizer.\n For the SFT of Llama-3-8B models, GEM outperforms CE in several aspects.\nFirst, when applied to the UltraFeedback dataset to develop general\ninstruction-following abilities, GEM exhibits reduced overfitting, evidenced by\nlower perplexity and better performance on the IFEval benchmark. Furthermore,\nGEM enhances output diversity, leading to performance gains of up to 7 points\non math reasoning and code generation tasks using best-of-n sampling, even\nwithout domain-specific data. Second, when fine-tuning with domain-specific\ndatasets for math reasoning and code generation, GEM also shows less\noverfitting and improvements of up to 10 points compared with CE.\n","authors":["Ziniu Li","Congliang Chen","Tian Xu","Zeyu Qin","Jiancong Xiao","Ruoyu Sun","Zhi-Quan Luo"],"pdf_url":"https://arxiv.org/pdf/2408.16673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16667v1","updated":"2024-08-29T16:15:01Z","published":"2024-08-29T16:15:01Z","title":"Iterative Graph Alignment","summary":" By compressing diverse narratives, LLMs go beyond memorization, achieving\nintelligence by capturing generalizable causal relationships. However, they\nsuffer from local 'representation gaps' due to insufficient training data\ndiversity, limiting their real-world utility, especially in tasks requiring\nstrict alignment to rules. Traditional alignment methods relying on heavy human\nannotations are inefficient and unscalable. Recent self-alignment techniques\nalso fall short, as they often depend on self-selection based prompting and\nmemorization-based learning. To address these issues, we introduce Iterative\nGraph Alignment (IGA), an annotation-free rule-based alignment algorithm. A\nteacher model (VLM) employs Iterative Graph Prompting (IGP) to create logical\ngraphs and reference answers. The student model (LLM) identifies local\nknowledge gaps by attempting to align its responses with these references,\ncollaborating with helper models to generate diverse answers. These aligned\nresponses are then used for iterative supervised fine-tuning (SFT). Our\nevaluations across five rule-based scenarios demonstrate IGP's effectiveness,\nwith a 73.12\\% alignment improvement in Claude Sonnet 3.5, and\nLlama3-8B-Instruct achieving an 86.20\\% improvement, outperforming Claude\nSonnet 3.5 in rule-based alignment.\n","authors":["Fangyuan Yu","Hardeep Singh Arora","Matt Johnson"],"pdf_url":"https://arxiv.org/pdf/2408.16667v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.15096v2","updated":"2024-08-29T15:59:13Z","published":"2024-08-27T14:26:56Z","title":"Post-processing fairness with minimal changes","summary":" In this paper, we introduce a novel post-processing algorithm that is both\nmodel-agnostic and does not require the sensitive attribute at test time. In\naddition, our algorithm is explicitly designed to enforce minimal changes\nbetween biased and debiased predictions; a property that, while highly\ndesirable, is rarely prioritized as an explicit objective in fairness\nliterature. Our approach leverages a multiplicative factor applied to the logit\nvalue of probability scores produced by a black-box classifier. 
We demonstrate\nthe efficacy of our method through empirical evaluations, comparing its\nperformance against other four debiasing algorithms on two widely used datasets\nin fairness research.\n","authors":["Federico Di Gennaro","Thibault Laugel","Vincent Grari","Xavier Renard","Marcin Detyniecki"],"pdf_url":"https://arxiv.org/pdf/2408.15096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04559v2","updated":"2024-08-29T15:58:09Z","published":"2024-07-05T14:48:15Z","title":"Not (yet) the whole story: Evaluating Visual Storytelling Requires More\n than Measuring Coherence, Grounding, and Repetition","summary":" Visual storytelling consists in generating a natural language story given a\ntemporally ordered sequence of images. This task is not only challenging for\nmodels, but also very difficult to evaluate with automatic metrics since there\nis no consensus about what makes a story 'good'. In this paper, we introduce a\nnovel method that measures story quality in terms of human likeness regarding\nthree key aspects highlighted in previous work: visual grounding, coherence,\nand repetitiveness. We then use this method to evaluate the stories generated\nby several models, showing that the foundation model LLaVA obtains the best\nresult, but only slightly so compared to TAPM, a 50-times smaller visual\nstorytelling model. Upgrading the visual and language components of TAPM\nresults in a model that yields competitive performance with a relatively low\nnumber of parameters. Finally, we carry out a human evaluation study, whose\nresults suggest that a 'good' story may require more than a human-like level of\nvisual grounding, coherence, and repetition.\n","authors":["Aditya K Surikuchi","Raquel Fernández","Sandro Pezzelle"],"pdf_url":"https://arxiv.org/pdf/2407.04559v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16653v1","updated":"2024-08-29T15:56:22Z","published":"2024-08-29T15:56:22Z","title":"Optimal Parallelization of Boosting","summary":" Recent works on the parallel complexity of Boosting have established strong\nlower bounds on the tradeoff between the number of training rounds $p$ and the\ntotal parallel work per round $t$. These works have also presented highly\nnon-trivial parallel algorithms that shed light on different regions of this\ntradeoff. Despite these advancements, a significant gap persists between the\ntheoretical lower bounds and the performance of these algorithms across much of\nthe tradeoff space. In this work, we essentially close this gap by providing\nboth improved lower bounds on the parallel complexity of weak-to-strong\nlearners, and a parallel Boosting algorithm whose performance matches these\nbounds across the entire $p$ vs.~$t$ compromise spectrum, up to logarithmic\nfactors. Ultimately, this work settles the true parallel complexity of Boosting\nalgorithms that are nearly sample-optimal.\n","authors":["Arthur da Cunha","Mikael Møller Høgsgaard","Kasper Green Larsen"],"pdf_url":"https://arxiv.org/pdf/2408.16653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16650v1","updated":"2024-08-29T15:55:27Z","published":"2024-08-29T15:55:27Z","title":"Towards Efficient Modelling of String Dynamics: A Comparison of State\n Space and Koopman based Deep Learning Methods","summary":" This paper presents an examination of State Space Models (SSM) and\nKoopman-based deep learning methods for modelling the dynamics of both linear\nand non-linear stiff strings. 
Through experiments with datasets generated under\ndifferent initial conditions and sample rates, we assess the capacity of these\nmodels to accurately model the complex behaviours observed in string dynamics.\nOur findings indicate that our proposed Koopman-based model performs as well as\nor better than other existing approaches in non-linear cases for long-sequence\nmodelling.\n We inform the design of these architectures with the structure of the\nproblems at hand. Although challenges remain in extending model predictions\nbeyond the training horizon (i.e., extrapolation), the focus of our\ninvestigation lies in the models' ability to generalise across different\ninitial conditions within the training time interval. This research contributes\ninsights into the physical modelling of dynamical systems (in particular those\naddressing musical acoustics) by offering a comparative overview of these and\nprevious methods and introducing innovative strategies for model improvement.\nOur results highlight the efficacy of these models in simulating non-linear\ndynamics and emphasise their wide-ranging applicability in accurately modelling\ndynamical systems over extended sequences.\n","authors":["Rodrigo Diaz","Carlos De La Vega Martin","Mark Sandler"],"pdf_url":"https://arxiv.org/pdf/2408.16650v1.pdf","comment":"Accepted to DAFx2024"},{"id":"http://arxiv.org/abs/2405.00846v3","updated":"2024-08-29T15:53:29Z","published":"2024-05-01T20:21:44Z","title":"Gameplay Filters: Robust Zero-Shot Safety through Adversarial\n Imagination","summary":" Despite the impressive recent advances in learning-based robot control,\nensuring robustness to out-of-distribution conditions remains an open\nchallenge. Safety filters can, in principle, keep arbitrary control policies\nfrom incurring catastrophic failures by overriding unsafe actions, but existing\nsolutions for complex (e.g., legged) robot dynamics do not span the full motion\nenvelope and instead rely on local, reduced-order models. These filters tend to\noverly restrict agility and can still fail when perturbed away from nominal\nconditions. This paper presents the gameplay filter, a new class of predictive\nsafety filter that continually plays out hypothetical matches between its\nsimulation-trained safety strategy and a virtual adversary co-trained to invoke\nworst-case events and sim-to-real error, and precludes actions that would cause\nit to fail down the line. We demonstrate the scalability and robustness of the\napproach with a first-of-its-kind full-order safety filter for (36-D)\nquadrupedal dynamics. Physical experiments on two different quadruped platforms\ndemonstrate the superior zero-shot effectiveness of the gameplay filter under\nlarge perturbations such as tugging and unmodeled terrain.\n","authors":["Duy P. Nguyen","Kai-Chieh Hsu","Wenhao Yu","Jie Tan","Jaime F. Fisac"],"pdf_url":"https://arxiv.org/pdf/2405.00846v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16638v1","updated":"2024-08-29T15:42:06Z","published":"2024-08-29T15:42:06Z","title":"3D Pose-Based Temporal Action Segmentation for Figure Skating: A\n Fine-Grained and Jump Procedure-Aware Annotation Approach","summary":" Understanding human actions from videos is essential in many domains,\nincluding sports. In figure skating, technical judgments are performed by\nwatching skaters' 3D movements, and its part of the judging procedure can be\nregarded as a Temporal Action Segmentation (TAS) task. 
TAS tasks in figure\nskating that automatically assign temporal semantics to video are actively\nresearched. However, there is a lack of datasets and effective methods for TAS\ntasks requiring 3D pose data. In this study, we first created the FS-Jump3D\ndataset of complex and dynamic figure skating jumps using optical markerless\nmotion capture. We also propose a new fine-grained figure skating jump TAS\ndataset annotation method with which TAS models can learn jump procedures. In\nthe experimental results, we validated the usefulness of 3D pose features as\ninput and the fine-grained dataset for the TAS model in figure skating.\nFS-Jump3D Dataset is available at https://github.com/ryota-skating/FS-Jump3D.\n","authors":["Ryota Tanaka","Tomohiro Suzuki","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2408.16638v1.pdf","comment":"10 pages, 7th ACM International Workshop on Multimedia Content\n Analysis in Sports"},{"id":"http://arxiv.org/abs/2405.14469v2","updated":"2024-08-29T15:41:52Z","published":"2024-05-23T11:56:05Z","title":"Generalization of Hamiltonian algorithms","summary":" The paper proves generalization results for a class of stochastic learning\nalgorithms. The method applies whenever the algorithm generates an absolutely\ncontinuous distribution relative to some a-priori measure and the Radon Nikodym\nderivative has subgaussian concentration. Applications are bounds for the Gibbs\nalgorithm and randomizations of stable deterministic algorithms as well as\nPAC-Bayesian bounds with data-dependent priors.\n","authors":["Andreas Maurer"],"pdf_url":"https://arxiv.org/pdf/2405.14469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20743v2","updated":"2024-08-29T15:31:58Z","published":"2024-05-31T10:13:17Z","title":"Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent\n Codes","summary":" Trajectory forecasting is crucial for video surveillance analytics, as it\nenables the anticipation of future movements for a set of agents, e.g.\nbasketball players engaged in intricate interactions with long-term intentions.\nDeep generative models offer a natural learning approach for trajectory\nforecasting, yet they encounter difficulties in achieving an optimal balance\nbetween sampling fidelity and diversity. We address this challenge by\nleveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a\ndiscrete latent space to tackle the issue of posterior collapse. Specifically,\nwe introduce an instance-based codebook that allows tailored latent\nrepresentations for each example. In a nutshell, the rows of the codebook are\ndynamically adjusted to reflect contextual information (i.e., past motion\npatterns extracted from the observed trajectories). In this way, the\ndiscretization process gains flexibility, leading to improved reconstructions.\nNotably, instance-level dynamics are injected into the codebook through\nlow-rank updates, which restrict the customization of the codebook to a lower\ndimension space. The resulting discrete space serves as the basis of the\nsubsequent step, which regards the training of a diffusion-based predictive\nmodel. 
We show that such a two-fold framework, augmented with instance-level\ndiscretization, leads to accurate and diverse forecasts, yielding\nstate-of-the-art performance on three established benchmarks.\n","authors":["Riccardo Benaglia","Angelo Porrello","Pietro Buzzega","Simone Calderara","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2405.20743v2.pdf","comment":"15 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.16623v1","updated":"2024-08-29T15:31:51Z","published":"2024-08-29T15:31:51Z","title":"Turbulence Strength $C_n^2$ Estimation from Video using Physics-based\n Deep Learning","summary":" Images captured from a long distance suffer from dynamic image distortion due\nto turbulent flow of air cells with random temperatures, and thus refractive\nindices. This phenomenon, known as image dancing, is commonly characterized by\nits refractive-index structure constant $C_n^2$ as a measure of the turbulence\nstrength. For many applications such as atmospheric forecast model,\nlong-range/astronomy imaging, and aviation safety, optical communication\ntechnology, $C_n^2$ estimation is critical for accurately sensing the turbulent\nenvironment. Previous methods for $C_n^2$ estimation include estimation from\nmeteorological data (temperature, relative humidity, wind shear, etc.) for\nsingle-point measurements, two-ended pathlength measurements from optical\nscintillometer for path-averaged $C_n^2$, and more recently estimating $C_n^2$\nfrom passive video cameras for low cost and hardware complexity. In this paper,\nwe present a comparative analysis of classical image gradient methods for\n$C_n^2$ estimation and modern deep learning-based methods leveraging\nconvolutional neural networks. To enable this, we collect a dataset of video\ncapture along with reference scintillometer measurements for ground truth, and\nwe release this unique dataset to the scientific community. We observe that\ndeep learning methods can achieve higher accuracy when trained on similar data,\nbut suffer from generalization errors to other, unseen imagery as compared to\nclassical methods. To overcome this trade-off, we present a novel physics-based\nnetwork architecture that combines learned convolutional layers with a\ndifferentiable image gradient method that maintains high accuracy while being\ngeneralizable across image datasets.\n","authors":["Ripon Kumar Saha","Esen Salcin","Jihoo Kim","Joseph Smith","Suren Jayasuriya"],"pdf_url":"https://arxiv.org/pdf/2408.16623v1.pdf","comment":"Code Available: https://github.com/Riponcs/Cn2Estimation"},{"id":"http://arxiv.org/abs/2408.13140v2","updated":"2024-08-29T15:31:35Z","published":"2024-08-23T15:02:09Z","title":"Verification of Geometric Robustness of Neural Networks via Piecewise\n Linear Approximation and Lipschitz Optimisation","summary":" We address the problem of verifying neural networks against geometric\ntransformations of the input image, including rotation, scaling, shearing, and\ntranslation. The proposed method computes provably sound piecewise linear\nconstraints for the pixel values by using sampling and linear approximations in\ncombination with branch-and-bound Lipschitz optimisation. The method obtains\nprovably tighter over-approximations of the perturbation region than the\npresent state-of-the-art. We report results from experiments on a comprehensive\nset of verification benchmarks on MNIST and CIFAR10. 
We show that our proposed\nimplementation resolves up to 32% more verification cases than present\napproaches.\n","authors":["Ben Batten","Yang Zheng","Alessandro De Palma","Panagiotis Kouvaros","Alessio Lomuscio"],"pdf_url":"https://arxiv.org/pdf/2408.13140v2.pdf","comment":"ECAI 2024"},{"id":"http://arxiv.org/abs/2408.16621v1","updated":"2024-08-29T15:28:42Z","published":"2024-08-29T15:28:42Z","title":"Towards Infusing Auxiliary Knowledge for Distracted Driver Detection","summary":" Distracted driving is a leading cause of road accidents globally.\nIdentification of distracted driving involves reliably detecting and\nclassifying various forms of driver distraction (e.g., texting, eating, or\nusing in-car devices) from in-vehicle camera feeds to enhance road safety. This\ntask is challenging due to the need for robust models that can generalize to a\ndiverse set of driver behaviors without requiring extensive annotated datasets.\nIn this paper, we propose KiD3, a novel method for distracted driver detection\n(DDD) by infusing auxiliary knowledge about semantic relations between entities\nin a scene and the structural configuration of the driver's pose. Specifically,\nwe construct a unified framework that integrates the scene graphs, and driver\npose information with the visual cues in video frames to create a holistic\nrepresentation of the driver's actions.Our results indicate that KiD3 achieves\na 13.64% accuracy improvement over the vision-only baseline by incorporating\nsuch auxiliary knowledge with visual information.\n","authors":["Ishwar B Balappanawar","Ashmit Chamoli","Ruwan Wickramarachchi","Aditya Mishra","Ponnurangam Kumaraguru","Amit P. Sheth"],"pdf_url":"https://arxiv.org/pdf/2408.16621v1.pdf","comment":"Accepted at KiL 2024: Workshop on Knowledge-infused Learning\n co-located with 30th ACM KDD Conference"},{"id":"http://arxiv.org/abs/2408.16620v1","updated":"2024-08-29T15:28:01Z","published":"2024-08-29T15:28:01Z","title":"Hyperdimensional Vector Tsetlin Machines with Applications to Sequence\n Learning and Generation","summary":" We construct a two-layered model for learning and generating sequential data\nthat is both computationally fast and competitive with vanilla Tsetlin\nmachines, adding numerous advantages. Through the use of hyperdimensional\nvector computing (HVC) algebras and Tsetlin machine clause structures, we\ndemonstrate that the combination of both inherits the generality of data\nencoding and decoding of HVC with the fast interpretable nature of Tsetlin\nmachines to yield a powerful machine learning model. We apply the approach in\ntwo areas, namely in forecasting, generating new sequences, and classification.\nFor the latter, we derive results for the entire UCR Time Series Archive and\ncompare with the standard benchmarks to see how well the method competes in\ntime series classification.\n","authors":["Christian D. Blakely"],"pdf_url":"https://arxiv.org/pdf/2408.16620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16613v1","updated":"2024-08-29T15:20:17Z","published":"2024-08-29T15:20:17Z","title":"Blending Low and High-Level Semantics of Time Series for Better Masked\n Time Series Generation","summary":" State-of-the-art approaches in time series generation (TSG), such as\nTimeVQVAE, utilize vector quantization-based tokenization to effectively model\ncomplex distributions of time series. These approaches first learn to transform\ntime series into a sequence of discrete latent vectors, and then a prior model\nis learned to model the sequence. 
The discrete latent vectors, however, only\ncapture low-level semantics (\\textit{e.g.,} shapes). We hypothesize that\nhigher-fidelity time series can be generated by training a prior model on more\ninformative discrete latent vectors that contain both low and high-level\nsemantics (\\textit{e.g.,} characteristic dynamics). In this paper, we introduce\na novel framework, termed NC-VQVAE, to integrate self-supervised learning into\nthose TSG methods to derive a discrete latent space where low and high-level\nsemantics are captured. Our experimental results demonstrate that NC-VQVAE\nresults in a considerable improvement in the quality of synthetic samples.\n","authors":["Johan Vik Mathisen","Erlend Lokna","Daesoo Lee","Erlend Aune"],"pdf_url":"https://arxiv.org/pdf/2408.16613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16612v1","updated":"2024-08-29T15:19:06Z","published":"2024-08-29T15:19:06Z","title":"Data Quality Monitoring through Transfer Learning on Anomaly Detection\n for the Hadron Calorimeters","summary":" The proliferation of sensors brings an immense volume of spatio-temporal (ST)\ndata in many domains for various purposes, including monitoring, diagnostics,\nand prognostics applications. Data curation is a time-consuming process for a\nlarge volume of data, making it challenging and expensive to deploy data\nanalytics platforms in new environments. Transfer learning (TL) mechanisms\npromise to mitigate data sparsity and model complexity by utilizing pre-trained\nmodels for a new task. Despite the triumph of TL in fields like computer vision\nand natural language processing, efforts on complex ST models for anomaly\ndetection (AD) applications are limited. In this study, we present the\npotential of TL within the context of AD for the Hadron Calorimeter of the\nCompact Muon Solenoid experiment at CERN. We have transferred the ST AD models\ntrained on data collected from one part of a calorimeter to another. We have\ninvestigated different configurations of TL on semi-supervised autoencoders of\nthe ST AD models -- transferring convolutional, graph, and recurrent neural\nnetworks of both the encoder and decoder networks. The experiment results\ndemonstrate that TL effectively enhances the model learning accuracy on a\ntarget subdetector. The TL achieves promising data reconstruction and AD\nperformance while substantially reducing the trainable parameters of the AD\nmodels. It also improves robustness against anomaly contamination in the\ntraining data sets of the semi-supervised AD models.\n","authors":["Mulugeta Weldezgina Asres","Christian Walter Omlin","Long Wang","Pavel Parygin","David Yu","Jay Dittmann","The CMS-HCAL Collaboration"],"pdf_url":"https://arxiv.org/pdf/2408.16612v1.pdf","comment":"28 pages, 15 figures, and 9 tables"},{"id":"http://arxiv.org/abs/2408.16605v1","updated":"2024-08-29T15:14:52Z","published":"2024-08-29T15:14:52Z","title":"Subspace Representation Learning for Sparse Linear Arrays to Localize\n More Sources than Sensors: A Deep Learning Methodology","summary":" Localizing more sources than sensors with a sparse linear array (SLA) has\nlong relied on minimizing a distance between two covariance matrices and recent\nalgorithms often utilize semidefinite programming (SDP). Although deep neural\nnetwork (DNN)-based methods offer new alternatives, they still depend on\ncovariance matrix fitting. In this paper, we develop a novel methodology that\nestimates the co-array subspaces from a sample covariance for SLAs. 
Our\nmethodology trains a DNN to learn signal and noise subspace representations\nthat are invariant to the selection of bases. To learn such representations, we\npropose loss functions that gauge the separation between the desired and the\nestimated subspace. In particular, we propose losses that measure the length of\nthe shortest path between subspaces viewed on a union of Grassmannians, and\nprove that it is possible for a DNN to approximate signal subspaces. The\ncomputation of learning subspaces of different dimensions is accelerated by a\nnew batch sampling strategy called consistent rank sampling. The methodology is\nrobust to array imperfections due to its geometry-agnostic and data-driven\nnature. In addition, we propose a fully end-to-end gridless approach that\ndirectly learns angles to study the possibility of bypassing subspace methods.\nNumerical results show that learning such subspace representations is more\nbeneficial than learning covariances or angles. It outperforms conventional\nSDP-based methods such as the sparse and parametric approach (SPA) and existing\nDNN-based covariance reconstruction methods for a wide range of signal-to-noise\nratios (SNRs), snapshots, and source numbers for both perfect and imperfect\narrays.\n","authors":["Kuan-Lin Chen","Bhaskar D. Rao"],"pdf_url":"https://arxiv.org/pdf/2408.16605v1.pdf","comment":"13 pages. Submitted to the IEEE Transactions on Signal Processing"},{"id":"http://arxiv.org/abs/2401.12972v3","updated":"2024-08-29T15:11:29Z","published":"2024-01-23T18:58:35Z","title":"On the Efficacy of Text-Based Input Modalities for Action Anticipation","summary":" Anticipating future actions is a highly challenging task due to the diversity\nand scale of potential future actions; yet, information from different\nmodalities help narrow down plausible action choices. Each modality can provide\ndiverse and often complementary context for the model to learn from. While\nprevious multi-modal methods leverage information from modalities such as video\nand audio, we primarily explore how text descriptions of actions and objects\ncan also lead to more accurate action anticipation by providing additional\ncontextual cues, e.g., about the environment and its contents. We propose a\nMulti-modal Contrastive Anticipative Transformer (M-CAT), a video transformer\narchitecture that jointly learns from multi-modal features and text\ndescriptions of actions and objects. We train our model in two stages, where\nthe model first learns to align video clips with descriptions of future\nactions, and is subsequently fine-tuned to predict future actions. Compared to\nexisting methods, M-CAT has the advantage of learning additional context from\ntwo types of text inputs: rich descriptions of future actions during\npre-training, and, text descriptions for detected objects and actions during\nmodality feature fusion. Through extensive experimental evaluation, we\ndemonstrate that our model outperforms previous methods on the EpicKitchens\ndatasets, and show that using simple text descriptions of actions and objects\naid in more effective action anticipation. 
In addition, we examine the impact\nof object and action information obtained via text, and perform extensive\nablations.\n","authors":["Apoorva Beedu","Harish Haresamudram","Karan Samel","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2401.12972v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16599v1","updated":"2024-08-29T15:09:04Z","published":"2024-08-29T15:09:04Z","title":"sEMG-Driven Physics-Informed Gated Recurrent Networks for Modeling Upper\n Limb Multi-Joint Movement Dynamics","summary":" Exoskeletons and rehabilitation systems offer great potential for enhancing\nhuman strength and recovery through advanced human-machine interfaces (HMIs)\nthat adapt to movement dynamics. However, the real-time application of\nphysics-informed neural networks (PINNs) is limited by their reliance on fixed\ninput lengths and surrogate models. This study introduces a novel\nphysics-informed Gated Recurrent Network (PiGRN) designed to predict\nmulti-joint torques using surface electromyography (sEMG) data. The PiGRN model\nemploys a Gated Recurrent Unit (GRU) to convert time-series sEMG inputs into\nmulti-joint kinematics and external loads, which are then integrated into an\nequation of motion to ensure consistency with physical laws. Experimental\nvalidation with sEMG data from five participants performing elbow\nflexion-extension tasks showed that the PiGRN model accurately predicted joint\ntorques for 10 unfamiliar movements, with RMSE values between 4.02\\% and\n11.40\\% and correlation coefficients ranging from 0.87 to 0.98. These findings\nhighlight the PiGRN's potential for real-time exoskeleton and rehabilitation\napplications. Future research will explore more diverse datasets, improve\nmusculoskeletal models, and investigate unsupervised learning methods.\n","authors":["Rajnish Kumar","Anand Gupta","Suriya Prakash Muthukrishnan","Lalan Kumar","Sitikantha Roy"],"pdf_url":"https://arxiv.org/pdf/2408.16599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16592v1","updated":"2024-08-29T14:55:33Z","published":"2024-08-29T14:55:33Z","title":"High-Dimensional Sparse Data Low-rank Representation via Accelerated\n Asynchronous Parallel Stochastic Gradient Descent","summary":" Data characterized by high dimensionality and sparsity are commonly used to\ndescribe real-world node interactions. Low-rank representation (LR) can map\nhigh-dimensional sparse (HDS) data to low-dimensional feature spaces and infer\nnode interactions via modeling data latent associations. Unfortunately,\nexisting optimization algorithms for LR models are computationally inefficient\nand slowly convergent on large-scale datasets. To address this issue, this\npaper proposes an Accelerated Asynchronous Parallel Stochastic Gradient Descent\nA2PSGD for High-Dimensional Sparse Data Low-rank Representation with three\nfold-ideas: a) establishing a lock-free scheduler to simultaneously respond to\nscheduling requests from multiple threads; b) introducing a greedy\nalgorithm-based load balancing strategy for balancing the computational load\namong threads; c) incorporating Nesterov's accelerated gradient into the\nlearning scheme to accelerate model convergence. 
Empirical studies show that\nA2PSGD outperforms existing optimization algorithms for HDS data LR in both\naccuracy and training time.\n","authors":["Qicong Hu","Hao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16589v1","updated":"2024-08-29T14:52:42Z","published":"2024-08-29T14:52:42Z","title":"CrisperWhisper: Accurate Timestamps on Verbatim Speech Transcriptions","summary":" We demonstrate that carefully adjusting the tokenizer of the Whisper speech\nrecognition model significantly improves the precision of word-level timestamps\nwhen applying dynamic time warping to the decoder's cross-attention scores. We\nfine-tune the model to produce more verbatim speech transcriptions and employ\nseveral techniques to increase robustness against multiple speakers and\nbackground noise. These adjustments achieve state-of-the-art performance on\nbenchmarks for verbatim speech transcription, word segmentation, and the timed\ndetection of filler events, and can further mitigate transcription\nhallucinations. The code is openly available at\nhttps://github.com/nyrahealth/CrisperWhisper.\n","authors":["Laurin Wagner","Bernhard Thallinger","Mario Zusag"],"pdf_url":"https://arxiv.org/pdf/2408.16589v1.pdf","comment":"Published at INTERSPEECH2024"},{"id":"http://arxiv.org/abs/2308.11375v2","updated":"2024-08-29T14:45:26Z","published":"2023-08-22T12:01:49Z","title":"Standardized Interpretable Fairness Measures for Continuous Risk Scores","summary":" We propose a standardized version of fairness measures for continuous scores\nwith a reasonable interpretation based on the Wasserstein distance. Our\nmeasures are easily computable and well suited for quantifying and interpreting\nthe strength of group disparities as well as for comparing biases across\ndifferent models, datasets, or time points. We derive a link between the\ndifferent families of existing fairness measures for scores and show that the\nproposed standardized fairness measures outperform ROC-based fairness measures\nbecause they are more explicit and can quantify significant biases that\nROC-based fairness measures miss.\n","authors":["Ann-Kristin Becker","Oana Dumitrasc","Klaus Broelemann"],"pdf_url":"https://arxiv.org/pdf/2308.11375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16578v1","updated":"2024-08-29T14:44:12Z","published":"2024-08-29T14:44:12Z","title":"Transformers Meet ACT-R: Repeat-Aware and Sequential Listening Session\n Recommendation","summary":" Music streaming services often leverage sequential recommender systems to\npredict the best music to showcase to users based on past sequences of\nlistening sessions. Nonetheless, most sequential recommendation methods ignore\nor insufficiently account for repetitive behaviors. This is a crucial\nlimitation for music recommendation, as repeatedly listening to the same song\nover time is a common phenomenon that can even change the way users perceive\nthis song. In this paper, we introduce PISA (Psychology-Informed Session\nembedding using ACT-R), a session-level sequential recommender system that\novercomes this limitation. 
PISA employs a Transformer architecture learning\nembedding representations of listening sessions and users using attention\nmechanisms inspired by Anderson's ACT-R (Adaptive Control of Thought-Rational),\na cognitive architecture modeling human information access and memory dynamics.\nThis approach enables us to capture dynamic and repetitive patterns from user\nbehaviors, allowing us to effectively predict the songs they will listen to in\nsubsequent sessions, whether they are repeated or new ones. We demonstrate the\nempirical relevance of PISA using both publicly available listening data from\nLast.fm and proprietary data from Deezer, a global music streaming service,\nconfirming the critical importance of repetition modeling for sequential\nlistening session recommendation. Along with this paper, we publicly release\nour proprietary dataset to foster future research in this field, as well as the\nsource code of PISA to facilitate its future use.\n","authors":["Viet-Anh Tran","Guillaume Salha-Galvan","Bruno Sguerra","Romain Hennequin"],"pdf_url":"https://arxiv.org/pdf/2408.16578v1.pdf","comment":"11 pages. Accepted by RecSys'2024, full paper"},{"id":"http://arxiv.org/abs/2408.16577v1","updated":"2024-08-29T14:43:42Z","published":"2024-08-29T14:43:42Z","title":"Seeking the Sufficiency and Necessity Causal Features in Multimodal\n Representation Learning","summary":" Learning representations with a high Probability of Necessary and Sufficient\nCauses (PNS) has been shown to enhance deep learning models' ability. This task\ninvolves identifying causal features that are both sufficient (guaranteeing the\noutcome) and necessary (without which the outcome cannot occur). However,\ncurrent research predominantly focuses on unimodal data, and extending PNS\nlearning to multimodal settings presents significant challenges. The challenges\narise as the conditions for PNS identifiability, Exogeneity and Monotonicity,\nneed to be reconsidered in a multimodal context, where sufficient and necessary\ncausal features are distributed across different modalities. To address this,\nwe first propose conceptualizing multimodal representations as comprising\nmodality-invariant and modality-specific components. We then analyze PNS\nidentifiability for each component, while ensuring non-trivial PNS estimation.\nFinally, we formulate tractable optimization objectives that enable multimodal\nmodels to learn high-PNS representations, thereby enhancing their predictive\nperformance. Experiments demonstrate the effectiveness of our method on both\nsynthetic and real-world data.\n","authors":["Boyu Chen","Junjie Liu","Zhu Li","Mengyue yang"],"pdf_url":"https://arxiv.org/pdf/2408.16577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16573v1","updated":"2024-08-29T14:40:32Z","published":"2024-08-29T14:40:32Z","title":"An Adaptive Latent Factorization of Tensors Model for Embedding Dynamic\n Communication Network","summary":" The Dynamic Communication Network (DCN) describes the interactions over time\namong various communication nodes, and it is widely used in Big-data\napplications as a data source. As the number of communication nodes increases\nand temporal slots accumulate, each node interacts with only a few nodes in\na given temporal slot, so the DCN can be represented by a High-Dimensional Sparse\n(HDS) tensor. In order to extract rich behavioral patterns from an HDS tensor\nin DCN, this paper proposes an Adaptive Temporal-dependent Tensor low-rank\nrepresentation (ATT) model. 
It adopts a three-fold approach: a) designing a\ntemporal-dependent method to reconstruct the temporal feature matrix, thereby\nprecisely representing the data by capturing the temporal patterns; b) achieving\nhyper-parameter adaptation of the model via Differential Evolutionary\nAlgorithms (DEA) to avoid tedious hyper-parameter tuning; c) employing\nnonnegative learning schemes for the model parameters to effectively handle\nthe nonnegativity inherent in HDS data. The experimental results on four\nreal-world DCNs demonstrate that the proposed ATT model significantly\noutperforms several state-of-the-art models in both prediction errors and\nconvergence rounds.\n","authors":["Xin Liao","Qicong Hu","Peng Tang"],"pdf_url":"https://arxiv.org/pdf/2408.16573v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2210.05506v2","updated":"2024-08-29T14:36:20Z","published":"2022-10-11T14:58:58Z","title":"Follow-up Attention: An Empirical Study of Developer and Neural Model\n Code Exploration","summary":" Recent neural models of code, such as OpenAI Codex and AlphaCode, have\ndemonstrated remarkable proficiency at code generation due to the underlying\nattention mechanism. However, it often remains unclear how the models actually\nprocess code, and to what extent their reasoning and the way their attention\nmechanism scans the code matches the patterns of developers. A poor\nunderstanding of the model reasoning process limits the way in which current\nneural models are leveraged today, so far mostly for their raw prediction. To\nfill this gap, this work studies how the processed attention signal of three\nopen large language models - CodeGen, InCoder and GPT-J - agrees with how\ndevelopers look at and explore code when each answers the same sensemaking\nquestions about code. Furthermore, we contribute an open-source eye-tracking\ndataset comprising 92 manually-labeled sessions from 25 developers engaged in\nsensemaking tasks. We empirically evaluate five heuristics that do not use the\nattention and ten attention-based post-processing approaches of the attention\nsignal of CodeGen against our ground truth of developers exploring code,\nincluding the novel concept of follow-up attention which exhibits the highest\nagreement between model and human attention. Our follow-up attention method can\npredict the next line a developer will look at with 47% accuracy. This\noutperforms the baseline prediction accuracy of 42.3%, which uses the session\nhistory of other developers to recommend the next line. These results\ndemonstrate the potential of leveraging the attention signal of pre-trained\nmodels for effective code exploration.\n","authors":["Matteo Paltenghi","Rahul Pandita","Austin Z. Henley","Albert Ziegler"],"pdf_url":"https://arxiv.org/pdf/2210.05506v2.pdf","comment":"Published at IEEE Transactions on Software Engineering"},{"id":"http://arxiv.org/abs/2408.16567v1","updated":"2024-08-29T14:35:14Z","published":"2024-08-29T14:35:14Z","title":"Identifying Terrain Physical Parameters from Vision -- Towards\n Physical-Parameter-Aware Locomotion and Navigation","summary":" Identifying the physical properties of the surrounding environment is\nessential for robotic locomotion and navigation to deal with non-geometric\nhazards, such as slippery and deformable terrains. It would be of great benefit\nfor robots to anticipate these extreme physical properties before contact;\nhowever, estimating environmental physical parameters from vision is still an\nopen challenge. 
Animals can achieve this by using their prior experience and\nknowledge of what they have seen and how it felt. In this work, we propose a\ncross-modal self-supervised learning framework for vision-based environmental\nphysical parameter estimation, which paves the way for future\nphysical-property-aware locomotion and navigation. We bridge the gap between\nexisting policies trained in simulation and identification of physical terrain\nparameters from vision. We propose to train a physical decoder in simulation to\npredict friction and stiffness from multi-modal input. The trained network\nallows the labeling of real-world images with physical parameters in a\nself-supervised manner to further train a visual network during deployment,\nwhich can densely predict the friction and stiffness from image data. We\nvalidate our physical decoder in simulation and the real world using a\nquadruped ANYmal robot, outperforming an existing baseline method. We show that\nour visual network can predict the physical properties in indoor and outdoor\nexperiments while allowing fast adaptation to new environments.\n","authors":["Jiaqi Chen","Jonas Frey","Ruyi Zhou","Takahiro Miki","Georg Martius","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2408.16567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11962v3","updated":"2024-08-29T14:31:58Z","published":"2023-02-23T12:18:28Z","title":"Unified Convergence Theory of Stochastic and Variance-Reduced Cubic\n Newton Methods","summary":" We study stochastic Cubic Newton methods for solving general possibly\nnon-convex minimization problems. We propose a new framework, which we call the\nhelper framework, that provides a unified view of the stochastic and\nvariance-reduced second-order algorithms equipped with global complexity\nguarantees. It can also be applied to learning with auxiliary information. Our\nhelper framework offers the algorithm designer high flexibility for\nconstructing and analyzing the stochastic Cubic Newton methods, allowing\narbitrary size batches, and the use of noisy and possibly biased estimates of\nthe gradients and Hessians, incorporating both the variance reduction and the\nlazy Hessian updates. We recover the best-known complexities for the stochastic\nand variance-reduced Cubic Newton, under weak assumptions on the noise. A\ndirect consequence of our theory is the new lazy stochastic second-order\nmethod, which significantly improves the arithmetic complexity for large\ndimension problems. We also establish complexity bounds for the classes of\ngradient-dominated objectives, that include convex and strongly convex\nproblems. For Auxiliary Learning, we show that using a helper (auxiliary\nfunction) can outperform training alone if a given similarity measure is small.\n","authors":["El Mahdi Chayti","Nikita Doikov","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2302.11962v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15099v2","updated":"2024-08-29T14:20:44Z","published":"2024-08-27T14:31:54Z","title":"No Regrets: Investigating and Improving Regret Approximations for\n Curriculum Discovery","summary":" What data or environments to use for training to improve downstream\nperformance is a longstanding and very topical question in reinforcement\nlearning. In particular, Unsupervised Environment Design (UED) methods have\ngained recent attention as their adaptive curricula enable agents to be robust\nto in- and out-of-distribution tasks. 
We ask to what extent these methods are\nthemselves robust when applied to a novel setting, closely inspired by a\nreal-world robotics problem. Surprisingly, we find that the state-of-the-art\nUED methods either do not improve upon the na\\\"{i}ve baseline of Domain\nRandomisation (DR), or require substantial hyperparameter tuning to do so. Our\nanalysis shows that this is due to their underlying scoring functions failing\nto predict intuitive measures of ``learnability'', i.e., in finding the\nsettings that the agent sometimes solves, but not always. Based on this, we\ninstead directly train on levels with high learnability and find that this\nsimple and intuitive approach outperforms UED methods and DR in several\nbinary-outcome environments, including on our domain and the standard UED\ndomain of Minigrid. We further introduce a new adversarial evaluation procedure\nfor directly measuring robustness, closely mirroring the conditional value at\nrisk (CVaR). We open-source all our code and present visualisations of final\npolicies here: https://github.com/amacrutherford/sampling-for-learnability.\n","authors":["Alexander Rutherford","Michael Beukman","Timon Willi","Bruno Lacerda","Nick Hawes","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2408.15099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16555v1","updated":"2024-08-29T14:18:54Z","published":"2024-08-29T14:18:54Z","title":"Android Malware Detection Based on RGB Images and Multi-feature Fusion","summary":" With the widespread adoption of smartphones, Android malware has become a\nsignificant challenge in the field of mobile device security. Current Android\nmalware detection methods often rely on feature engineering to construct\ndynamic or static features, which are then used for learning. However, static\nfeature-based methods struggle to counter code obfuscation, packing, and\nsigning techniques, while dynamic feature-based methods involve time-consuming\nfeature extraction. Image-based methods for Android malware detection offer\nbetter resilience against malware variants and polymorphic malware. This paper\nproposes an end-to-end Android malware detection technique based on RGB images\nand multi-feature fusion. The approach involves extracting Dalvik Executable\n(DEX) files, AndroidManifest.xml files, and API calls from APK files,\nconverting them into grayscale images, and enhancing their texture features\nusing Canny edge detection, histogram equalization, and adaptive thresholding\ntechniques. These grayscale images are then combined into an RGB image\ncontaining multi-feature fusion information, which is analyzed using mainstream\nimage classification models for Android malware detection. Extensive\nexperiments demonstrate that the proposed method effectively captures Android\nmalware characteristics, achieving an accuracy of up to 97.25%, outperforming\nexisting detection methods that rely solely on DEX files as classification\nfeatures. Additionally, ablation experiments confirm the effectiveness of using\nthe three key files for feature representation in the proposed approach.\n","authors":["Zhiqiang Wang","Qiulong Yu","Sicheng Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.16555v1.pdf","comment":"9 pages,10 figures"},{"id":"http://arxiv.org/abs/2408.16553v1","updated":"2024-08-29T14:16:13Z","published":"2024-08-29T14:16:13Z","title":"Super-Resolution works for coastal simulations","summary":" Learning fine-scale details of a coastal ocean simulation from a coarse\nrepresentation is a challenging task. 
For real-world applications,\nhigh-resolution simulations are necessary to advance understanding of many\ncoastal processes, specifically, to predict flooding resulting from tsunamis\nand storm surges. We propose a Deep Network for Coastal Super-Resolution\n(DNCSR) for spatiotemporal enhancement to efficiently learn the high-resolution\nnumerical solution. Given images of coastal simulations produced on\nlow-resolution computational meshes using low polynomial order discontinuous\nGalerkin discretizations and a coarse temporal resolution, the proposed DNCSR\nlearns to produce high-resolution free surface elevation and velocity\nvisualizations in both time and space. To efficiently model the dynamic changes\nover time and space, we propose grid-aware spatiotemporal attention to project\nthe temporal features to the spatial domain for non-local feature matching. The\ncoordinate information is also utilized via positional encoding. For the final\nreconstruction, we use the spatiotemporal bilinear operation to interpolate the\nmissing frames and then expand the feature maps to the frequency domain for\nresidual mapping. Besides data-driven losses, the proposed physics-informed\nloss guarantees gradient consistency and momentum changes. Their combination\ncontributes to the overall 24% improvements in RMSE. To train the proposed\nmodel, we propose a large-scale coastal simulation dataset and use it for model\noptimization and evaluation. Our method shows superior super-resolution quality\nand fast computation compared to the state-of-the-art methods.\n","authors":["Zhi-Song Liu","Markus Buttner","Vadym Aizinger","Andreas Rupp"],"pdf_url":"https://arxiv.org/pdf/2408.16553v1.pdf","comment":"13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2407.17844v2","updated":"2024-08-29T14:06:57Z","published":"2024-07-25T07:58:19Z","title":"Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease\n Classification: A Systematic Review","summary":" Parkinson's disease (PD), the second most prevalent neurodegenerative\ndisorder worldwide, frequently presents with early-stage speech impairments.\nRecent advancements in Artificial Intelligence (AI), particularly deep learning\n(DL), have significantly enhanced PD diagnosis through the analysis of speech\ndata. Nevertheless, the progress of research is restricted by the limited\navailability of publicly accessible speech-based PD datasets, primarily due to\nprivacy concerns. The goal of this systematic review is to explore the current\nlandscape of speech-based DL approaches for PD classification, based on 33\nscientific works published between 2020 and March 2024. We discuss their\navailable resources, capabilities, potential limitations, and issues related to\nbias, explainability, and privacy. Furthermore, this review provides an\noverview of publicly accessible speech-based datasets and open-source material\nfor PD. The DL approaches are categorized into end-to-end (E2E) learning,\ntransfer learning (TL) and deep acoustic features extraction (DAFE) approaches.\nAmong E2E approaches, Convolutional Neural Networks (CNNs) are prevalent,\nthough Transformers are increasingly popular. E2E approaches face challenges\nsuch as limited data and computational resources, especially with Transformers.\nTL addresses these issues by providing more robust PD diagnosis and better\ngeneralizability across languages. 
DAFE aims to improve the explainability and\ninterpretability of results by examining the specific effects of deep features\non both other DL approaches and more traditional machine learning (ML) methods.\nHowever, it often underperforms compared to E2E and TL approaches.\n","authors":["Lisanne van Gelderen","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2407.17844v2.pdf","comment":"Submitted in Applied Sciences - peer reviewed Open Access journal.\n This research was funded by the NWO research programme AiNed Fellowship\n Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant\n number NGF.1607.22.013"},{"id":"http://arxiv.org/abs/2408.16543v1","updated":"2024-08-29T14:01:30Z","published":"2024-08-29T14:01:30Z","title":"Statistical and Geometrical properties of regularized Kernel\n Kullback-Leibler divergence","summary":" In this paper, we study the statistical and geometrical properties of the\nKullback-Leibler divergence with kernel covariance operators (KKL) introduced\nby Bach [2022]. Unlike the classical Kullback-Leibler (KL) divergence that\ninvolves density ratios, the KKL compares probability distributions through\ncovariance operators (embeddings) in a reproducible kernel Hilbert space\n(RKHS), and compute the Kullback-Leibler quantum divergence. This novel\ndivergence hence shares parallel but different aspects with both the standard\nKullback-Leibler between probability distributions and kernel embeddings\nmetrics such as the maximum mean discrepancy. A limitation faced with the\noriginal KKL divergence is its inability to be defined for distributions with\ndisjoint supports. To solve this problem, we propose in this paper a\nregularised variant that guarantees that the divergence is well defined for all\ndistributions. We derive bounds that quantify the deviation of the regularised\nKKL to the original one, as well as finite-sample bounds. In addition, we\nprovide a closed-form expression for the regularised KKL, specifically\napplicable when the distributions consist of finite sets of points, which makes\nit implementable. Furthermore, we derive a Wasserstein gradient descent scheme\nof the KKL divergence in the case of discrete distributions, and study\nempirically its properties to transport a set of points to a target\ndistribution.\n","authors":["Clémentine Chazal","Anna Korba","Francis Bach"],"pdf_url":"https://arxiv.org/pdf/2408.16543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16542v1","updated":"2024-08-29T14:00:57Z","published":"2024-08-29T14:00:57Z","title":"SALSA: Speedy ASR-LLM Synchronous Aggregation","summary":" Harnessing pre-trained LLMs to improve ASR systems, particularly for\nlow-resource languages, is now an emerging area of research. Existing methods\nrange from using LLMs for ASR error correction to tightly coupled systems that\nreplace the ASR decoder with the LLM. These approaches either increase decoding\ntime or require expensive training of the cross-attention layers. We propose\nSALSA, which couples the decoder layers of the ASR to the LLM decoder, while\nsynchronously advancing both decoders. Such coupling is performed with a simple\nprojection of the last decoder state, and is thus significantly more training\nefficient than earlier approaches. A challenge of our proposed coupling is\nhandling the mismatch between the tokenizers of the LLM and ASR systems. We\nhandle this mismatch using cascading tokenization with respect to the LLM and\nASR vocabularies. 
We evaluate SALSA on 8 low-resource languages in the FLEURS\nbenchmark, yielding substantial WER reductions of up to 38%.\n","authors":["Ashish Mittal","Darshan Prabhu","Sunita Sarawagi","Preethi Jyothi"],"pdf_url":"https://arxiv.org/pdf/2408.16542v1.pdf","comment":"Accepted to INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2408.16537v1","updated":"2024-08-29T13:52:28Z","published":"2024-08-29T13:52:28Z","title":"SFR-GNN: Simple and Fast Robust GNNs against Structural Attacks","summary":" Graph Neural Networks (GNNs) have demonstrated commendable performance for\ngraph-structured data. Yet, GNNs are often vulnerable to adversarial structural\nattacks as embedding generation relies on graph topology. Existing efforts are\ndedicated to purifying the maliciously modified structure or applying adaptive\naggregation, thereby enhancing the robustness against adversarial structural\nattacks. It is inevitable for a defender to consume heavy computational costs\ndue to lacking prior knowledge about modified structures. To this end, we\npropose an efficient defense method, called Simple and Fast Robust Graph Neural\nNetwork (SFR-GNN), supported by mutual information theory. The SFR-GNN first\npre-trains a GNN model using node attributes and then fine-tunes it over the\nmodified graph in the manner of contrastive learning, which is free of\npurifying modified structures and adaptive aggregation, thus achieving great\nefficiency gains. Consequently, SFR-GNN exhibits a 24%--162% speedup compared\nto advanced robust models, demonstrating superior robustness for node\nclassification tasks.\n","authors":["Xing Ai","Guanyu Zhu","Yulin Zhu","Yu Zheng","Gaolei Li","Jianhua Li","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.16537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16535v1","updated":"2024-08-29T13:50:08Z","published":"2024-08-29T13:50:08Z","title":"TinyTNAS: GPU-Free, Time-Bound, Hardware-Aware Neural Architecture\n Search for TinyML Time Series Classification","summary":" In this work, we present TinyTNAS, a novel hardware-aware multi-objective\nNeural Architecture Search (NAS) tool specifically designed for TinyML time\nseries classification. Unlike traditional NAS methods that rely on GPU\ncapabilities, TinyTNAS operates efficiently on CPUs, making it accessible for a\nbroader range of applications. Users can define constraints on RAM, FLASH, and\nMAC operations to discover optimal neural network architectures within these\nparameters. Additionally, the tool allows for time-bound searches, ensuring the\nbest possible model is found within a user-specified duration. By experimenting\nwith the benchmark datasets UCI HAR, PAMAP2, WISDM, MIT BIH, and PTB Diagnostic ECG\nDatabase, TinyTNAS demonstrates state-of-the-art accuracy with significant\nreductions in RAM, FLASH, MAC usage, and latency. For example, on the UCI HAR\ndataset, TinyTNAS achieves a 12x reduction in RAM usage, a 144x reduction in\nMAC operations, and a 78x reduction in FLASH memory while maintaining superior\naccuracy and reducing latency by 149x. Similarly, on the PAMAP2 and WISDM\ndatasets, it achieves a 6x reduction in RAM usage, a 40x reduction in MAC\noperations, an 83x reduction in FLASH, and a 67x reduction in latency, all\nwhile maintaining superior accuracy. Notably, the search process completes\nwithin 10 minutes in a CPU environment.
These results highlight TinyTNAS's\ncapability to optimize neural network architectures effectively for\nresource-constrained TinyML applications, ensuring both efficiency and high\nperformance. The code for TinyTNAS is available at the GitHub repository and\ncan be accessed at https://github.com/BidyutSaha/TinyTNAS.git.\n","authors":["Bidyut Saha","Riya Samanta","Soumya K. Ghosh","Ram Babu Roy"],"pdf_url":"https://arxiv.org/pdf/2408.16535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15640v2","updated":"2024-08-29T13:47:38Z","published":"2024-08-28T08:52:14Z","title":"GANs Conditioning Methods: A Survey","summary":" In recent years, Generative Adversarial Networks (GANs) have seen significant\nadvancements, leading to their widespread adoption across various fields. The\noriginal GAN architecture enables the generation of images without any specific\ncontrol over the content, making it an unconditional generation process.\nHowever, many practical applications require precise control over the generated\noutput, which has led to the development of conditional GANs (cGANs) that\nincorporate explicit conditioning to guide the generation process. cGANs extend\nthe original framework by incorporating additional information (conditions),\nenabling the generation of samples that adhere to that specific criteria.\nVarious conditioning methods have been proposed, each differing in how they\nintegrate the conditioning information into both the generator and the\ndiscriminator networks. In this work, we review the conditioning methods\nproposed for GANs, exploring the characteristics of each method and\nhighlighting their unique mechanisms and theoretical foundations. Furthermore,\nwe conduct a comparative analysis of these methods, evaluating their\nperformance on various image datasets. Through these analyses, we aim to\nprovide insights into the strengths and limitations of various conditioning\ntechniques, guiding future research and application in generative modeling.\n","authors":["Anis Bourou","Auguste Genovesio","Valérie Mezger"],"pdf_url":"https://arxiv.org/pdf/2408.15640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16532v1","updated":"2024-08-29T13:43:36Z","published":"2024-08-29T13:43:36Z","title":"WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio\n Language Modeling","summary":" Language models have been effectively applied to modeling natural signals,\nsuch as images, video, speech, and audio. A crucial component of these models\nis the codec tokenizer, which compresses high-dimensional natural signals into\nlower-dimensional discrete tokens. In this paper, we introduce WavTokenizer,\nwhich offers several advantages over previous SOTA acoustic codec models in the\naudio domain: 1)extreme compression. By compressing the layers of quantizers\nand the temporal dimension of the discrete codec, one-second audio of 24kHz\nsampling rate requires only a single quantizer with 40 or 75 tokens. 2)improved\nsubjective quality. Despite the reduced number of tokens, WavTokenizer achieves\nstate-of-the-art reconstruction quality with outstanding UTMOS scores and\ninherently contains richer semantic information. Specifically, we achieve these\nresults by designing a broader VQ space, extended contextual windows, and\nimproved attention networks, as well as introducing a powerful multi-scale\ndiscriminator and an inverse Fourier transform structure. We conducted\nextensive reconstruction experiments in the domains of speech, audio, and\nmusic. 
WavTokenizer exhibited strong performance across various objective and\nsubjective metrics compared to state-of-the-art models. We also tested semantic\ninformation, VQ utilization, and adaptability to generative models.\nComprehensive ablation studies confirm the necessity of each module in\nWavTokenizer. The related code, demos, and pre-trained models are available at\nhttps://github.com/jishengpeng/WavTokenizer.\n","authors":["Shengpeng Ji","Ziyue Jiang","Xize Cheng","Yifu Chen","Minghui Fang","Jialong Zuo","Qian Yang","Ruiqi Li","Ziang Zhang","Xiaoda Yang","Rongjie Huang","Yidi Jiang","Qian Chen","Siqi Zheng","Wen Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.16532v1.pdf","comment":"Working in progress. arXiv admin note: text overlap with\n arXiv:2402.12208"},{"id":"http://arxiv.org/abs/2408.16527v1","updated":"2024-08-29T13:39:01Z","published":"2024-08-29T13:39:01Z","title":"Multitask learning for improved scour detection: A dynamic wave tank\n study","summary":" Population-based structural health monitoring (PBSHM), aims to share\ninformation between members of a population. An offshore wind (OW) farm could\nbe considered as a population of nominally-identical wind-turbine structures.\nHowever, benign variations exist among members, such as geometry, sea-bed\nconditions and temperature differences. These factors could influence\nstructural properties and therefore the dynamic response, making it more\ndifficult to detect structural problems via traditional SHM techniques.\n This paper explores the use of a Bayesian hierarchical model as a means of\nmultitask learning, to infer foundation stiffness distribution parameters at\nboth population and local levels. To do this, observations of natural frequency\nfrom populations of structures were first generated from both numerical and\nexperimental models. These observations were then used in a partially-pooled\nBayesian hierarchical model in tandem with surrogate FE models of the\nstructures to infer foundation stiffness parameters. Finally, it is\ndemonstrated how the learned parameters may be used as a basis to perform more\nrobust anomaly detection (as compared to a no-pooling approach) e.g. as a\nresult of scour.\n","authors":["Simon M. Brealy","Aidan J. Hughes","Tina A. Dardeno","Lawrence A. Bull","Robin S. Mills","Nikolaos Dervilis","Keith Worden"],"pdf_url":"https://arxiv.org/pdf/2408.16527v1.pdf","comment":"25 pages, 12 figures, early work features in ISWHM 2023 conference\n proceedings and available here: arXiv:2402.19295. Submitted to the Renewable\n Energy journal"},{"id":"http://arxiv.org/abs/2408.16517v1","updated":"2024-08-29T13:28:11Z","published":"2024-08-29T13:28:11Z","title":"Adaptive Variational Continual Learning via Task-Heuristic Modelling","summary":" Variational continual learning (VCL) is a turn-key learning algorithm that\nhas state-of-the-art performance among the best continual learning models. In\nour work, we explore an extension of the generalized variational continual\nlearning (GVCL) model, named AutoVCL, which combines task heuristics for\ninformed learning and model optimization. 
We demonstrate that our model\noutperforms the standard GVCL with fixed hyperparameters, benefiting from the\nautomatic adjustment of the hyperparameter based on the difficulty and\nsimilarity of the incoming task compared to the previous tasks.\n","authors":["Fan Yang"],"pdf_url":"https://arxiv.org/pdf/2408.16517v1.pdf","comment":"4 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.15126v2","updated":"2024-08-29T13:21:26Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) simulations are irreplaceable and ubiquitous in\nfields of materials science, chemistry, pharmacology just to name a few.\nConventional MD simulations are plagued by numerical stability as well as long\nequilibration time issues, which limits broader applications of MD simulations.\nRecently, a surge of deep learning approaches have been devised for\ntime-coarsened dynamics, which learns the state transition mechanism over much\nlarger time scales to overcome these limitations. However, only a few methods\ntarget the underlying Boltzmann distribution by resampling techniques, where\nproposals are rarely accepted as new states with low efficiency. In this work,\nwe propose a force-guided bridge matching model, FBM, a novel framework that\nfirst incorporates physical priors into bridge matching for full-atom\ntime-coarsened dynamics. With the guidance of our well-designed intermediate\nforce field, FBM is feasible to target the Boltzmann-like distribution by\ndirect inference without extra steps. Experiments on small peptides verify our\nsuperiority in terms of comprehensive metrics and demonstrate transferability\nto unseen peptide systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16495v1","updated":"2024-08-29T12:49:22Z","published":"2024-08-29T12:49:22Z","title":"On-device AI: Quantization-aware Training of Transformers in Time-Series","summary":" Artificial Intelligence (AI) models for time-series in pervasive computing\nkeep getting larger and more complicated. The Transformer model is by far the\nmost compelling of these AI models. However, it is difficult to obtain the\ndesired performance when deploying such a massive model on a sensor device with\nlimited resources. My research focuses on optimizing the Transformer model for\ntime-series forecasting tasks. The optimized model will be deployed as hardware\naccelerators on embedded Field Programmable Gate Arrays (FPGAs). I will\ninvestigate the impact of applying Quantization-aware Training to the\nTransformer model to reduce its size and runtime memory footprint while\nmaximizing the advantages of FPGAs.\n","authors":["Tianheng Ling","Gregor Schiele"],"pdf_url":"https://arxiv.org/pdf/2408.16495v1.pdf","comment":"This paper is accepted by 2023 IEEE International Conference on\n Pervasive Computing and Communications(PhD Forum)"},{"id":"http://arxiv.org/abs/2408.16463v1","updated":"2024-08-29T11:51:41Z","published":"2024-08-29T11:51:41Z","title":"An Exploratory Deep Learning Approach for Predicting Subsequent Suicidal\n Acts in Chinese Psychological Support Hotlines","summary":" Psychological support hotlines are an effective suicide prevention measure\nthat typically relies on professionals using suicide risk assessment scales to\npredict individual risk scores. 
However, the accuracy of scale-based predictive\nmethods for suicide risk assessment can vary widely depending on the expertise\nof the operator. This limitation underscores the need for more reliable\nmethods, prompting this research's innovative exploration of the use of\nartificial intelligence to improve the accuracy and efficiency of suicide risk\nprediction within the context of psychological support hotlines. The study\nincluded data from 1,549 subjects from 2015-2017 in China who contacted a\npsychological support hotline. Each participant was followed for 12 months to\nidentify instances of suicidal behavior. We proposed a novel multi-task\nlearning method that uses the large-scale pre-trained model Whisper for feature\nextraction and fits psychological scales while predicting the risk of suicide.\nThe proposed method yields a 2.4 percentage-point improvement in F1-score compared to\nthe traditional manual approach based on the psychological scales. Our model\ndemonstrated superior performance compared to the other eight popular models.\nTo our knowledge, this study is the first to apply deep learning to long-term\nspeech data to predict suicide risk in China, indicating great potential for\nclinical applications. The source code is publicly available at:\n\\url{https://github.com/songchangwei/Suicide-Risk-Prediction}.\n","authors":["Changwei Song","Qing Zhao","Jianqiang Li","Yining Chen","Yongsheng Tong","Guanghui Fu"],"pdf_url":"https://arxiv.org/pdf/2408.16463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16457v1","updated":"2024-08-29T11:45:01Z","published":"2024-08-29T11:45:01Z","title":"HYGENE: A Diffusion-based Hypergraph Generation Method","summary":" Hypergraphs are powerful mathematical structures that can model complex,\nhigh-order relationships in various domains, including social networks,\nbioinformatics, and recommender systems. However, generating realistic and\ndiverse hypergraphs remains challenging due to their inherent complexity and\nlack of effective generative models. In this paper, we introduce a\ndiffusion-based Hypergraph Generation (HYGENE) method that addresses these\nchallenges through a progressive local expansion approach. HYGENE works on the\nbipartite representation of hypergraphs, starting with a single pair of\nconnected nodes and iteratively expanding it to form the target hypergraph. At\neach step, nodes and hyperedges are added in a localized manner using a\ndenoising diffusion process, which allows for the construction of the global\nstructure before refining local details. Our experiments demonstrated the\neffectiveness of HYGENE, proving its ability to closely mimic a variety of\nproperties in hypergraphs. To the best of our knowledge, this is the first\nattempt to employ deep learning models for hypergraph generation, and our work\naims to lay the groundwork for future research in this area.\n","authors":["Dorian Gailhard","Enzo Tartaglione","Lirida Naviner De Barros","Jhony H. Giraldo"],"pdf_url":"https://arxiv.org/pdf/2408.16457v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.11529 by other authors"},{"id":"http://arxiv.org/abs/2403.04447v2","updated":"2024-08-29T11:28:10Z","published":"2024-03-07T12:34:03Z","title":"FRRI: a novel algorithm for fuzzy-rough rule induction","summary":" Interpretability is the next frontier in machine learning research.
In the\nsearch for white box models - as opposed to black box models, like random\nforests or neural networks - rule induction algorithms are a logical and\npromising option, since the rules can easily be understood by humans. Fuzzy and\nrough set theory have been successfully applied to this archetype, almost\nalways separately. As both approaches to rule induction involve granular\ncomputing based on the concept of equivalence classes, it is natural to combine\nthem. The QuickRules\\cite{JensenCornelis2009} algorithm was a first attempt at\nusing fuzzy rough set theory for rule induction. It is based on QuickReduct, a\ngreedy algorithm for building decision reducts. QuickRules already showed an\nimprovement over other rule induction methods. However, to evaluate the full\npotential of a fuzzy rough rule induction algorithm, one needs to start from\nthe foundations. In this paper, we introduce a novel rule induction algorithm\ncalled Fuzzy Rough Rule Induction (FRRI). We provide background and explain the\nworkings of our algorithm. Furthermore, we perform a computational experiment\nto evaluate the performance of our algorithm and compare it to other\nstate-of-the-art rule induction approaches. We find that our algorithm is more\naccurate while creating small rulesets consisting of relatively short rules. We\nend the paper by outlining some directions for future work.\n","authors":["Henri Bollaert","Marko Palangetić","Chris Cornelis","Salvatore Greco","Roman Słowiński"],"pdf_url":"https://arxiv.org/pdf/2403.04447v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12862v2","updated":"2024-08-29T11:18:16Z","published":"2024-04-19T13:01:59Z","title":"A Guide to Feature Importance Methods for Scientific Inference","summary":" While machine learning (ML) models are increasingly used due to their high\npredictive power, their use in understanding the data-generating process (DGP)\nis limited. Understanding the DGP requires insights into feature-target\nassociations, which many ML models cannot directly provide due to their opaque\ninternal mechanisms. Feature importance (FI) methods provide useful insights\ninto the DGP under certain conditions. Since the results of different FI\nmethods have different interpretations, selecting the correct FI method for a\nconcrete use case is crucial and still requires expert knowledge. This paper\nserves as a comprehensive guide to help understand the different\ninterpretations of global FI methods. Through an extensive review of FI methods\nand providing new proofs regarding their interpretation, we facilitate a\nthorough understanding of these methods and formulate concrete recommendations\nfor scientific inference. We conclude by discussing options for FI uncertainty\nestimation and point to directions for future research aiming at full\nstatistical inference from black-box ML models.\n","authors":["Fiona Katharina Ewald","Ludwig Bothmann","Marvin N. Wright","Bernd Bischl","Giuseppe Casalicchio","Gunnar König"],"pdf_url":"https://arxiv.org/pdf/2404.12862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15381v3","updated":"2024-08-29T11:03:57Z","published":"2024-04-23T09:44:58Z","title":"Advances and Open Challenges in Federated Foundation Models","summary":" The integration of Foundation Models (FMs) with Federated Learning (FL)\npresents a transformative paradigm in Artificial Intelligence (AI). This\nintegration offers enhanced capabilities while addressing concerns of privacy,\ndata decentralization, and computational efficiency. 
This paper provides a\ncomprehensive survey of the emerging field of Federated Foundation Models\n(FedFM), elucidating their synergistic relationship and exploring novel\nmethodologies, challenges, and future directions that the FL research field\nneeds to focus on in order to thrive in the age of FMs. A systematic\nmulti-tiered taxonomy is proposed, categorizing existing FedFM approaches for\nmodel training, aggregation, trustworthiness, and incentivization. Key\nchallenges, including how to enable FL to deal with high complexity of\ncomputational demands, privacy considerations, contribution evaluation, and\ncommunication efficiency, are thoroughly discussed. Moreover, the paper\nexplores the intricate challenges of communication, scalability, and security\ninherent in training/fine-tuning FMs via FL. It highlights the potential of\nquantum computing to revolutionize the processes of training, inference,\noptimization, and data encryption. This survey also introduces the\nimplementation requirement of FedFM and some practical FedFM applications.\nThen, this survey provides the lessons with a clear understanding of our\nfindings for FedFM. Finally, this survey not only provides insights into the\ncurrent state and challenges of FedFM but also paves the way for future\nresearch directions, emphasizing the need for developing trustworthy solutions.\nIt serves as a foundational guide for researchers and practitioners interested\nin contributing to this interdisciplinary and rapidly advancing field.\n","authors":["Chao Ren","Han Yu","Hongyi Peng","Xiaoli Tang","Bo Zhao","Liping Yi","Alysa Ziying Tan","Yulan Gao","Anran Li","Xiaoxiao Li","Zengxiang Li","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2404.15381v3.pdf","comment":"Survey of Federated Foundation Models (FedFM)"},{"id":"http://arxiv.org/abs/2408.16430v1","updated":"2024-08-29T10:44:59Z","published":"2024-08-29T10:44:59Z","title":"Do Recommender Systems Promote Local Music? A Reproducibility Study\n Using Music Streaming Data","summary":" This paper examines the influence of recommender systems on local music\nrepresentation, discussing prior findings from an empirical study on the LFM-2b\npublic dataset. This prior study argued that different recommender systems\nexhibit algorithmic biases shifting music consumption either towards or against\nlocal content. However, LFM-2b users do not reflect the diverse audience of\nmusic streaming services. To assess the robustness of this study's conclusions,\nwe conduct a comparative analysis using proprietary listening data from a\nglobal music streaming service, which we publicly release alongside this paper.\nWe observe significant differences in local music consumption patterns between\nour dataset and LFM-2b, suggesting that caution should be exercised when\ndrawing conclusions on local music based solely on LFM-2b. Moreover, we show\nthat the algorithmic biases exhibited in the original work vary in our dataset,\nand that several unexplored model parameters can significantly influence these\nbiases and affect the study's conclusion on both datasets. Finally, we discuss\nthe complexity of accurately labeling local music, emphasizing the risk of\nmisleading conclusions due to unreliable, biased, or incomplete labels. 
To\nencourage further research and ensure reproducibility, we have publicly shared\nour dataset and code.\n","authors":["Kristina Matrosova","Lilian Marey","Guillaume Salha-Galvan","Thomas Louail","Olivier Bodini","Manuel Moussallam"],"pdf_url":"https://arxiv.org/pdf/2408.16430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16429v1","updated":"2024-08-29T10:43:55Z","published":"2024-08-29T10:43:55Z","title":"Gradient-free variational learning with conditional mixture networks","summary":" Balancing computational efficiency with robust predictive performance is\ncrucial in supervised learning, especially for critical applications. Standard\ndeep learning models, while accurate and scalable, often lack probabilistic\nfeatures like calibrated predictions and uncertainty quantification. Bayesian\nmethods address these issues but can be computationally expensive as model and\ndata complexity increase. Previous work shows that fast variational methods can\nreduce the compute requirements of Bayesian methods by eliminating the need for\ngradient computation or sampling, but are often limited to simple models. We\ndemonstrate that conditional mixture networks (CMNs), a probabilistic variant\nof the mixture-of-experts (MoE) model, are suitable for fast, gradient-free\ninference and can solve complex classification tasks. CMNs employ linear\nexperts and a softmax gating network. By exploiting conditional conjugacy and\nP\\'olya-Gamma augmentation, we furnish Gaussian likelihoods for the weights of\nboth the linear experts and the gating network. This enables efficient\nvariational updates using coordinate ascent variational inference (CAVI),\navoiding traditional gradient-based optimization. We validate this approach by\ntraining two-layer CMNs on standard benchmarks from the UCI repository. Our\nmethod, CAVI-CMN, achieves competitive and often superior predictive accuracy\ncompared to maximum likelihood estimation (MLE) with backpropagation, while\nmaintaining competitive runtime and full posterior distributions over all model\nparameters. Moreover, as input size or the number of experts increases,\ncomputation time scales competitively with MLE and other gradient-based\nsolutions like black-box variational inference (BBVI), making CAVI-CMN a\npromising tool for deep, fast, and gradient-free Bayesian networks.\n","authors":["Conor Heins","Hao Wu","Dimitrije Markovic","Alexander Tschantz","Jeff Beck","Christopher Buckley"],"pdf_url":"https://arxiv.org/pdf/2408.16429v1.pdf","comment":"16 pages main text (3 figures), including references. 9 pages\n supplementary material (5 figures)"},{"id":"http://arxiv.org/abs/2408.16425v1","updated":"2024-08-29T10:35:07Z","published":"2024-08-29T10:35:07Z","title":"A Comparative Study of Hyperparameter Tuning Methods","summary":" The study emphasizes the challenge of finding the optimal trade-off between\nbias and variance, especially as hyperparameter optimization increases in\ncomplexity. Through empirical analysis, three hyperparameter tuning algorithms\nTree-structured Parzen Estimator (TPE), Genetic Search, and Random Search are\nevaluated across regression and classification tasks. The results show that\nnonlinear models, with properly tuned hyperparameters, significantly outperform\nlinear models. Interestingly, Random Search excelled in regression tasks, while\nTPE was more effective for classification tasks. This suggests that there is no\none-size-fits-all solution, as different algorithms perform better depending on\nthe task and model type. 
The findings underscore the importance of selecting\nthe appropriate tuning method and highlight the computational challenges\ninvolved in optimizing machine learning models, particularly as search spaces\nexpand.\n","authors":["Subhasis Dasgupta","Jaydip Sen"],"pdf_url":"https://arxiv.org/pdf/2408.16425v1.pdf","comment":"This chapter has been accepted in the edited volume titles \"Data\n Science in Theory and Practice\", editor J Sen & S Roy Choudhury. The volume\n is expected to be published in October 2024 by Cambridge Scholars Publishing,\n New Castle upon Tyne, UK. This chapter is 34 pages long and it contains 11\n tables and 8 images"},{"id":"http://arxiv.org/abs/2406.15852v2","updated":"2024-08-29T10:28:42Z","published":"2024-06-22T13:57:09Z","title":"Next Level Message-Passing with Hierarchical Support Graphs","summary":" Message-Passing Neural Networks (MPNNs) are extensively employed in graph\nlearning tasks but suffer from limitations such as the restricted scope of\ninformation exchange, by being confined to neighboring nodes during each round\nof message passing. Various strategies have been proposed to address these\nlimitations, including incorporating virtual nodes to facilitate global\ninformation exchange. In this study, we introduce the Hierarchical Support\nGraph (HSG), an extension of the virtual node concept created through recursive\ncoarsening of the original graph. This approach provides a flexible framework\nfor enhancing information flow in graphs, independent of the specific MPNN\nlayers utilized. We present a theoretical analysis of HSGs, investigate their\nempirical performance, and demonstrate that HSGs can surpass other methods\naugmented with virtual nodes, achieving state-of-the-art results across\nmultiple datasets.\n","authors":["Carlos Vonessen","Florian Grötschla","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2406.15852v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16414v1","updated":"2024-08-29T10:21:00Z","published":"2024-08-29T10:21:00Z","title":"Fourier Spectral Physics Informed Neural Network: An Efficient and\n Low-Memory PINN","summary":" With growing investigations into solving partial differential equations by\nphysics-informed neural networks (PINNs), more accurate and efficient PINNs are\nrequired to meet the practical demands of scientific computing. One bottleneck\nof current PINNs is computing the high-order derivatives via automatic\ndifferentiation which often necessitates substantial computing resources. In\nthis paper, we focus on removing the automatic differentiation of the spatial\nderivatives and propose a spectral-based neural network that substitutes the\ndifferential operator with a multiplication. Compared to the PINNs, our\napproach requires lower memory and shorter training time. Thanks to the\nexponential convergence of the spectral basis, our approach is more accurate.\nMoreover, to handle the different situations between physics domain and\nspectral domain, we provide two strategies to train networks by their spectral\ninformation. 
Through a series of comprehensive experiments, we validate the\naforementioned merits of our proposed network.\n","authors":["Tianchi Yu","Yiming Qi","Ivan Oseledets","Shiyi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.16414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03973v2","updated":"2024-08-29T10:09:58Z","published":"2024-02-06T13:06:14Z","title":"A comparison between humans and AI at recognizing objects in unusual\n poses","summary":" Deep learning is closing the gap with human vision on several object\nrecognition benchmarks. Here we investigate this gap for challenging images\nwhere objects are seen in unusual poses. We find that humans excel at\nrecognizing objects in such poses. In contrast, state-of-the-art deep networks\nfor vision (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) and state-of-the-art\nlarge vision-language models (Claude 3.5, Gemini 1.5, GPT-4) are systematically\nbrittle on unusual poses, with the exception of Gemini showing excellent\nrobustness in that condition. As we limit image exposure time, human\nperformance degrades to the level of deep networks, suggesting that additional\nmental processes (requiring additional time) are necessary to identify objects\nin unusual poses. An analysis of error patterns of humans vs. networks reveals\nthat even time-limited humans are dissimilar to feed-forward deep networks. In\nconclusion, our comparison reveals that humans and deep networks rely on\ndifferent mechanisms for recognizing objects in unusual poses. Understanding\nthe nature of the mental processes taking place during extra viewing time may\nbe key to reproduce the robustness of human vision in silico.\n","authors":["Netta Ollikka","Amro Abbas","Andrea Perin","Markku Kilpeläinen","Stéphane Deny"],"pdf_url":"https://arxiv.org/pdf/2402.03973v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16403v1","updated":"2024-08-29T10:02:29Z","published":"2024-08-29T10:02:29Z","title":"DeepSPoC: A Deep Learning-Based PDE Solver Governed by Sequential\n Propagation of Chaos","summary":" Sequential propagation of chaos (SPoC) is a recently developed tool to solve\nmean-field stochastic differential equations and their related nonlinear\nFokker-Planck equations. Based on the theory of SPoC, we present a new method\n(deepSPoC) that combines the interacting particle system of SPoC and deep\nlearning. Under the framework of deepSPoC, two classes of frequently used deep\nmodels, namely fully connected neural networks and normalizing flows, are\nconsidered. For high-dimensional problems, spatially adaptive methods are designed\nto further improve the accuracy and efficiency of deepSPoC. We analyze the\nconvergence of the framework of deepSPoC under some simplified conditions and\nalso provide a posterior error estimation for the algorithm. Finally, we test\nour methods on a wide range of different types of mean-field equations.\n","authors":["Kai Du","Yongle Xie","Tao Zhou","Yuancheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.16403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12660v2","updated":"2024-08-29T09:58:47Z","published":"2023-10-19T11:33:33Z","title":"Gradient Descent Fails to Learn High-frequency Functions and Modular\n Arithmetic","summary":" Classes of target functions containing a large number of approximately\northogonal elements are known to be hard to learn by the Statistical Query\nalgorithms. Recently this classical fact re-emerged in a theory of\ngradient-based optimization of neural networks.
In the novel framework, the\nhardness of a class is usually quantified by the variance of the gradient with\nrespect to a random choice of a target function.\n A set of functions of the form $x\\to ax \\bmod p$, where $a$ is taken from\n${\\mathbb Z}_p$, has attracted some attention from deep learning theorists and\ncryptographers recently. This class can be understood as a subset of\n$p$-periodic functions on ${\\mathbb Z}$ and is tightly connected with a class\nof high-frequency periodic functions on the real line.\n We present a mathematical analysis of limitations and challenges associated\nwith using gradient-based learning techniques to train a high-frequency\nperiodic function or modular multiplication from examples. We highlight that\nthe variance of the gradient is negligibly small in both cases when either a\nfrequency or the prime base $p$ is large. This in turn prevents such a learning\nalgorithm from being successful.\n","authors":["Rustem Takhanov","Maxat Tezekbayev","Artur Pak","Arman Bolatov","Zhenisbek Assylbekov"],"pdf_url":"https://arxiv.org/pdf/2310.12660v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16393v1","updated":"2024-08-29T09:55:55Z","published":"2024-08-29T09:55:55Z","title":"Illuminating the Diversity-Fitness Trade-Off in Black-Box Optimization","summary":" In real-world applications, users often favor structurally diverse design\nchoices over one high-quality solution. It is hence important to consider more\nsolutions that decision-makers can compare and further explore based on\nadditional criteria. Alongside the existing approaches of evolutionary\ndiversity optimization, quality diversity, and multimodal optimization, this\npaper presents a fresh perspective on this challenge by considering the problem\nof identifying a fixed number of solutions with a pairwise distance above a\nspecified threshold while maximizing their average quality.\n We obtain first insight into these objectives by performing a subset\nselection on the search trajectories of different well-established search\nheuristics, whether specifically designed with diversity in mind or not. We\nemphasize that the main goal of our work is not to present a new algorithm but\nto look at the problem in a more fundamental and theoretically tractable way by\nasking the question: What trade-off exists between the minimum distance within\nbatches of solutions and the average quality of their fitness? These insights\nalso provide us with a way of making general claims concerning the properties\nof optimization problems that shall be useful in turn for benchmarking\nalgorithms of the approaches enumerated above.\n A possibly surprising outcome of our empirical study is the observation that\nnaive uniform random sampling establishes a very strong baseline for our\nproblem, hardly ever outperformed by the search trajectories of the considered\nheuristics. 
We interpret these results as a motivation to develop algorithms\ntailored to produce diverse solutions of high average quality.\n","authors":["Maria Laura Santoni","Elena Raponi","Aneta Neumann","Frank Neumann","Mike Preuss","Carola Doerr"],"pdf_url":"https://arxiv.org/pdf/2408.16393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16391v1","updated":"2024-08-29T09:54:46Z","published":"2024-08-29T09:54:46Z","title":"TempoKGAT: A Novel Graph Attention Network Approach for Temporal Graph\n Analysis","summary":" Graph neural networks (GNN) have shown significant capabilities in handling\nstructured data, yet their application to dynamic, temporal data remains\nlimited. This paper presents a new type of graph attention network, called\nTempoKGAT, which combines time-decaying weight and a selective neighbor\naggregation mechanism on the spatial domain, which helps uncover latent\npatterns in the graph data. In this approach, a top-k neighbor selection based\non the edge weights is introduced to represent the evolving features of the\ngraph data. We evaluated the performance of our TempoKGAT on multiple datasets\nfrom the traffic, energy, and health sectors involving spatio-temporal data. We\ncompared the performance of our approach to several state-of-the-art methods\nfound in the literature on several open-source datasets. Our method shows\nsuperior accuracy on all datasets. These results indicate that TempoKGAT builds\non existing methodologies to optimize prediction accuracy and provide new\ninsights into model interpretation in temporal contexts.\n","authors":["Lena Sasal","Daniel Busby","Abdenour Hadid"],"pdf_url":"https://arxiv.org/pdf/2408.16391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09441v2","updated":"2024-08-29T09:52:58Z","published":"2024-07-12T17:27:43Z","title":"The $μ\\mathcal{G}$ Language for Programming Graph Neural Networks","summary":" Graph neural networks form a class of deep learning architectures\nspecifically designed to work with graph-structured data. As such, they share\nthe inherent limitations and problems of deep learning, especially regarding\nthe issues of explainability and trustworthiness. We propose $\\mu\\mathcal{G}$,\nan original domain-specific language for the specification of graph neural\nnetworks that aims to overcome these issues. The language's syntax is\nintroduced, and its meaning is rigorously defined by a denotational semantics.\nAn equivalent characterization in the form of an operational semantics is also\nprovided and, together with a type system, is used to prove the type soundness\nof $\\mu\\mathcal{G}$. We show how $\\mu\\mathcal{G}$ programs can be represented\nin a more user-friendly graphical visualization, and provide examples of its\ngenerality by showing how it can be used to define some of the most popular\ngraph neural network models, or to develop any custom graph processing\napplication.\n","authors":["Matteo Belenchia","Flavio Corradini","Michela Quadrini","Michele Loreti"],"pdf_url":"https://arxiv.org/pdf/2407.09441v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16389v1","updated":"2024-08-29T09:50:31Z","published":"2024-08-29T09:50:31Z","title":"Addressing Common Misinterpretations of KART and UAT in Neural Network\n Literature","summary":" This note addresses the Kolmogorov-Arnold Representation Theorem (KART) and\nthe Universal Approximation Theorem (UAT), focusing on their common\nmisinterpretations in some papers related to neural network approximation. 
Our\nremarks aim to support a more accurate understanding of KART and UAT among\nneural network specialists.\n","authors":["Vugar Ismailov"],"pdf_url":"https://arxiv.org/pdf/2408.16389v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2408.15294v2","updated":"2024-08-29T09:43:04Z","published":"2024-08-27T09:48:25Z","title":"Evaluating the Predictive Features of Person-Centric Knowledge Graph\n Embeddings: Unfolding Ablation Studies","summary":" Developing novel predictive models with complex biomedical information is\nchallenging due to various idiosyncrasies related to heterogeneity,\nstandardization or sparseness of the data. We previously introduced a\nperson-centric ontology to organize information about individual patients, and\na representation learning framework to extract person-centric knowledge graphs\n(PKGs) and to train Graph Neural Networks (GNNs). In this paper, we propose a\nsystematic approach to examine the results of GNN models trained with both\nstructured and unstructured information from the MIMIC-III dataset. Through\nablation studies on different clinical, demographic, and social data, we show\nthe robustness of this approach in identifying predictive features in PKGs for\nthe task of readmission prediction.\n","authors":["Christos Theodoropoulos","Natasha Mulligan","Joao Bettencourt-Silva"],"pdf_url":"https://arxiv.org/pdf/2408.15294v2.pdf","comment":"Published in the 34th Medical Informatics Europe Conference"},{"id":"http://arxiv.org/abs/2408.16379v1","updated":"2024-08-29T09:41:17Z","published":"2024-08-29T09:41:17Z","title":"TG-PhyNN: An Enhanced Physically-Aware Graph Neural Network framework\n for forecasting Spatio-Temporal Data","summary":" Accurately forecasting dynamic processes on graphs, such as traffic flow or\ndisease spread, remains a challenge. While Graph Neural Networks (GNNs) excel\nat modeling and forecasting spatio-temporal data, they often lack the ability\nto directly incorporate underlying physical laws. This work presents TG-PhyNN,\na novel Temporal Graph Physics-Informed Neural Network framework. TG-PhyNN\nleverages the power of GNNs for graph-based modeling while simultaneously\nincorporating physical constraints as a guiding principle during training. This\nis achieved through a two-step prediction strategy that enables the calculation\nof physical equation derivatives within the GNN architecture. Our findings\ndemonstrate that TG-PhyNN significantly outperforms traditional forecasting\nmodels (e.g., GRU, LSTM, GAT) on real-world spatio-temporal datasets like\nPedalMe (traffic flow), COVID-19 spread, and Chickenpox outbreaks. These\ndatasets are all governed by well-defined physical principles, which TG-PhyNN\neffectively exploits to offer more reliable and accurate forecasts in various\ndomains where physical processes govern the dynamics of data. 
This paves the\nway for improved forecasting in areas like traffic flow prediction, disease\noutbreak prediction, and potentially other fields where physics plays a crucial\nrole.\n","authors":["Zakaria Elabid","Lena Sasal","Daniel Busby","Abdenour Hadid"],"pdf_url":"https://arxiv.org/pdf/2408.16379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08800v2","updated":"2024-08-29T09:27:45Z","published":"2024-06-13T04:33:05Z","title":"Can Synthetic Audio From Generative Foundation Models Assist Audio\n Recognition and Speech Modeling?","summary":" Recent advances in foundation models have enabled audio-generative models\nthat produce high-fidelity sounds associated with music, events, and human\nactions. Despite the success achieved in modern audio-generative models, the\nconventional approach to assessing the quality of the audio generation relies\nheavily on distance metrics like Frechet Audio Distance. In contrast, we aim to\nevaluate the quality of audio generation by examining the effectiveness of\nusing them as training data. Specifically, we conduct studies to explore the\nuse of synthetic audio for audio recognition. Moreover, we investigate whether\nsynthetic audio can serve as a resource for data augmentation in speech-related\nmodeling. Our comprehensive experiments demonstrate the potential of using\nsynthetic audio for audio recognition and speech-related modeling. Our code is\navailable at https://github.com/usc-sail/SynthAudio.\n","authors":["Tiantian Feng","Dimitrios Dimitriadis","Shrikanth Narayanan"],"pdf_url":"https://arxiv.org/pdf/2406.08800v2.pdf","comment":"Accepted to 2024 INTERSPEECH; corrections to ActivityNet labels"},{"id":"http://arxiv.org/abs/2406.05482v4","updated":"2024-08-29T08:46:46Z","published":"2024-06-08T14:14:19Z","title":"Efficient Topology-aware Data Augmentation for High-Degree Graph Neural\n Networks","summary":" In recent years, graph neural networks (GNNs) have emerged as a potent tool\nfor learning on graph-structured data and won fruitful successes in varied\nfields. The majority of GNNs follow the message-passing paradigm, where\nrepresentations of each node are learned by recursively aggregating features of\nits neighbors. However, this mechanism brings severe over-smoothing and\nefficiency issues over high-degree graphs (HDGs), wherein most nodes have\ndozens (or even hundreds) of neighbors, such as social networks, transaction\ngraphs, power grids, etc. Additionally, such graphs usually encompass rich and\ncomplex structure semantics, which are hard to capture merely by feature\naggregations in GNNs. Motivated by the above limitations, we propose TADA, an\nefficient and effective front-mounted data augmentation framework for GNNs on\nHDGs. Under the hood, TADA includes two key modules: (i) feature expansion with\nstructure embeddings, and (ii) topology- and attribute-aware graph\nsparsification. The former obtains augmented node features and enhanced model\ncapacity by encoding the graph structure into high-quality structure embeddings\nwith our highly-efficient sketching method. Further, by exploiting\ntask-relevant features extracted from graph structures and attributes, the\nsecond module enables the accurate identification and reduction of numerous\nredundant/noisy edges from the input graph, thereby alleviating over-smoothing\nand facilitating faster feature aggregations over HDGs. 
Empirically, TADA\nconsiderably improves the predictive performance of mainstream GNN models on 8\nreal homophilic/heterophilic HDGs in terms of node classification, while\nachieving efficient training and inference processes.\n","authors":["Yurui Lai","Xiaoyang Lin","Renchi Yang","Hongtao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.05482v4.pdf","comment":"This is the technical report for the paper accepted to KDD 2024. 16\n pages"},{"id":"http://arxiv.org/abs/2408.16349v1","updated":"2024-08-29T08:36:22Z","published":"2024-08-29T08:36:22Z","title":"Machine learning models for daily rainfall forecasting in Northern\n Tropical Africa using tropical wave predictors","summary":" Numerical weather prediction (NWP) models often underperform compared to\nsimpler climatology-based precipitation forecasts in northern tropical Africa,\neven after statistical postprocessing. AI-based forecasting models show promise\nbut have avoided precipitation due to its complexity. Synoptic-scale forcings\nlike African easterly waves and other tropical waves (TWs) are important for\npredictability in tropical Africa, yet their value for predicting daily\nrainfall remains unexplored. This study uses two machine-learning models--gamma\nregression and a convolutional neural network (CNN)--trained on TW predictors\nfrom satellite-based GPM IMERG data to predict daily rainfall during the\nJuly-September monsoon season. Predictor variables are derived from the local\namplitude and phase information of seven TW from the target and\nup-and-downstream neighboring grids at 1-degree spatial resolution. The ML\nmodels are combined with Easy Uncertainty Quantification (EasyUQ) to generate\ncalibrated probabilistic forecasts and are compared with three benchmarks:\nExtended Probabilistic Climatology (EPC15), ECMWF operational ensemble forecast\n(ENS), and a probabilistic forecast from the ENS control member using EasyUQ\n(CTRL EasyUQ). The study finds that downstream predictor variables offer the\nhighest predictability, with downstream tropical depression (TD)-type\nwave-based predictors being most important. Other waves like mixed-Rossby\ngravity (MRG), Kelvin, and inertio-gravity waves also contribute significantly\nbut show regional preferences. ENS forecasts exhibit poor skill due to\nmiscalibration. CTRL EasyUQ shows improvement over ENS and marginal enhancement\nover EPC15. Both gamma regression and CNN forecasts significantly outperform\nbenchmarks in tropical Africa. This study highlights the potential of ML models\ntrained on TW-based predictors to improve daily precipitation forecasts in\ntropical Africa.\n","authors":["Athul Rasheeda Satheesh","Peter Knippertz","Andreas H. Fink"],"pdf_url":"https://arxiv.org/pdf/2408.16349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.12326v2","updated":"2024-08-29T08:27:27Z","published":"2024-02-19T18:00:30Z","title":"PsychoGAT: A Novel Psychological Measurement Paradigm through\n Interactive Fiction Games with LLM Agents","summary":" Psychological measurement is essential for mental health, self-understanding,\nand personal development. Traditional methods, such as self-report scales and\npsychologist interviews, often face challenges with engagement and\naccessibility. While game-based and LLM-based tools have been explored to\nimprove user interest and automate assessment, they struggle to balance\nengagement with generalizability. In this work, we propose PsychoGAT\n(Psychological Game AgenTs) to achieve a generic gamification of psychological\nassessment. 
The main insight is that powerful LLMs can function both as adept\npsychologists and innovative game designers. By incorporating LLM agents into\ndesignated roles and carefully managing their interactions, PsychoGAT can\ntransform any standardized scales into personalized and engaging interactive\nfiction games. To validate the proposed method, we conduct psychometric\nevaluations to assess its effectiveness and employ human evaluators to examine\nthe generated content across various psychological constructs, including\ndepression, cognitive distortions, and personality traits. Results demonstrate\nthat PsychoGAT serves as an effective assessment tool, achieving statistically\nsignificant excellence in psychometric metrics such as reliability, convergent\nvalidity, and discriminant validity. Moreover, human evaluations confirm\nPsychoGAT's enhancements in content coherence, interactivity, interest,\nimmersion, and satisfaction.\n","authors":["Qisen Yang","Zekun Wang","Honghui Chen","Shenzhi Wang","Yifan Pu","Xin Gao","Wenhao Huang","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2402.12326v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2408.14144v2","updated":"2024-08-29T08:27:26Z","published":"2024-08-26T09:42:18Z","title":"Neighborhood and Global Perturbations Supported SAM in Federated\n Learning: From Local Tweaks To Global Awareness","summary":" Federated Learning (FL) can be coordinated under the orchestration of a\ncentral server to collaboratively build a privacy-preserving model without the\nneed for data exchange. However, participant data heterogeneity leads to local\noptima divergence, subsequently affecting convergence outcomes. Recent research\nhas focused on global sharpness-aware minimization (SAM) and dynamic\nregularization techniques to enhance consistency between global and local\ngeneralization and optimization objectives. Nonetheless, the estimation of\nglobal SAM introduces additional computational and memory overhead, while\ndynamic regularization suffers from bias in the local and global dual variables\ndue to training isolation. In this paper, we propose a novel FL algorithm,\nFedTOGA, designed to consider optimization and generalization objectives while\nmaintaining minimal uplink communication overhead. By linking local\nperturbations to global updates, global generalization consistency is improved.\nAdditionally, global updates are used to correct local dynamic regularizers,\nreducing dual variables bias and enhancing optimization consistency. Global\nupdates are passively received by clients, reducing overhead. We also propose\nneighborhood perturbation to approximate local perturbation, analyzing its\nstrengths and limitations. Theoretical analysis shows FedTOGA achieves faster\nconvergence $O(1/T)$ under non-convex functions. 
Empirical studies demonstrate\nthat FedTOGA outperforms state-of-the-art algorithms, with a 1\\% accuracy\nincrease and 30\\% faster convergence, achieving state-of-the-art.\n","authors":["Boyuan Li","Zihao Peng","Yafei Li","Mingliang Xu","Shengbo Chen","Baofeng Ji","Cong Shen"],"pdf_url":"https://arxiv.org/pdf/2408.14144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16337v1","updated":"2024-08-29T08:20:02Z","published":"2024-08-29T08:20:02Z","title":"Do Graph Neural Networks Work for High Entropy Alloys?","summary":" Graph neural networks (GNNs) have excelled in predictive modeling for both\ncrystals and molecules, owing to the expressiveness of graph representations.\nHigh-entropy alloys (HEAs), however, lack chemical long-range order, limiting\nthe applicability of current graph representations. To overcome this challenge,\nwe propose a representation of HEAs as a collection of local environment (LE)\ngraphs. Based on this representation, we introduce the LESets machine learning\nmodel, an accurate, interpretable GNN for HEA property prediction. We\ndemonstrate the accuracy of LESets in modeling the mechanical properties of\nquaternary HEAs. Through analyses and interpretation, we further extract\ninsights into the modeling and design of HEAs. In a broader sense, LESets\nextends the potential applicability of GNNs to disordered materials with\ncombinatorial complexity formed by diverse constituents and their flexible\nconfigurations.\n","authors":["Hengrui Zhang","Ruishu Huang","Jie Chen","James M. Rondinelli","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2408.16337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11299v2","updated":"2024-08-29T08:14:31Z","published":"2023-12-18T15:49:03Z","title":"Uncertainty-based Fairness Measures","summary":" Unfair predictions of machine learning (ML) models impede their broad\nacceptance in real-world settings. Tackling this arduous challenge first\nnecessitates defining what it means for an ML model to be fair. This has been\naddressed by the ML community with various measures of fairness that depend on\nthe prediction outcomes of the ML models, either at the group level or the\nindividual level. These fairness measures are limited in that they utilize\npoint predictions, neglecting their variances, or uncertainties, making them\nsusceptible to noise, missingness and shifts in data. In this paper, we first\nshow that an ML model may appear to be fair with existing point-based fairness\nmeasures but biased against a demographic group in terms of prediction\nuncertainties. Then, we introduce new fairness measures based on different\ntypes of uncertainties, namely, aleatoric uncertainty and epistemic\nuncertainty. We demonstrate on many datasets that (i) our uncertainty-based\nmeasures are complementary to existing measures of fairness, and (ii) they\nprovide more insights about the underlying issues leading to bias.\n","authors":["Selim Kuzucu","Jiaee Cheong","Hatice Gunes","Sinan Kalkan"],"pdf_url":"https://arxiv.org/pdf/2312.11299v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16336v1","updated":"2024-08-29T08:14:20Z","published":"2024-08-29T08:14:20Z","title":"GL-TSVM: A robust and smooth twin support vector machine with guardian\n loss function","summary":" Twin support vector machine (TSVM), a variant of support vector machine\n(SVM), has garnered significant attention due to its $3/4$ times lower\ncomputational complexity compared to SVM. 
However, due to the utilization of\nthe hinge loss function, TSVM is sensitive to outliers or noise. To remedy it,\nwe introduce the guardian loss (G-loss), a novel loss function distinguished by\nits asymmetric, bounded, and smooth characteristics. We then fuse the proposed\nG-loss function into the TSVM and yield a robust and smooth classifier termed\nGL-TSVM. Further, to adhere to the structural risk minimization (SRM) principle\nand reduce overfitting, we incorporate a regularization term into the objective\nfunction of GL-TSVM. To address the optimization challenges of GL-TSVM, we\ndevise an efficient iterative algorithm. The experimental analysis on UCI and\nKEEL datasets substantiates the effectiveness of the proposed GL-TSVM in\ncomparison to the baseline models. Moreover, to showcase the efficacy of the\nproposed GL-TSVM in the biomedical domain, we evaluated it on the breast cancer\n(BreaKHis) and schizophrenia datasets. The outcomes strongly demonstrate the\ncompetitiveness of the proposed GL-TSVM against the baseline models.\n","authors":["Mushir Akhtar","M. Tanveer","Mohd. Arshad"],"pdf_url":"https://arxiv.org/pdf/2408.16336v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.18101"},{"id":"http://arxiv.org/abs/2408.16333v1","updated":"2024-08-29T08:12:18Z","published":"2024-08-29T08:12:18Z","title":"Self-Improving Diffusion Models with Synthetic Data","summary":" The artificial intelligence (AI) world is running out of real data for\ntraining increasingly large generative models, resulting in accelerating\npressure to train on synthetic data. Unfortunately, training new generative\nmodels with synthetic data from current or past generation models creates an\nautophagous (self-consuming) loop that degrades the quality and/or diversity of\nthe synthetic data in what has been termed model autophagy disorder (MAD) and\nmodel collapse. Current thinking around model autophagy recommends that\nsynthetic data is to be avoided for model training lest the system deteriorate\ninto MADness. In this paper, we take a different tack that treats synthetic\ndata differently from real data. Self-IMproving diffusion models with Synthetic\ndata (SIMS) is a new training concept for diffusion models that uses\nself-synthesized data to provide negative guidance during the generation\nprocess to steer a model's generative process away from the non-ideal synthetic\ndata manifold and towards the real data distribution. We demonstrate that SIMS\nis capable of self-improvement; it establishes new records based on the\nFr\\'echet inception distance (FID) metric for CIFAR-10 and ImageNet-64\ngeneration and achieves competitive results on FFHQ-64 and ImageNet-512.\nMoreover, SIMS is, to the best of our knowledge, the first prophylactic\ngenerative AI algorithm that can be iteratively trained on self-generated\nsynthetic data without going MAD. 
As a bonus, SIMS can adjust a diffusion\nmodel's synthetic data distribution to match any desired in-domain target\ndistribution to help mitigate biases and ensure fairness.\n","authors":["Sina Alemohammad","Ahmed Imtiaz Humayun","Shruti Agarwal","John Collomosse","Richard Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2408.16333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18269v2","updated":"2024-08-29T08:07:43Z","published":"2024-07-19T22:51:41Z","title":"LaMAGIC: Language-Model-based Topology Generation for Analog Integrated\n Circuits","summary":" In the realm of electronic and electrical engineering, automation of analog\ncircuit is increasingly vital given the complexity and customized requirements\nof modern applications. However, existing methods only develop search-based\nalgorithms that require many simulation iterations to design a custom circuit\ntopology, which is usually a time-consuming process. To this end, we introduce\nLaMAGIC, a pioneering language model-based topology generation model that\nleverages supervised finetuning for automated analog circuit design. LaMAGIC\ncan efficiently generate an optimized circuit design from the custom\nspecification in a single pass. Our approach involves a meticulous development\nand analysis of various input and output formulations for circuit. These\nformulations can ensure canonical representations of circuits and align with\nthe autoregressive nature of LMs to effectively addressing the challenges of\nrepresenting analog circuits as graphs. The experimental results show that\nLaMAGIC achieves a success rate of up to 96\\% under a strict tolerance of 0.01.\nWe also examine the scalability and adaptability of LaMAGIC, specifically\ntesting its performance on more complex circuits. Our findings reveal the\nenhanced effectiveness of our adjacency matrix-based circuit formulation with\nfloating-point input, suggesting its suitability for handling intricate circuit\ndesigns. This research not only demonstrates the potential of language models\nin graph generation, but also builds a foundational framework for future\nexplorations in automated analog circuit design.\n","authors":["Chen-Chia Chang","Yikang Shen","Shaoze Fan","Jing Li","Shun Zhang","Ningyuan Cao","Yiran Chen","Xin Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.18269v2.pdf","comment":"Proceedings of the 41st International Conference on Machine Learning,\n PMLR 235:6253-6262 https://proceedings.mlr.press/v235/chang24c.html"},{"id":"http://arxiv.org/abs/2408.16321v1","updated":"2024-08-29T07:48:55Z","published":"2024-08-29T07:48:55Z","title":"Minimising changes to audit when updating decision trees","summary":" Interpretable models are important, but what happens when the model is\nupdated on new training data? We propose an algorithm for updating a decision\ntree while minimising the number of changes to the tree that a human would need\nto audit. We achieve this via a greedy approach that incorporates the number of\nchanges to the tree as part of the objective function. 
We compare our algorithm\nto existing methods and show that it sits in a sweet spot between final\naccuracy and number of changes to audit.\n","authors":["Anj Simmons","Scott Barnett","Anupam Chaudhuri","Sankhya Singh","Shangeetha Sivasothy"],"pdf_url":"https://arxiv.org/pdf/2408.16321v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2310.15274v2","updated":"2024-08-29T07:32:45Z","published":"2023-10-23T18:20:54Z","title":"1 From the Pursuit of Universal AGI Architecture to Systematic Approach\n to Heterogenous AGI: Addressing Alignment, Energy, & AGI Grand Challenges","summary":" AI faces a trifecta of grand challenges: the Energy Wall, the Alignment\nProblem and the Leap from Narrow AI to AGI. Contemporary AI solutions consume\nunsustainable amounts of energy during model training and daily operations.\nMaking things worse, the amount of computation required to train each new AI\nmodel has been doubling every 2 months since 2020, directly translating to\nunprecedented increases in energy consumption.\n The leap from AI to AGI requires multiple functional subsystems operating in\na balanced manner, which requires a system architecture. However, the current\napproach to artificial intelligence lacks system design; even though system\ncharacteristics play a key role in the human brain; from the way it processes\ninformation to how it makes decisions. System design is the key to alignment,\none of the most challenging goals in AI. This difficulty stems from the fact\nthat the complexity of human moral system requires a similarly sophisticated\nsystem for alignment. Without accurately reflecting the complexity of these\ncore moral subsystems and systems, aligning AI with human values becomes\nsignificantly more challenging.\n In this paper, we posit that system design is the missing piece in overcoming\nthe grand challenges. We present a Systematic Approach to AGI that utilizes\nsystem design principles to AGI, while providing ways to overcome the energy\nwall and the alignment challenges. This paper asserts that artificial\nintelligence can be realized through a multiplicity of design-specific\npathways, rather than a singular, overarching AGI architecture. AGI systems may\nexhibit diverse architectural configurations and capabilities, contingent upon\ntheir intended use cases. It advocates for a focus on employing system design\nprinciples as a guiding framework, rather than solely concentrating on a\nuniversal AGI architecture.\n","authors":["Eren Kurshan"],"pdf_url":"https://arxiv.org/pdf/2310.15274v2.pdf","comment":"International Journal on Semantic Computing (2024) Categories:\n Artificial Intelligence; AI; Artificial General Intelligence; AGI; System\n Design; System Architecture"},{"id":"http://arxiv.org/abs/2408.16315v1","updated":"2024-08-29T07:32:30Z","published":"2024-08-29T07:32:30Z","title":"Passenger hazard perception based on EEG signals for highly automated\n driving vehicles","summary":" Enhancing the safety of autonomous vehicles is crucial, especially given\nrecent accidents involving automated systems. As passengers in these vehicles,\nhumans' sensory perception and decision-making can be integrated with\nautonomous systems to improve safety. This study explores neural mechanisms in\npassenger-vehicle interactions, leading to the development of a Passenger\nCognitive Model (PCM) and the Passenger EEG Decoding Strategy (PEDS). Central\nto PEDS is a novel Convolutional Recurrent Neural Network (CRNN) that captures\nspatial and temporal EEG data patterns. 
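A convolutional-recurrent model of the general shape described above might look like the following PyTorch sketch. It is an illustrative stand-in, not the PEDS architecture; the channel count, hidden sizes, and two-class output are assumptions.

    import torch
    import torch.nn as nn

    class SimpleCRNN(nn.Module):
        """Minimal convolutional-recurrent classifier for multi-channel EEG windows."""
        def __init__(self, n_channels=32, n_classes=2):
            super().__init__()
            # Convolution captures local spatial/temporal structure across channels.
            self.conv = nn.Sequential(
                nn.Conv1d(n_channels, 32, kernel_size=7, padding=3),
                nn.ReLU(),
                nn.MaxPool1d(2),
            )
            # Recurrent layer models longer-range temporal dependencies.
            self.gru = nn.GRU(input_size=32, hidden_size=64, batch_first=True)
            self.head = nn.Linear(64, n_classes)

        def forward(self, x):            # x: (batch, channels, time)
            h = self.conv(x)             # (batch, 32, time / 2)
            h = h.transpose(1, 2)        # (batch, time / 2, 32)
            _, last = self.gru(h)        # last: (1, batch, 64)
            return self.head(last[-1])   # (batch, n_classes) logits

    logits = SimpleCRNN()(torch.randn(8, 32, 256))  # 8 windows, 32 channels, 256 samples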
The CRNN, combined with stacking\nalgorithms, achieves an accuracy of $85.0\\% \\pm 3.18\\%$. Our findings highlight\nthe predictive power of pre-event EEG data, enhancing the detection of\nhazardous scenarios and offering a network-driven framework for safer\nautonomous vehicles.\n","authors":["Ashton Yu Xuan Tan","Yingkai Yang","Xiaofei Zhang","Bowen Li","Xiaorong Gao","Sifa Zheng","Jianqiang Wang","Xinyu Gu","Jun Li","Yang Zhao","Yuxin Zhang","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2408.16315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16293v1","updated":"2024-08-29T06:49:20Z","published":"2024-08-29T06:49:20Z","title":"Physics of Language Models: Part 2.2, How to Learn From Mistakes on\n Grade-School Math Problems","summary":" Language models have demonstrated remarkable performance in solving reasoning\ntasks; however, even the strongest models still occasionally make reasoning\nmistakes. Recently, there has been active research aimed at improving reasoning\naccuracy, particularly by using pretrained language models to \"self-correct\"\ntheir mistakes via multi-round prompting. In this paper, we follow this line of\nwork but focus on understanding the usefulness of incorporating\n\"error-correction\" data directly into the pretraining stage. This data consists\nof erroneous solution steps immediately followed by their corrections. Using a\nsynthetic math dataset, we show promising results: this type of pretrain data\ncan help language models achieve higher reasoning accuracy directly (i.e.,\nthrough simple auto-regression, without multi-round prompting) compared to\npretraining on the same amount of error-free data. We also delve into many\ndetails, such as (1) how this approach differs from beam search, (2) how such\ndata can be prepared, (3) whether masking is needed on the erroneous tokens,\n(4) the amount of error required, (5) whether such data can be deferred to the\nfine-tuning stage, and many others.\n","authors":["Tian Ye","Zicheng Xu","Yuanzhi Li","Zeyuan Allen-Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.16293v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.20311"},{"id":"http://arxiv.org/abs/2408.16291v1","updated":"2024-08-29T06:48:07Z","published":"2024-08-29T06:48:07Z","title":"Flexible framework for generating synthetic electrocardiograms and\n photoplethysmograms","summary":" By generating synthetic biosignals, the quantity and variety of health data\ncan be increased. This is especially useful when training machine learning\nmodels by enabling data augmentation and introduction of more physiologically\nplausible variation to the data. For these purposes, we have developed a\nsynthetic biosignal model for two signal modalities, electrocardiography (ECG)\nand photoplethysmography (PPG). The model produces realistic signals that\naccount for physiological effects such as breathing modulation and changes in\nheart rate due to physical stress. Arrhythmic signals can be generated with\nbeat intervals extracted from real measurements. The model also includes a\nflexible approach to adding different kinds of noise and signal artifacts. The\nnoise is generated from power spectral densities extracted from both measured\nnoisy signals and modeled power spectra. Importantly, the model also\nautomatically produces labels for noise, segmentation (e.g. P and T waves, QRS\ncomplex, for electrocardiograms), and artifacts. 
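To make the idea of component-wise synthesis with automatic labels concrete, a toy NumPy sketch could compose one beat from Gaussian bumps for the P, QRS, and T components and record per-sample labels. This is purely illustrative; the released framework's waveform, noise, and artifact models are far richer, and every constant below is an assumption.

    import numpy as np

    fs = 250                       # sampling rate in Hz (assumed)
    t = np.arange(0, 0.8, 1 / fs)  # one 0.8 s beat

    def bump(center, width, amp):
        return amp * np.exp(-((t - center) ** 2) / (2 * width ** 2))

    # Compose one beat from labelled components (positions/amplitudes are illustrative).
    components = {"P": bump(0.15, 0.02, 0.15),
                  "QRS": bump(0.30, 0.01, 1.0),
                  "T": bump(0.50, 0.04, 0.30)}
    beat = sum(components.values())

    labels = np.full(t.size, "baseline", dtype=object)
    for name, wave in components.items():
        labels[wave > 0.05 * wave.max()] = name   # crude per-sample segmentation labels

    signal = np.tile(beat, 10) + 0.02 * np.random.randn(10 * t.size)  # 10 beats + noise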
We assessed how this\ncomprehensive model can be used in practice to improve the performance of\nmodels trained on ECG or PPG data. For example, we trained an LSTM to detect\nECG R-peaks using both real ECG signals from the MIT-BIH arrythmia set and our\nnew generator. The F1 score of the model was 0.83 using real data, in\ncomparison to 0.98 using our generator. In addition, the model can be used for\nexample in signal segmentation, quality detection and bench-marking detection\nalgorithms. The model code has been released in\n\\url{https://github.com/UTU-Health-Research/framework_for_synthetic_biosignals}\n","authors":["Katri Karhinoja","Antti Vasankari","Jukka-Pekka Sirkiä","Antti Airola","David Wong","Matti Kaisti"],"pdf_url":"https://arxiv.org/pdf/2408.16291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16288v1","updated":"2024-08-29T06:40:01Z","published":"2024-08-29T06:40:01Z","title":"OpenFGL: A Comprehensive Benchmarks for Federated Graph Learning","summary":" Federated graph learning (FGL) has emerged as a promising distributed\ntraining paradigm for graph neural networks across multiple local systems\nwithout direct data sharing. This approach is particularly beneficial in\nprivacy-sensitive scenarios and offers a new perspective on addressing\nscalability challenges in large-scale graph learning. Despite the proliferation\nof FGL, the diverse motivations from practical applications, spanning various\nresearch backgrounds and experimental settings, pose a significant challenge to\nfair evaluation. To fill this gap, we propose OpenFGL, a unified benchmark\ndesigned for the primary FGL scenarios: Graph-FL and Subgraph-FL. Specifically,\nOpenFGL includes 38 graph datasets from 16 application domains, 8 federated\ndata simulation strategies that emphasize graph properties, and 5 graph-based\ndownstream tasks. Additionally, it offers 18 recently proposed SOTA FGL\nalgorithms through a user-friendly API, enabling a thorough comparison and\ncomprehensive evaluation of their effectiveness, robustness, and efficiency.\nEmpirical results demonstrate the ability of FGL while also revealing its\npotential limitations, offering valuable insights for future exploration in\nthis thriving field.\n","authors":["Xunkai Li","Yinlin Zhu","Boyang Pang","Guochen Yan","Yeyu Yan","Zening Li","Zhengyu Wu","Wentao Zhang","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16288v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.16286v1","updated":"2024-08-29T06:37:16Z","published":"2024-08-29T06:37:16Z","title":"Near-Optimal Policy Identification in Robust Constrained Markov Decision\n Processes via Epigraph Form","summary":" Designing a safe policy for uncertain environments is crucial in real-world\ncontrol applications. However, this challenge remains inadequately addressed\nwithin the Markov decision process (MDP) framework. This paper presents the\nfirst algorithm capable of identifying a near-optimal policy in a robust\nconstrained MDP (RCMDP), where an optimal policy minimizes cumulative cost\nwhile satisfying constraints in the worst-case scenario across a set of\nenvironments. We first prove that the conventional Lagrangian max-min\nformulation with policy gradient methods can become trapped in suboptimal\nsolutions by encountering a sum of conflicting gradients from the objective and\nconstraint functions during its inner minimization problem. 
To address this, we\nleverage the epigraph form of the RCMDP problem, which resolves the conflict by\nselecting a single gradient from either the objective or the constraints.\nBuilding on the epigraph form, we propose a binary search algorithm with a\npolicy gradient subroutine and prove that it identifies an\n$\\varepsilon$-optimal policy in an RCMDP with\n$\\tilde{\\mathcal{O}}(\\varepsilon^{-4})$ policy evaluations.\n","authors":["Toshinori Kitamura","Tadashi Kozuno","Wataru Kumagai","Kenta Hoshino","Yohei Hosoe","Kazumi Kasaura","Masashi Hamaya","Paavo Parmas","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2408.16286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16285v1","updated":"2024-08-29T06:30:23Z","published":"2024-08-29T06:30:23Z","title":"ART: Actually Robust Training","summary":" Current interest in deep learning captures the attention of many programmers\nand researchers. Unfortunately, the lack of a unified schema for developing\ndeep learning models results in methodological inconsistencies, unclear\ndocumentation, and problems with reproducibility. Some guidelines have been\nproposed, yet currently, they lack practical implementations. Furthermore,\nneural network training often takes on the form of trial and error, lacking a\nstructured and thoughtful process. To alleviate these issues, in this paper, we\nintroduce Art, a Python library designed to help automatically impose rules and\nstandards while developing deep learning pipelines. Art divides model\ndevelopment into a series of smaller steps of increasing complexity, each\nconcluded with a validation check improving the interpretability and robustness\nof the process. The current version of Art comes equipped with nine predefined\nsteps inspired by Andrej Karpathy's Recipe for Training Neural Networks, a\nvisualization dashboard, and integration with loggers such as Neptune. The code\nrelated to this paper is available at:\nhttps://github.com/SebChw/Actually-Robust-Training.\n","authors":["Sebastian Chwilczyński","Kacper Trębacz","Karol Cyganik","Mateusz Małecki","Dariusz Brzezinski"],"pdf_url":"https://arxiv.org/pdf/2408.16285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16284v1","updated":"2024-08-29T06:27:42Z","published":"2024-08-29T06:27:42Z","title":"Enhancing Customer Churn Prediction in Telecommunications: An Adaptive\n Ensemble Learning Approach","summary":" Customer churn, the discontinuation of services by existing customers, poses\na significant challenge to the telecommunications industry. This paper proposes\na novel adaptive ensemble learning framework for highly accurate customer churn\nprediction. The framework integrates multiple base models, including XGBoost,\nLightGBM, LSTM, a Multi-Layer Perceptron (MLP) neural network, and Support\nVector Machine (SVM). These models are strategically combined using a stacking\nensemble method, further enhanced by meta-feature generation from base model\npredictions. A rigorous data preprocessing pipeline, coupled with a\nmulti-faceted feature engineering approach, optimizes model performance. The\nframework is evaluated on three publicly available telecom churn datasets,\ndemonstrating substantial accuracy improvements over state-of-the-art\ntechniques. 
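The stacking idea behind the churn framework can be sketched with scikit-learn stand-ins. The paper's base learners include XGBoost, LightGBM, an LSTM, an MLP, and an SVM plus engineered meta-features; the snippet below substitutes simpler estimators and synthetic data purely for illustration.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neural_network import MLPClassifier
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=2000, n_features=20, random_state=0)  # stand-in churn data

    base_models = [
        ("gbt", GradientBoostingClassifier(random_state=0)),   # stand-in for XGBoost/LightGBM
        ("mlp", MLPClassifier(max_iter=500, random_state=0)),
        ("svm", SVC(probability=True, random_state=0)),
    ]
    # Stacking: base-model probabilities become meta-features for a final estimator.
    churn_model = StackingClassifier(estimators=base_models,
                                     final_estimator=LogisticRegression(),
                                     stack_method="predict_proba", cv=5)
    churn_model.fit(X, y)
    print(churn_model.score(X, y))

The final logistic-regression layer here plays the role of the meta-learner; any calibrated classifier could be swapped in.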
The research achieves a remarkable 99.28% accuracy, signifying a\nmajor advancement in churn prediction. The implications of this research for\ndeveloping proactive customer retention strategies within the telecommunications\nindustry are discussed.\n","authors":["Mohammed Affan Shaikhsurab","Pramod Magadum"],"pdf_url":"https://arxiv.org/pdf/2408.16284v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.07288v2","updated":"2024-08-29T06:22:48Z","published":"2024-05-12T14:01:05Z","title":"Erasing Concepts from Text-to-Image Diffusion Models with Few-shot\n Unlearning","summary":" Generating images from text has become easier because of the scaling of\ndiffusion models and advancements in the field of vision and language. These\nmodels are trained using vast amounts of data from the Internet. Hence, they\noften contain undesirable content such as copyrighted material. As it is\nchallenging to remove such data and retrain the models, methods for erasing\nspecific concepts from pre-trained models have been investigated. We propose a\nnovel concept-erasure method that updates the text encoder using few-shot\nunlearning in which a few real images are used. The discussion regarding the\ngenerated images after erasing a concept has been lacking. While there are\nmethods for specifying the transition destination for concepts, the validity of\nthe specified concepts is unclear. Our method implicitly achieves this by\ntransitioning to the latent concepts inherent in the model or the images. Our\nmethod can erase a concept within 10 s, making concept erasure more accessible\nthan ever before. Implicitly transitioning to related concepts leads to more\nnatural concept erasure. We applied the proposed method to various concepts and\nconfirmed that concept erasure can be achieved tens to hundreds of times faster\nthan with current methods. By varying the parameters to be updated, we obtained\nresults suggesting that, like previous research, knowledge is primarily\naccumulated in the feed-forward networks of the text encoder. Our code is\navailable at \\url{https://github.com/fmp453/few-shot-erasing}\n","authors":["Masane Fuchi","Tomohiro Takagi"],"pdf_url":"https://arxiv.org/pdf/2405.07288v2.pdf","comment":"25 pages, 28 figures, accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2304.14326v2","updated":"2024-08-29T06:17:11Z","published":"2023-04-27T16:58:29Z","title":"A Best-of-Both-Worlds Algorithm for Constrained MDPs with Long-Term\n Constraints","summary":" We study online learning in episodic constrained Markov decision processes\n(CMDPs), where the learner aims at collecting as much reward as possible over\nthe episodes, while satisfying some long-term constraints during the learning\nprocess. Rewards and constraints can be selected either stochastically or\nadversarially, and the transition function is not known to the learner. While\nonline learning in classical (unconstrained) MDPs has received considerable\nattention over the last years, the setting of CMDPs is still largely\nunexplored. This is surprising, since in real-world applications, such as,\ne.g., autonomous driving, automated bidding, and recommender systems, there are\nusually additional constraints and specifications that an agent has to obey\nduring the learning process. In this paper, we provide the first\nbest-of-both-worlds algorithm for CMDPs with long-term constraints, in the\nflavor of Balseiro et al. (2023). 
Our algorithm is capable of handling settings\nin which rewards and constraints are selected either stochastically or\nadversarially, without requiring any knowledge of the underlying process.\nMoreover, our algorithm matches state-of-the-art regret and constraint\nviolation bounds for settings in which constraints are selected stochastically,\nwhile it is the first to provide guarantees in the case in which they are\nchosen adversarially.\n","authors":["Jacopo Germano","Francesco Emanuele Stradi","Gianmarco Genalti","Matteo Castiglioni","Alberto Marchesi","Nicola Gatti"],"pdf_url":"https://arxiv.org/pdf/2304.14326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14014v2","updated":"2024-08-29T06:04:57Z","published":"2024-08-26T04:39:33Z","title":"Category-Theoretical and Topos-Theoretical Frameworks in Machine\n Learning: A Survey","summary":" In this survey, we provide an overview of category theory-derived machine\nlearning from four mainstream perspectives: gradient-based learning,\nprobability-based learning, invariance and equivalence-based learning, and\ntopos-based learning. For the first three topics, we primarily review research\nin the past five years, updating and expanding on the previous survey by\nShiebler et al. The fourth topic, which delves into higher category theory,\nparticularly topos theory, is surveyed for the first time in this paper. In\ncertain machine learning methods, the compositionality of functors plays a\nvital role, prompting the development of specific categorical frameworks.\nHowever, when considering how the global properties of a network reflect in\nlocal structures and how geometric properties are expressed with logic, the\ntopos structure becomes particularly significant and profound.\n","authors":["Yiyang Jia","Guohong Peng","Zheng Yang","Tianhao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06380v3","updated":"2024-08-29T05:59:24Z","published":"2023-10-10T07:46:54Z","title":"CAST: Cluster-Aware Self-Training for Tabular Data via Reliable\n Confidence","summary":" Tabular data is one of the most widely used data modalities, encompassing\nnumerous datasets with substantial amounts of unlabeled data. Despite this\nprevalence, there is a notable lack of simple and versatile methods for\nutilizing unlabeled data in the tabular domain, where both gradient-boosting\ndecision trees and neural networks are employed. In this context, self-training\nhas gained traction due to its simplicity and versatility, yet it is\nvulnerable to noisy pseudo-labels caused by erroneous confidence. Several\nsolutions have been proposed to handle this problem, but they often compromise\nthe inherent advantages of self-training, resulting in limited applicability in\nthe tabular domain. To address this issue, we explore a novel direction of\nreliable confidence in self-training contexts and conclude that self-training\ncan be improved by ensuring that the confidence, which represents the value of\nthe pseudo-label, aligns with the cluster assumption. In this regard, we\npropose Cluster-Aware Self-Training (CAST) for tabular data, which enhances\nexisting self-training algorithms at a negligible cost while maintaining\nsimplicity and versatility. Concretely, CAST calibrates confidence by\nregularizing the classifier's confidence based on local density for each class\nin the labeled training data, resulting in lower confidence for pseudo-labels\nin low-density regions. 
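One plausible way to realize this kind of density-aware confidence calibration is sketched below; it is not the CAST procedure itself. It assumes per-class kernel density estimates fit on the labeled data and probability columns ordered as np.unique(y_labeled) (scikit-learn's convention).

    import numpy as np
    from sklearn.neighbors import KernelDensity

    def calibrate_confidence(proba, X_unlabeled, X_labeled, y_labeled, bandwidth=1.0):
        """Scale predicted class probabilities by a per-class local-density factor in [0, 1]."""
        classes = np.unique(y_labeled)          # assumed to match proba's column order
        density = np.zeros_like(proba)
        for j, c in enumerate(classes):
            kde = KernelDensity(bandwidth=bandwidth).fit(X_labeled[y_labeled == c])
            log_d = kde.score_samples(X_unlabeled)
            # Normalize each class's density to [0, 1] so it acts as a soft regularizer.
            density[:, j] = np.exp(log_d - log_d.max())
        calibrated = proba * density            # low density => lower confidence
        return calibrated / calibrated.sum(axis=1, keepdims=True)

Pseudo-labels would then be selected from the calibrated probabilities, so samples in sparse regions of a class are less likely to pass the confidence threshold.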
Extensive empirical evaluations on up to 21 real-world\ndatasets confirm not only the superior performance of CAST but also its\nrobustness in various setups in self-training contexts.\n","authors":["Minwook Kim","Juseong Kim","Ki Beom Kim","Giltae Song"],"pdf_url":"https://arxiv.org/pdf/2310.06380v3.pdf","comment":"11 pages for main body, and 10 additional pages for appendix"},{"id":"http://arxiv.org/abs/2408.16278v1","updated":"2024-08-29T05:56:35Z","published":"2024-08-29T05:56:35Z","title":"Web Service QoS Prediction via Extended Canonical Polyadic-based Tensor\n Network","summary":" Today, numerous web services with similar functionalities are available on\nthe Internet. Users often evaluate the Quality of Service (QoS) to choose the\nbest option among them. Predicting the QoS values of these web services is a\nsignificant challenge in the field of web services. A Canonical Polyadic\n(CP)-based tensor network model has proven to be efficient for predicting\ndynamic QoS data. However, current CP-based tensor network models do not\nconsider the correlation of users and services in the low-dimensional latent\nfeature space, thereby limiting model's prediction capability. To tackle this\nissue, this paper proposes an Extended Canonical polyadic-based Tensor Network\n(ECTN) model. It models the correlation of users and services via building a\nrelation dimension between user feature and service feature in low-dimensional\nspace, and then designs an extended CP decomposition structure to improve\nprediction accuracy. Experiments are conducted on two public dynamic QoS data,\nand the results show that compared with state-of-the-art QoS prediction models,\nthe ECTN obtains higher prediction accuracy.\n","authors":["Qu Wang","Hao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16278v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.01942v4","updated":"2024-08-29T05:48:42Z","published":"2024-03-04T11:24:51Z","title":"Mitigating Label Noise on Graph via Topological Sample Selection","summary":" Despite the success of the carefully-annotated benchmarks, the effectiveness\nof existing graph neural networks (GNNs) can be considerably impaired in\npractice when the real-world graph data is noisily labeled. Previous\nexplorations in sample selection have been demonstrated as an effective way for\nrobust learning with noisy labels, however, the conventional studies focus on\ni.i.d data, and when moving to non-iid graph data and GNNs, two notable\nchallenges remain: (1) nodes located near topological class boundaries are very\ninformative for classification but cannot be successfully distinguished by the\nheuristic sample selection. (2) there is no available measure that considers\nthe graph topological information to promote sample selection in a graph. To\naddress this dilemma, we propose a $\\textit{Topological Sample Selection}$\n(TSS) method that boosts the informative sample selection process in a graph by\nutilising topological information. 
We theoretically prove that our procedure\nminimizes an upper bound of the expected risk under target clean distribution,\nand experimentally show the superiority of our method compared with\nstate-of-the-art baselines.\n","authors":["Yuhao Wu","Jiangchao Yao","Xiaobo Xia","Jun Yu","Ruxin Wang","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.01942v4.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.14400v2","updated":"2024-08-29T05:37:38Z","published":"2024-08-26T16:34:13Z","title":"Satellite Sunroof: High-res Digital Surface Models and Roof Segmentation\n for Global Solar Mapping","summary":" The transition to renewable energy, particularly solar, is key to mitigating\nclimate change. Google's Solar API aids this transition by estimating solar\npotential from aerial imagery, but its impact is constrained by geographical\ncoverage. This paper proposes expanding the API's reach using satellite\nimagery, enabling global solar potential assessment. We tackle challenges\ninvolved in building a Digital Surface Model (DSM) and roof instance\nsegmentation from lower resolution and single oblique views using deep learning\nmodels. Our models, trained on aligned satellite and aerial datasets, produce\n25cm DSMs and roof segments. With ~1m DSM MAE on buildings, ~5deg roof pitch\nerror and ~56% IOU on roof segmentation, they significantly enhance the Solar\nAPI's potential to promote solar adoption.\n","authors":["Vishal Batchu","Alex Wilson","Betty Peng","Carl Elkin","Umangi Jain","Christopher Van Arsdale","Ross Goroshin","Varun Gulshan"],"pdf_url":"https://arxiv.org/pdf/2408.14400v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.16262v1","updated":"2024-08-29T04:57:44Z","published":"2024-08-29T04:57:44Z","title":"On Convergence of Average-Reward Q-Learning in Weakly Communicating\n Markov Decision Processes","summary":" This paper analyzes reinforcement learning (RL) algorithms for Markov\ndecision processes (MDPs) under the average-reward criterion. We focus on\nQ-learning algorithms based on relative value iteration (RVI), which are\nmodel-free stochastic analogues of the classical RVI method for average-reward\nMDPs. These algorithms have low per-iteration complexity, making them\nwell-suited for large state space problems. We extend the almost-sure\nconvergence analysis of RVI Q-learning algorithms developed by Abounadi,\nBertsekas, and Borkar (2001) from unichain to weakly communicating MDPs. This\nextension is important both practically and theoretically: weakly communicating\nMDPs cover a much broader range of applications compared to unichain MDPs, and\ntheir optimality equations have a richer solution structure (with multiple\ndegrees of freedom), introducing additional complexity in proving algorithmic\nconvergence. We also characterize the sets to which RVI Q-learning algorithms\nconverge, showing that they are compact, connected, potentially nonconvex, and\ncomprised of solutions to the average-reward optimality equation, with exactly\none less degree of freedom than the general solution set of this equation.\nFurthermore, we extend our analysis to two RVI-based hierarchical\naverage-reward RL algorithms using the options framework, proving their\nalmost-sure convergence and characterizing their sets of convergence under the\nassumption that the underlying semi-Markov decision process is weakly\ncommunicating.\n","authors":["Yi Wan","Huizhen Yu","Richard S. 
Sutton"],"pdf_url":"https://arxiv.org/pdf/2408.16262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16261v1","updated":"2024-08-29T04:46:49Z","published":"2024-08-29T04:46:49Z","title":"Evaluating Time-Series Training Dataset through Lens of Spectrum in Deep\n State Space Models","summary":" This study investigates a method to evaluate time-series datasets in terms of\nthe performance of deep neural networks (DNNs) with state space models (deep\nSSMs) trained on the dataset. SSMs have attracted attention as components\ninside DNNs to address time-series data. Since deep SSMs have powerful\nrepresentation capacities, training datasets play a crucial role in solving a\nnew task. However, the effectiveness of training datasets cannot be known until\ndeep SSMs are actually trained on them. This can increase the cost of data\ncollection for new tasks, as a trial-and-error process of data collection and\ntime-consuming training are needed to achieve the necessary performance. To\nadvance the practical use of deep SSMs, the metric of datasets to estimate the\nperformance early in the training can be one key element. To this end, we\nintroduce the concept of data evaluation methods used in system identification.\nIn system identification of linear dynamical systems, the effectiveness of\ndatasets is evaluated by using the spectrum of input signals. We introduce this\nconcept to deep SSMs, which are nonlinear dynamical systems. We propose the\nK-spectral metric, which is the sum of the top-K spectra of signals inside deep\nSSMs, by focusing on the fact that each layer of a deep SSM can be regarded as\na linear dynamical system. Our experiments show that the K-spectral metric has\na large absolute value of the correlation coefficient with the performance and\ncan be used to evaluate the quality of training datasets.\n","authors":["Sekitoshi Kanai","Yasutoshi Ida","Kazuki Adachi","Mihiro Uchida","Tsukasa Yoshida","Shin'ya Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2408.16261v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.16256v1","updated":"2024-08-29T04:35:36Z","published":"2024-08-29T04:35:36Z","title":"Coalitions of AI-based Methods Predict 15-Year Risks of Breast Cancer\n Metastasis Using Real-World Clinical Data with AUC up to 0.9","summary":" Breast cancer is one of the two cancers responsible for the most deaths in\nwomen, with about 42,000 deaths each year in the US. That there are over\n300,000 breast cancers newly diagnosed each year suggests that only a fraction\nof the cancers result in mortality. Thus, most of the women undergo seemingly\ncurative treatment for localized cancers, but a significant later succumb to\nmetastatic disease for which current treatments are only temporizing for the\nvast majority. The current prognostic metrics are of little actionable value\nfor 4 of the 5 women seemingly cured after local treatment, and many women are\nexposed to morbid and even mortal adjuvant therapies unnecessarily, with these\nadjuvant therapies reducing metastatic recurrence by only a third. Thus, there\nis a need for better prognostics to target aggressive treatment at those who\nare likely to relapse and spare those who were actually cured. While there is a\nplethora of molecular and tumor-marker assays in use and under-development to\ndetect recurrence early, these are time consuming, expensive and still often\nun-validated as to actionable prognostic utility. 
A different approach would\nuse large data techniques to determine clinical and histopathological\nparameters that would provide accurate prognostics using existing data. Herein,\nwe report on machine learning, together with grid search and Bayesian Networks\nto develop algorithms that present a AUC of up to 0.9 in ROC analyses, using\nonly extant data. Such algorithms could be rapidly translated to clinical\nmanagement as they do not require testing beyond routine tumor evaluations.\n","authors":["Xia Jiang","Yijun Zhou","Alan Wells","Adam Brufsky"],"pdf_url":"https://arxiv.org/pdf/2408.16256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16249v1","updated":"2024-08-29T04:06:34Z","published":"2024-08-29T04:06:34Z","title":"Iterated Energy-based Flow Matching for Sampling from Boltzmann\n Densities","summary":" In this work, we consider the problem of training a generator from\nevaluations of energy functions or unnormalized densities. This is a\nfundamental problem in probabilistic inference, which is crucial for scientific\napplications such as learning the 3D coordinate distribution of a molecule. To\nsolve this problem, we propose iterated energy-based flow matching (iEFM), the\nfirst off-policy approach to train continuous normalizing flow (CNF) models\nfrom unnormalized densities. We introduce the simulation-free energy-based flow\nmatching objective, which trains the model to predict the Monte Carlo\nestimation of the marginal vector field constructed from known energy\nfunctions. Our framework is general and can be extended to variance-exploding\n(VE) and optimal transport (OT) conditional probability paths. We evaluate iEFM\non a two-dimensional Gaussian mixture model (GMM) and an eight-dimensional\nfour-particle double-well potential (DW-4) energy function. Our results\ndemonstrate that iEFM outperforms existing methods, showcasing its potential\nfor efficient and scalable probabilistic modeling in complex high-dimensional\nsystems.\n","authors":["Dongyeop Woo","Sungsoo Ahn"],"pdf_url":"https://arxiv.org/pdf/2408.16249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16246v1","updated":"2024-08-29T03:58:19Z","published":"2024-08-29T03:58:19Z","title":"PACiM: A Sparsity-Centric Hybrid Compute-in-Memory Architecture via\n Probabilistic Approximation","summary":" Approximate computing emerges as a promising approach to enhance the\nefficiency of compute-in-memory (CiM) systems in deep neural network\nprocessing. However, traditional approximate techniques often significantly\ntrade off accuracy for power efficiency, and fail to reduce data transfer\nbetween main memory and CiM banks, which dominates power consumption. This\npaper introduces a novel probabilistic approximate computation (PAC) method\nthat leverages statistical techniques to approximate multiply-and-accumulation\n(MAC) operations, reducing approximation error by 4X compared to existing\napproaches. PAC enables efficient sparsity-based computation in CiM systems by\nsimplifying complex MAC vector computations into scalar calculations. Moreover,\nPAC enables sparsity encoding and eliminates the LSB activations transmission,\nsignificantly reducing data reads and writes. This sets PAC apart from\ntraditional approximate computing techniques, minimizing not only computation\npower but also memory accesses by 50%, thereby boosting system-level\nefficiency. 
We developed PACiM, a sparsity-centric architecture that fully\nexploits sparsity to reduce bit-serial cycles by 81% and achieves a peak 8b/8b\nefficiency of 14.63 TOPS/W in 65 nm CMOS while maintaining high accuracy of\n93.85/72.36/66.02% on CIFAR-10/CIFAR-100/ImageNet benchmarks using a ResNet-18\nmodel, demonstrating the effectiveness of our PAC methodology.\n","authors":["Wenlun Zhang","Shimpei Ando","Yung-Chin Chen","Satomi Miyagi","Shinya Takamaeda-Yamazaki","Kentaro Yoshioka"],"pdf_url":"https://arxiv.org/pdf/2408.16246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16245v1","updated":"2024-08-29T03:56:40Z","published":"2024-08-29T03:56:40Z","title":"Large-Scale Multi-omic Biosequence Transformers for Modeling\n Peptide-Nucleotide Interactions","summary":" The transformer architecture has revolutionized bioinformatics and driven\nprogress in the understanding and prediction of the properties of biomolecules.\nAlmost all research on large-scale biosequence transformers has focused on one\ndomain at a time (single-omic), usually nucleotides or peptides. These models\nhave seen incredible success in downstream tasks in each domain and have\nachieved particularly noteworthy breakthroughs in sequences of peptides and\nstructural modeling. However, these single-omic models are naturally incapable\nof modeling multi-omic tasks, one of the most biologically critical being\nnucleotide-peptide interactions.\n We present our work training the first multi-omic nucleotide-peptide\nfoundation models. We show that these multi-omic models (MOMs) can learn joint\nrepresentations between various single-omic distributions that are emergently\nconsistent with the Central Dogma of molecular biology, despite only being\ntrained on unlabeled biosequences. We further demonstrate that MOMs can be\nfine-tuned to achieve state-of-the-art results on peptide-nucleotide\ninteraction tasks, namely predicting the change in Gibbs free energy\n({\\Delta}G) of the binding interaction between a given oligonucleotide and\npeptide, as well as the effect on this binding interaction due to mutations in\nthe oligonucleotide sequence ({\\Delta}{\\Delta}G).\n Remarkably, we show that multi-omic biosequence transformers emergently learn\nuseful structural information without any prior structural training, allowing\nus to predict which peptide residues are most involved in the\npeptide-nucleotide binding interaction. Lastly, we provide evidence that\nmulti-omic biosequence models are non-inferior to foundation models trained on\nsingle-omics distributions, suggesting a more generalized or foundational\napproach to building these models.\n","authors":["Sully F. Chen","Robert J. Steele","Beakal Lemeneh","Shivanand P. Lad","Eric Oermann"],"pdf_url":"https://arxiv.org/pdf/2408.16245v1.pdf","comment":"27 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.16232v1","updated":"2024-08-29T03:12:04Z","published":"2024-08-29T03:12:04Z","title":"Enhancing Conditional Image Generation with Explainable Latent Space\n Manipulation","summary":" In the realm of image synthesis, achieving fidelity to a reference image\nwhile adhering to conditional prompts remains a significant challenge. This\npaper proposes a novel approach that integrates a diffusion model with latent\nspace manipulation and gradient-based selective attention mechanisms to address\nthis issue. 
Leveraging Grad-SAM (Gradient-based Selective Attention\nManipulation), we analyze the cross attention maps of the cross attention\nlayers and gradients for the denoised latent vector, deriving importance scores\nof elements of denoised latent vector related to the subject of interest. Using\nthis information, we create masks at specific timesteps during denoising to\npreserve subjects while seamlessly integrating the reference image features.\nThis approach ensures the faithful formation of subjects based on conditional\nprompts, while concurrently refining the background for a more coherent\ncomposition. Our experiments on places365 dataset demonstrate promising\nresults, with our proposed model achieving the lowest mean and median Frechet\nInception Distance (FID) scores compared to baseline models, indicating\nsuperior fidelity preservation. Furthermore, our model exhibits competitive\nperformance in aligning the generated images with provided textual\ndescriptions, as evidenced by high CLIP scores. These results highlight the\neffectiveness of our approach in both fidelity preservation and textual context\npreservation, offering a significant advancement in text-to-image synthesis\ntasks.\n","authors":["Kshitij Pathania"],"pdf_url":"https://arxiv.org/pdf/2408.16232v1.pdf","comment":"7 pages , 5 figures"},{"id":"http://arxiv.org/abs/2312.02139v3","updated":"2024-08-29T03:09:40Z","published":"2023-12-04T18:57:01Z","title":"DiffiT: Diffusion Vision Transformers for Image Generation","summary":" Diffusion models with their powerful expressivity and high sample quality\nhave achieved State-Of-The-Art (SOTA) performance in the generative domain. The\npioneering Vision Transformer (ViT) has also demonstrated strong modeling\ncapabilities and scalability, especially for recognition tasks. In this paper,\nwe study the effectiveness of ViTs in diffusion-based generative learning and\npropose a new model denoted as Diffusion Vision Transformers (DiffiT).\nSpecifically, we propose a methodology for finegrained control of the denoising\nprocess and introduce the Time-dependant Multihead Self Attention (TMSA)\nmechanism. DiffiT is surprisingly effective in generating high-fidelity images\nwith significantly better parameter efficiency. We also propose latent and\nimage space DiffiT models and show SOTA performance on a variety of\nclass-conditional and unconditional synthesis tasks at different resolutions.\nThe Latent DiffiT model achieves a new SOTA FID score of 1.73 on ImageNet256\ndataset while having 19.85%, 16.88% less parameters than other\nTransformer-based diffusion models such as MDT and DiT,respectively. Code:\nhttps://github.com/NVlabs/DiffiT\n","authors":["Ali Hatamizadeh","Jiaming Song","Guilin Liu","Jan Kautz","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2312.02139v3.pdf","comment":"Accepted to ECCV'24"},{"id":"http://arxiv.org/abs/2408.16228v1","updated":"2024-08-29T03:03:35Z","published":"2024-08-29T03:03:35Z","title":"Policy Adaptation via Language Optimization: Decomposing Tasks for\n Few-Shot Imitation","summary":" Learned language-conditioned robot policies often struggle to effectively\nadapt to new real-world tasks even when pre-trained across a diverse set of\ninstructions. We propose a novel approach for few-shot adaptation to unseen\ntasks that exploits the semantic understanding of task decomposition provided\nby vision-language models (VLMs). 
Our method, Policy Adaptation via Language\nOptimization (PALO), combines a handful of demonstrations of a task with\nproposed language decompositions sampled from a VLM to enable rapid\nnonparametric adaptation, avoiding the need for a larger fine-tuning dataset.\nWe evaluate PALO on extensive real-world experiments consisting of challenging\nunseen, long-horizon robot manipulation tasks. We find that PALO is able to\nconsistently complete long-horizon, multi-tier tasks in the real world,\noutperforming state-of-the-art pre-trained generalist policies and methods\nthat have access to the same demonstrations.\n","authors":["Vivek Myers","Bill Chunyuan Zheng","Oier Mees","Sergey Levine","Kuan Fang"],"pdf_url":"https://arxiv.org/pdf/2408.16228v1.pdf","comment":"27 pages, 14 figures"},{"id":"http://arxiv.org/abs/2307.03411v2","updated":"2024-08-29T02:45:14Z","published":"2023-07-07T06:26:44Z","title":"Learning from Heterogeneity: A Dynamic Learning Framework for\n Hypergraphs","summary":" Graph neural network (GNN) has gained increasing popularity in recent years\nowing to its capability and flexibility in modeling complex graph structure\ndata. Among all graph learning methods, hypergraph learning is a technique for\nexploring the implicit higher-order correlations when training the embedding\nspace of the graph. In this paper, we propose a hypergraph learning framework\nnamed LFH that is capable of dynamic hyperedge construction and attentive\nembedding update utilizing the heterogeneity attributes of the graph.\nSpecifically, in our framework, the high-quality features are first generated\nby the pairwise fusion strategy that utilizes explicit graph structure\ninformation when generating initial node embedding. Afterwards, a hypergraph is\nconstructed through the dynamic grouping of implicit hyperedges, followed by\nthe type-specific hypergraph learning process. To evaluate the effectiveness of\nour proposed framework, we conduct comprehensive experiments on several popular\ndatasets with eleven state-of-the-art models on both node classification and\nlink prediction tasks, which fall into categories of homogeneous pairwise graph\nlearning, heterogeneous pairwise graph learning, and hypergraph learning. The\nexperiment results demonstrate a significant performance gain (average 12.5% in\nnode classification and 13.3% in link prediction) compared with recent\nstate-of-the-art methods.\n","authors":["Tiehua Zhang","Yuze Liu","Zhishu Shen","Xingjun Ma","Peng Qi","Zhijun Ding","Jiong Jin"],"pdf_url":"https://arxiv.org/pdf/2307.03411v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07585v2","updated":"2024-08-29T02:37:03Z","published":"2024-03-12T12:15:57Z","title":"Communication Optimization for Distributed Training: Architecture,\n Advances, and Opportunities","summary":" The past few years have witnessed the flourishing of large-scale deep neural\nnetwork models with ever-growing parameter numbers. Training such large-scale\nmodels typically requires massive memory and computing resources, necessitating\ndistributed training. As GPU performance has rapidly evolved in recent years,\ncomputation time has shrunk, making communication a larger portion of the\noverall training time. Consequently, optimizing communication for distributed\ntraining has become crucial. 
In this article, we briefly introduce the general\narchitecture of distributed deep neural network training and analyze\nrelationships among Parallelization Strategy, Collective Communication Library,\nand Network from the perspective of communication optimization, which forms a\nthree-layer paradigm. We then review current representative research advances\nwithin this three-layer paradigm. We find that layers in the current\nthree-layer paradigm are relatively independent and there is a rich design\nspace for cross-layer collaborative optimization in distributed training\nscenarios. Therefore, we advocate \"Vertical\" and \"Horizontal\" co-designs which\nextend the three-layer paradigm to a five-layer paradigm. We also advocate\n\"Intra-Inter\" and \"Host-Net\" co-designs to further utilize the potential of\nheterogeneous resources. We hope this article can shed some light on future\nresearch on communication optimization for distributed training.\n","authors":["Yunze Wei","Tianshuo Hu","Cong Liang","Yong Cui"],"pdf_url":"https://arxiv.org/pdf/2403.07585v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.21191v2","updated":"2024-08-29T02:27:19Z","published":"2024-07-30T20:58:36Z","title":"GenRec: Generative Sequential Recommendation with Large Language Models","summary":" Sequential recommendation is a task to capture hidden user preferences from\nhistorical user item interaction data and recommend next items for the user.\nSignificant progress has been made in this domain by leveraging classification\nbased learning methods. Inspired by the recent paradigm of 'pretrain, prompt\nand predict' in NLP, we consider sequential recommendation as a sequence to\nsequence generation task and propose a novel model named Generative\nRecommendation (GenRec). Unlike classification based models that learn explicit\nuser and item representations, GenRec utilizes the sequence modeling capability\nof Transformer and adopts the masked item prediction objective to effectively\nlearn the hidden bidirectional sequential patterns. Different from existing\ngenerative sequential recommendation models, GenRec does not rely on manually\ndesigned hard prompts. The input to GenRec is textual user item sequence and\nthe output is top ranked next items. Moreover, GenRec is lightweight and\nrequires only a few hours to train effectively in low-resource settings, making\nit highly applicable to real-world scenarios and helping to democratize large\nlanguage models in the sequential recommendation domain. Our extensive\nexperiments have demonstrated that GenRec generalizes on various public\nreal-world datasets and achieves state-of-the-art results. Our experiments also\nvalidate the effectiveness of the the proposed masked item prediction objective\nthat improves the model performance by a large margin.\n","authors":["Panfeng Cao","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2407.21191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16218v1","updated":"2024-08-29T02:21:11Z","published":"2024-08-29T02:21:11Z","title":"Targeted Cause Discovery with Data-Driven Learning","summary":" We propose a novel machine learning approach for inferring causal variables\nof a target variable from observations. Our goal is to identify both direct and\nindirect causes within a system, thereby efficiently regulating the target\nvariable when the difficulty and cost of intervening on each causal variable\nvary. 
Our method employs a neural network trained to identify causality through\nsupervised learning on simulated data. By implementing a local-inference\nstrategy, we achieve linear complexity with respect to the number of variables,\nefficiently scaling up to thousands of variables. Empirical results demonstrate\nthe effectiveness of our method in identifying causal relationships within\nlarge-scale gene regulatory networks, outperforming existing causal discovery\nmethods that primarily focus on direct causality. We validate our model's\ngeneralization capability across novel graph structures and generating\nmechanisms, including gene regulatory networks of E. coli and the human K562\ncell line. Implementation code is available at\nhttps://github.com/snu-mllab/Targeted-Cause-Discovery.\n","authors":["Jang-Hyun Kim","Claudia Skok Gibbs","Sangdoo Yun","Hyun Oh Song","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2408.16218v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2408.16215v1","updated":"2024-08-29T02:18:28Z","published":"2024-08-29T02:18:28Z","title":"Adversarial Network Optimization under Bandit Feedback: Maximizing\n Utility in Non-Stationary Multi-Hop Networks","summary":" Stochastic Network Optimization (SNO) concerns scheduling in stochastic\nqueueing systems. It has been widely studied in network theory. Classical SNO\nalgorithms require network conditions to be stationary with time, which fails\nto capture the non-stationary components in many real-world scenarios. Many\nexisting algorithms also assume knowledge of network conditions before\ndecision, which rules out applications where unpredictability is present.\n Motivated by these issues, we consider Adversarial Network Optimization (ANO)\nunder bandit feedback. Specifically, we consider the task of *i)* maximizing\nsome unknown and time-varying utility function associated with the scheduler's\nactions, where *ii)* the underlying network is a non-stationary multi-hop one\nwhose conditions change arbitrarily with time, and *iii)* only bandit feedback\n(effect of actually deployed actions) is revealed after decisions. Our proposed\n`UMO2` algorithm ensures network stability and also matches the utility\nmaximization performance of any \"mildly varying\" reference policy up to a\npolynomially decaying gap. To our knowledge, no previous ANO algorithm handled\nmulti-hop networks or achieved utility guarantees under bandit feedback,\nwhereas ours can do both.\n Technically, our method builds upon a novel integration of online learning\ninto Lyapunov analyses: To handle complex inter-dependencies among queues in\nmulti-hop networks, we propose meticulous techniques to balance online learning\nand Lyapunov arguments. To tackle the learning obstacles due to potentially\nunbounded queue sizes, we design a new online linear optimization algorithm\nthat automatically adapts to loss magnitudes. To maximize utility, we propose a\nbandit convex optimization algorithm with novel queue-dependent learning rate\nscheduling that suits drastically varying queue lengths. 
Our new insights in\nonline learning can be of independent interest.\n","authors":["Yan Dai","Longbo Huang"],"pdf_url":"https://arxiv.org/pdf/2408.16215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13989v2","updated":"2024-08-29T02:17:51Z","published":"2024-07-19T02:34:10Z","title":"Enhancing Data-Limited Graph Neural Networks by Actively Distilling\n Knowledge from Large Language Models","summary":" Graphs are pervasive in the real-world, such as social network analysis,\nbioinformatics, and knowledge graphs. Graph neural networks (GNNs) have great\nability in node classification, a fundamental task on graphs. Unfortunately,\nconventional GNNs still face challenges in scenarios with few labeled nodes,\ndespite the prevalence of few-shot node classification tasks in real-world\napplications. To address this challenge, various approaches have been proposed,\nincluding graph meta-learning, transfer learning, and methods based on Large\nLanguage Models (LLMs). However, traditional meta-learning and transfer\nlearning methods often require prior knowledge from base classes or fail to\nexploit the potential advantages of unlabeled nodes. Meanwhile, LLM-based\nmethods may overlook the zero-shot capabilities of LLMs and rely heavily on the\nquality of generated contexts. In this paper, we propose a novel approach that\nintegrates LLMs and GNNs, leveraging the zero-shot inference and reasoning\ncapabilities of LLMs and employing a Graph-LLM-based active learning paradigm\nto enhance GNNs' performance. Extensive experiments demonstrate the\neffectiveness of our model in improving node classification accuracy with\nconsiderably limited labeled data, surpassing state-of-the-art baselines by\nsignificant margins.\n","authors":["Quan Li","Tianxiang Zhao","Lingwei Chen","Junjie Xu","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.13989v2.pdf","comment":"10 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2408.16212v1","updated":"2024-08-29T02:09:19Z","published":"2024-08-29T02:09:19Z","title":"The Application of Machine Learning in Tidal Evolution Simulation of\n Star-Planet Systems","summary":" With the release of a large amount of astronomical data, an increasing number\nof close-in hot Jupiters have been discovered. Calculating their evolutionary\ncurves using star-planet interaction models presents a challenge. To expedite\nthe generation of evolutionary curves for these close-in hot Jupiter systems,\nwe utilized tidal interaction models established on MESA to create 15,745\nsamples of star-planet systems and 7,500 samples of stars. Additionally, we\nemployed a neural network (Multi-Layer Perceptron - MLP) to predict the\nevolutionary curves of the systems, including stellar effective temperature,\nradius, stellar rotation period, and planetary orbital period. The median\nrelative errors of the predicted evolutionary curves were found to be 0.15%,\n0.43%, 2.61%, and 0.57%, respectively. Furthermore, the speed at which we\ngenerate evolutionary curves exceeds that of model-generated curves by more\nthan four orders of magnitude. We also extracted features of planetary\nmigration states and utilized lightGBM to classify the samples into 6\ncategories for prediction. We found that by combining three types that undergo\nlong-term double synchronization into one label, the classifier effectively\nrecognized these features. Apart from systems experiencing long-term double\nsynchronization, the median relative errors of the predicted evolutionary\ncurves were all below 4%. 
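The surrogate-modeling idea, i.e. training an MLP to map initial system parameters to a sampled evolutionary curve, can be sketched as follows with random stand-in data and scikit-learn's MLPRegressor; the study's MESA-generated samples, input features, and network design are not reproduced here.

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.neural_network import MLPRegressor

    # Stand-in data: inputs are initial system parameters, targets are a curve
    # sampled at fixed ages (both randomly generated here purely for illustration).
    rng = np.random.default_rng(0)
    X = rng.normal(size=(5000, 6))                        # e.g. masses, initial periods, ...
    Y = np.cumsum(rng.normal(size=(5000, 50)), axis=1)    # 50-point "evolutionary curves"

    X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, random_state=0)
    surrogate = MLPRegressor(hidden_layer_sizes=(256, 256), max_iter=500, random_state=0)
    surrogate.fit(X_tr, Y_tr)                             # MLPRegressor handles multi-output targets
    rel_err = np.median(np.abs(surrogate.predict(X_te) - Y_te) / (np.abs(Y_te) + 1e-8))
    print(f"median relative error: {rel_err:.3f}")

Once trained, such a surrogate emits a full curve in a single forward pass, which is where the reported speed-up over direct model integration comes from.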
Our work provides an efficient method to save\nsignificant computational resources and time with minimal loss in accuracy.\nThis research also lays the foundation for analyzing the evolutionary\ncharacteristics of systems under different migration states, aiding in the\nunderstanding of the underlying physical mechanisms of such systems. Finally,\nto a large extent, our approach could replace the calculations of theoretical\nmodels.\n","authors":["Shuaishuai Guo","Jianheng Guo","KaiFan Ji","Hui Liu","Lei Xing"],"pdf_url":"https://arxiv.org/pdf/2408.16212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16208v1","updated":"2024-08-29T02:03:05Z","published":"2024-08-29T02:03:05Z","title":"ReXamine-Global: A Framework for Uncovering Inconsistencies in Radiology\n Report Generation Metrics","summary":" Given the rapidly expanding capabilities of generative AI models for\nradiology, there is a need for robust metrics that can accurately measure the\nquality of AI-generated radiology reports across diverse hospitals. We develop\nReXamine-Global, a LLM-powered, multi-site framework that tests metrics across\ndifferent writing styles and patient populations, exposing gaps in their\ngeneralization. First, our method tests whether a metric is undesirably\nsensitive to reporting style, providing different scores depending on whether\nAI-generated reports are stylistically similar to ground-truth reports or not.\nSecond, our method measures whether a metric reliably agrees with experts, or\nwhether metric and expert scores of AI-generated report quality diverge for\nsome sites. Using 240 reports from 6 hospitals around the world, we apply\nReXamine-Global to 7 established report evaluation metrics and uncover serious\ngaps in their generalizability. Developers can apply ReXamine-Global when\ndesigning new report evaluation metrics, ensuring their robustness across\nsites. Additionally, our analysis of existing metrics can guide users of those\nmetrics towards evaluation procedures that work reliably at their sites of\ninterest.\n","authors":["Oishi Banerjee","Agustina Saenz","Kay Wu","Warren Clements","Adil Zia","Dominic Buensalido","Helen Kavnoudias","Alain S. Abi-Ghanem","Nour El Ghawi","Cibele Luna","Patricia Castillo","Khaled Al-Surimi","Rayyan A. Daghistani","Yuh-Min Chen","Heng-sheng Chao","Lars Heiliger","Moon Kim","Johannes Haubold","Frederic Jonske","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2408.16208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15591v2","updated":"2024-08-29T02:01:56Z","published":"2024-08-28T07:31:32Z","title":"VFLIP: A Backdoor Defense for Vertical Federated Learning via\n Identification and Purification","summary":" Vertical Federated Learning (VFL) focuses on handling vertically partitioned\ndata over FL participants. Recent studies have discovered a significant\nvulnerability in VFL to backdoor attacks which specifically target the distinct\ncharacteristics of VFL. Therefore, these attacks may neutralize existing\ndefense mechanisms designed primarily for Horizontal Federated Learning (HFL)\nand deep neural networks. In this paper, we present the first backdoor defense,\ncalled VFLIP, specialized for VFL. VFLIP employs the identification and\npurification techniques that operate at the inference stage, consequently\nimproving the robustness against backdoor attacks to a great extent. VFLIP\nfirst identifies backdoor-triggered embeddings by adopting a participant-wise\nanomaly detection approach. 
Subsequently, VFLIP conducts purification which\nremoves the embeddings identified as malicious and reconstructs all the\nembeddings based on the remaining embeddings. We conduct extensive experiments\non CIFAR10, CINIC10, Imagenette, NUS-WIDE, and BankMarketing to demonstrate\nthat VFLIP can effectively mitigate backdoor attacks in VFL.\nhttps://github.com/blingcho/VFLIP-esorics24\n","authors":["Yungi Cho","Woorim Han","Miseon Yu","Younghan Lee","Ho Bae","Yunheung Paek"],"pdf_url":"https://arxiv.org/pdf/2408.15591v2.pdf","comment":"Accepted by 29th European Symposium on Research in Computer Security\n (ESORICS 2024)"},{"id":"http://arxiv.org/abs/2406.02126v3","updated":"2024-08-29T02:00:25Z","published":"2024-06-04T09:10:14Z","title":"CityLight: A Universal Model for Coordinated Traffic Signal Control in\n City-scale Heterogeneous Intersections","summary":" The increasingly severe congestion problem in modern cities strengthens the\nsignificance of developing city-scale traffic signal control (TSC) methods for\ntraffic efficiency enhancement. While reinforcement learning has been widely\nexplored in TSC, most of them still target small-scale optimization and cannot\ndirectly scale to the city level due to unbearable resource demand. Only a few\nof them manage to tackle city-level optimization, namely a thousand-scale\noptimization, by incorporating parameter-sharing mechanisms, but hardly have\nthey fully tackled the heterogeneity of intersections and intricate\nbetween-intersection interactions inherent in real-world city road networks. To\nfill in the gap, we target at the two important challenges in adopting\nparameter-sharing paradigms to solve TSC: inconsistency of inner state\nrepresentations for intersections heterogeneous in configuration, scale, and\norders of available traffic phases; intricacy of impacts from neighborhood\nintersections that have various relative traffic relationships due to\ninconsistent phase orders and diverse relative positioning. Our method,\nCityLight, features a universal representation module that not only aligns the\nstate representations of intersections by reindexing their phases based on\ntheir semantics and designing heterogeneity-preserving observations, but also\nencodes the narrowed relative traffic relation types to project the\nneighborhood intersections onto a uniform relative traffic impact space. We\nfurther attentively fuse neighborhood representations based on their competing\nrelations and incorporate neighborhood-integrated rewards to boost\ncoordination. Extensive experiments with hundreds to tens of thousands of\nintersections validate the surprising effectiveness and generalizability of\nCityLight, with an overall performance gain of 11.68% and a 22.59% improvement\nin transfer scenarios in throughput.\n","authors":["Jinwei Zeng","Chao Yu","Xinyi Yang","Wenxuan Ao","Qianyue Hao","Jian Yuan","Yong Li","Yu Wang","Huazhong Yang"],"pdf_url":"https://arxiv.org/pdf/2406.02126v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16204v1","updated":"2024-08-29T01:50:13Z","published":"2024-08-29T01:50:13Z","title":"Revisit Micro-batch Clipping: Adaptive Data Pruning via Gradient\n Manipulation","summary":" Micro-batch clipping, a gradient clipping method, has recently shown\npotential in enhancing auto-speech recognition (ASR) model performance.\nHowever, the underlying mechanism behind this improvement remains mysterious,\nparticularly the observation that only certain micro-batch sizes are\nbeneficial. 
In this paper, we make the first attempt to explain this\nphenomenon. Inspired by recent data pruning research, we assume that specific\ntraining samples may impede model convergence during certain training phases.\nUnder this assumption, the convergence analysis shows that micro-batch clipping\ncan improve the convergence rate asymptotically at the cost of an additional\nconstant bias that does not diminish with more training iterations. The bias is\ndependent on a few factors and can be minimized at specific micro-batch size,\nthereby elucidating the existence of the sweet-spot micro-batch size observed\npreviously. We also verify the effectiveness of micro-batch clipping beyond\nspeech models on vision and language models, and show promising performance\ngains in these domains. An exploration of potential limitations shows that\nmicro-batch clipping is less effective when training data originates from\nmultiple distinct domains.\n","authors":["Lun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16202v1","updated":"2024-08-29T01:47:09Z","published":"2024-08-29T01:47:09Z","title":"Short-Term Electricity-Load Forecasting by Deep Learning: A\n Comprehensive Survey","summary":" Short-Term Electricity-Load Forecasting (STELF) refers to the prediction of\nthe immediate demand (in the next few hours to several days) for the power\nsystem. Various external factors, such as weather changes and the emergence of\nnew electricity consumption scenarios, can impact electricity demand, causing\nload data to fluctuate and become non-linear, which increases the complexity\nand difficulty of STELF. In the past decade, deep learning has been applied to\nSTELF, modeling and predicting electricity demand with high accuracy, and\ncontributing significantly to the development of STELF. This paper provides a\ncomprehensive survey on deep-learning-based STELF over the past ten years. It\nexamines the entire forecasting process, including data pre-processing, feature\nextraction, deep-learning modeling and optimization, and results evaluation.\nThis paper also identifies some research challenges and potential research\ndirections to be further investigated in future work.\n","authors":["Qi Dong","Rubing Huang","Chenhui Cui","Dave Towey","Ling Zhou","Jinyu Tian","Jianzhou Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16201v1","updated":"2024-08-29T01:46:37Z","published":"2024-08-29T01:46:37Z","title":"Uni-3DAD: GAN-Inversion Aided Universal 3D Anomaly Detection on\n Model-free Products","summary":" Anomaly detection is a long-standing challenge in manufacturing systems.\nTraditionally, anomaly detection has relied on human inspectors. However, 3D\npoint clouds have gained attention due to their robustness to environmental\nfactors and their ability to represent geometric data. Existing 3D anomaly\ndetection methods generally fall into two categories. One compares scanned 3D\npoint clouds with design files, assuming these files are always available.\nHowever, such assumptions are often violated in many real-world applications\nwhere model-free products exist, such as fresh produce (i.e., ``Cookie\",\n``Potato\", etc.), dentures, bone, etc. 
The other category compares patches of\nscanned 3D point clouds with a library of normal patches named memory bank.\nHowever, those methods usually fail to detect incomplete shapes, which is a\nfairly common defect type (i.e., missing pieces of different products). The\nmain challenge is that missing areas in 3D point clouds represent the absence\nof scanned points. This makes it infeasible to compare the missing region with\nexisting point cloud patches in the memory bank. To address these two\nchallenges, we proposed a unified, unsupervised 3D anomaly detection framework\ncapable of identifying all types of defects on model-free products. Our method\nintegrates two detection modules: a feature-based detection module and a\nreconstruction-based detection module. Feature-based detection covers geometric\ndefects, such as dents, holes, and cracks, while the reconstruction-based\nmethod detects missing regions. Additionally, we employ a One-class Support\nVector Machine (OCSVM) to fuse the detection results from both modules. The\nresults demonstrate that (1) our proposed method outperforms the\nstate-of-the-art methods in identifying incomplete shapes and (2) it still\nmaintains comparable performance with the SOTA methods in detecting all other\ntypes of anomalies.\n","authors":["Jiayu Liu","Shancong Mou","Nathan Gaw","Yinan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.01970v7","updated":"2024-08-29T01:38:57Z","published":"2022-05-04T09:37:16Z","title":"Non-Stationary Bandit Learning via Predictive Sampling","summary":" Thompson sampling has proven effective across a wide range of stationary\nbandit environments. However, as we demonstrate in this paper, it can perform\npoorly when applied to non-stationary environments. We attribute such failures\nto the fact that, when exploring, the algorithm does not differentiate actions\nbased on how quickly the information acquired loses its usefulness due to\nnon-stationarity. Building upon this insight, we propose predictive sampling,\nan algorithm that deprioritizes acquiring information that quickly loses\nusefulness. A theoretical guarantee on the performance of predictive sampling\nis established through a Bayesian regret bound. We provide versions of\npredictive sampling for which computations tractably scale to complex bandit\nenvironments of practical interest. Through numerical simulations, we\ndemonstrate that predictive sampling outperforms Thompson sampling in all\nnon-stationary environments examined.\n","authors":["Yueyang Liu","Xu Kuang","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2205.01970v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06727v2","updated":"2024-08-29T01:26:31Z","published":"2023-06-11T17:17:48Z","title":"A Normalized Bottleneck Distance on Persistence Diagrams and Homology\n Preservation under Dimension Reduction","summary":" Persistence diagrams (PDs) are used as signatures of point cloud data. Two\nclouds of points can be compared using the bottleneck distance d_B between\ntheir PDs. A potential drawback of this pipeline is that point clouds sampled\nfrom topologically similar manifolds can have arbitrarily large d_B when there\nis a large scaling between them. This situation is typical in dimension\nreduction frameworks.\n We define, and study properties of, a new scale-invariant distance between\nPDs termed normalized bottleneck distance, d_N. 
In defining d_N, we develop a\nbroader framework called metric decomposition for comparing finite metric\nspaces of equal cardinality with a bijection. We utilize metric decomposition\nto prove a stability result for d_N by deriving an explicit bound on the\ndistortion of the bijective map. We then study two popular dimension reduction\ntechniques, Johnson-Lindenstrauss (JL) projections and metric multidimensional\nscaling (mMDS), and a third class of general biLipschitz mappings. We provide\nnew bounds on how well these dimension reduction techniques preserve homology\nwith respect to d_N. For a JL map f that transforms input X to f(X), we show\nthat d_N(dgm(X),dgm(f(X))) < e, where dgm(X) is the Vietoris-Rips PD of X, and\npairwise distances are preserved by f up to the tolerance 0 < \\epsilon < 1. For\nmMDS, we present new bounds for d_B and d_N between PDs of X and its projection\nin terms of the eigenvalues of the covariance matrix. And for k-biLipschitz\nmaps, we show that d_N is bounded by the product of (k^2-1)/k and the ratio of\ndiameters of X and f(X). Finally, we use computational experiments to\ndemonstrate the increased effectiveness of using the normalized bottleneck\ndistance for clustering sets of point clouds sampled from different shapes.\n","authors":["Nathan H. May","Bala Krishnamoorthy","Patrick Gambill"],"pdf_url":"https://arxiv.org/pdf/2306.06727v2.pdf","comment":"Added computational experiments; published in La Matematica"},{"id":"http://arxiv.org/abs/2408.16191v1","updated":"2024-08-29T01:09:30Z","published":"2024-08-29T01:09:30Z","title":"Variational Mode-Driven Graph Convolutional Network for Spatiotemporal\n Traffic Forecasting","summary":" This paper focuses on spatio-temporal (ST) traffic prediction traffic using\ngraph neural networks. Given that ST data consists of non-stationary and\ncomplex time events, interpreting and predicting such trends is comparatively\ncomplicated. Representation of ST data in modes helps us infer behavior and\nassess the impact of noise on prediction applications. We propose a framework\nthat decomposes ST data into modes using the variational mode decomposition\n(VMD) method, which is then fed into the neural network for forecasting future\nstates. This hybrid approach is known as a variational mode graph convolutional\nnetwork (VMGCN). Instead of exhaustively searching for the number of modes,\nthey are determined using the reconstruction loss from the real-time\napplication data. We also study the significance of each mode and the impact of\nbandwidth constraints on different horizon predictions in traffic flow data. We\nevaluate the performance of our proposed network on the LargeST dataset for\nboth short and long-term predictions. Our framework yields better results\ncompared to state-of-the-art methods.\n","authors":["Osama Ahmad","Zubair Khalid"],"pdf_url":"https://arxiv.org/pdf/2408.16191v1.pdf","comment":"IEEE Transactions on Intelligent Transportation Systems Submission,\n 2024"},{"id":"http://arxiv.org/abs/2408.16189v1","updated":"2024-08-29T01:02:40Z","published":"2024-08-29T01:02:40Z","title":"A More Unified Theory of Transfer Learning","summary":" We show that some basic moduli of continuity $\\delta$ -- which measure how\nfast target risk decreases as source risk decreases -- appear to be at the root\nof many of the classical relatedness measures in transfer learning and related\nliterature. 
Namely, bounds in terms of $\delta$ recover many of the existing\nbounds in terms of other measures of relatedness -- both in regression and\nclassification -- and can at times be tighter.\n We are particularly interested in general situations where the learner has\naccess to both source data and some or no target data. The unified perspective\nallowed by the moduli $\delta$ allows us to extend many existing notions of\nrelatedness at once to these scenarios involving target data: interestingly,\nwhile $\delta$ itself might not be efficiently estimated, adaptive procedures\nexist -- based on reductions to confidence sets -- which can get nearly tight\nrates in terms of $\delta$ with no prior distributional knowledge. Such\nadaptivity to unknown $\delta$ immediately implies adaptivity to many classical\nrelatedness notions, in terms of combined source and target samples' sizes.\n","authors":["Steve Hanneke","Samory Kpotufe"],"pdf_url":"https://arxiv.org/pdf/2408.16189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13467v2","updated":"2024-08-29T00:54:27Z","published":"2024-08-24T05:03:08Z","title":"LlamaDuo: LLMOps Pipeline for Seamless Migration from Service LLMs to\n Small-Scale Local LLMs","summary":" The widespread adoption of cloud-based proprietary large language models\n(LLMs) has introduced significant challenges, including operational\ndependencies, privacy concerns, and the necessity of continuous internet\nconnectivity. In this work, we introduce an LLMOps pipeline, \"LlamaDuo\", for\nthe seamless migration of knowledge and abilities from service-oriented LLMs to\nsmaller, locally manageable models. This pipeline is crucial for ensuring\nservice continuity in the presence of operational failures, strict privacy\npolicies, or offline requirements. Our LlamaDuo involves fine-tuning a small\nlanguage model against the service LLM using a synthetic dataset generated by\nthe latter. If the performance of the fine-tuned model falls short of\nexpectations, it is enhanced by further fine-tuning with additional similar\ndata created by the service LLM. This iterative process guarantees that the\nsmaller model can eventually match or even surpass the service LLM's\ncapabilities in specific downstream tasks, offering a practical and scalable\nsolution for managing AI deployments in constrained environments. Extensive\nexperiments with leading-edge LLMs are conducted to demonstrate the\neffectiveness, adaptability, and affordability of LlamaDuo across various\ndownstream tasks. Our pipeline implementation is available at\nhttps://github.com/deep-diver/llamaduo.\n","authors":["Chansung Park","Juyong Jiang","Fan Wang","Sayak Paul","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2408.13467v2.pdf","comment":"28 pages, 18 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.16187v1","updated":"2024-08-29T00:53:21Z","published":"2024-08-29T00:53:21Z","title":"Real-Time Energy Pricing in New Zealand: An Evolving Stream Analysis","summary":" This paper introduces a group of novel datasets representing real-time\ntime-series and streaming data of energy prices in New Zealand, sourced from\nthe Electricity Market Information (EMI) website maintained by the New Zealand\ngovernment. The datasets are intended to address the scarcity of proper\ndatasets for streaming regression learning tasks. We conduct extensive analyses\nand experiments on these datasets, covering preprocessing techniques,\nregression tasks, prediction intervals, concept drift detection, and anomaly\ndetection. 
Our experiments demonstrate the datasets' utility and highlight the\nchallenges and opportunities for future research in energy price forecasting.\n","authors":["Yibin Sun","Heitor Murilo Gomes","Bernhard Pfahringer","Albert Bifet"],"pdf_url":"https://arxiv.org/pdf/2408.16187v1.pdf","comment":"12 Pages, 8 figures, short version accepted by PRICAI"},{"id":"http://arxiv.org/abs/2408.16186v1","updated":"2024-08-29T00:50:35Z","published":"2024-08-29T00:50:35Z","title":"Single-Loop Deterministic and Stochastic Interior-Point Algorithms for\n Nonlinearly Constrained Optimization","summary":" An interior-point algorithm framework is proposed, analyzed, and tested for\nsolving nonlinearly constrained continuous optimization problems. The main\nsetting of interest is when the objective and constraint functions may be\nnonlinear and/or nonconvex, and when constraint values and derivatives are\ntractable to compute, but objective function values and derivatives can only be\nestimated. The algorithm is intended primarily for a setting that is similar\nfor stochastic-gradient methods for unconstrained optimization, namely, the\nsetting when stochastic-gradient estimates are available and employed in place\nof gradients of the objective, and when no objective function values (nor\nestimates of them) are employed. This is achieved by the interior-point\nframework having a single-loop structure rather than the nested-loop structure\nthat is typical of contemporary interior-point methods. For completeness,\nconvergence guarantees for the framework are provided both for deterministic\nand stochastic settings. Numerical experiments show that the algorithm yields\ngood performance on a large set of test problems.\n","authors":["Frank E. Curtis","Xin Jiang","Qi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04992v2","updated":"2024-08-29T00:40:05Z","published":"2024-07-06T07:56:23Z","title":"Scalable Variational Causal Discovery Unconstrained by Acyclicity","summary":" Bayesian causal discovery offers the power to quantify epistemic\nuncertainties among a broad range of structurally diverse causal theories\npotentially explaining the data, represented in forms of directed acyclic\ngraphs (DAGs). However, existing methods struggle with efficient DAG sampling\ndue to the complex acyclicity constraint. In this study, we propose a scalable\nBayesian approach to effectively learn the posterior distribution over causal\ngraphs given observational data thanks to the ability to generate DAGs without\nexplicitly enforcing acyclicity. Specifically, we introduce a novel\ndifferentiable DAG sampling method that can generate a valid acyclic causal\ngraph by mapping an unconstrained distribution of implicit topological orders\nto a distribution over DAGs. Given this efficient DAG sampling scheme, we are\nable to model the posterior distribution over causal graphs using a simple\nvariational distribution over a continuous domain, which can be learned via the\nvariational inference framework. 
Extensive empirical experiments on both\nsimulated and real datasets demonstrate the superior performance of the\nproposed model compared to several state-of-the-art baselines.\n","authors":["Nu Hoang","Bao Duong","Thin Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.04992v2.pdf","comment":"Accepted at ECAI 2024"},{"id":"http://arxiv.org/abs/2408.16181v1","updated":"2024-08-29T00:36:34Z","published":"2024-08-29T00:36:34Z","title":"A Minibatch-SGD-Based Learning Meta-Policy for Inventory Systems with\n Myopic Optimal Policy","summary":" Stochastic gradient descent (SGD) has proven effective in solving many\ninventory control problems with demand learning. However, it often faces the\npitfall of an infeasible target inventory level that is lower than the current\ninventory level. Several recent works (e.g., Huh and Rusmevichientong (2009),\nShi et al.(2016)) are successful to resolve this issue in various inventory\nsystems. However, their techniques are rather sophisticated and difficult to be\napplied to more complicated scenarios such as multi-product and\nmulti-constraint inventory systems.\n In this paper, we address the infeasible-target-inventory-level issue from a\nnew technical perspective -- we propose a novel minibatch-SGD-based\nmeta-policy. Our meta-policy is flexible enough to be applied to a general\ninventory systems framework covering a wide range of inventory management\nproblems with myopic clairvoyant optimal policy. By devising the optimal\nminibatch scheme, our meta-policy achieves a regret bound of\n$\\mathcal{O}(\\sqrt{T})$ for the general convex case and $\\mathcal{O}(\\log T)$\nfor the strongly convex case. To demonstrate the power and flexibility of our\nmeta-policy, we apply it to three important inventory control problems:\nmulti-product and multi-constraint systems, multi-echelon serial systems, and\none-warehouse and multi-store systems by carefully designing\napplication-specific subroutines.We also conduct extensive numerical\nexperiments to demonstrate that our meta-policy enjoys competitive regret\nperformance, high computational efficiency, and low variances among a wide\nrange of applications.\n","authors":["Jiameng Lyu","Jinxing Xie","Shilin Yuan","Yuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.16181v1.pdf","comment":"Forthcoming in Management Science"},{"id":"http://arxiv.org/abs/2407.04980v2","updated":"2024-08-29T00:34:59Z","published":"2024-07-06T07:19:21Z","title":"Enabling Causal Discovery in Post-Nonlinear Models with Normalizing\n Flows","summary":" Post-nonlinear (PNL) causal models stand out as a versatile and adaptable\nframework for modeling intricate causal relationships. However, accurately\ncapturing the invertibility constraint required in PNL models remains\nchallenging in existing studies. To address this problem, we introduce CAF-PoNo\n(Causal discovery via Normalizing Flows for Post-Nonlinear models), harnessing\nthe power of the normalizing flows architecture to enforce the crucial\ninvertibility constraint in PNL models. Through normalizing flows, our method\nprecisely reconstructs the hidden noise, which plays a vital role in\ncause-effect identification through statistical independence testing.\nFurthermore, the proposed approach exhibits remarkable extensibility, as it can\nbe seamlessly expanded to facilitate multivariate causal discovery via causal\norder identification, empowering us to efficiently unravel complex causal\nrelationships. 
Extensive experimental evaluations on both simulated and real\ndatasets consistently demonstrate that the proposed method outperforms several\nstate-of-the-art approaches in both bivariate and multivariate causal discovery\ntasks.\n","authors":["Nu Hoang","Bao Duong","Thin Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.04980v2.pdf","comment":"Accepted at ECAI 2024"},{"id":"http://arxiv.org/abs/2108.02497v5","updated":"2024-08-29T10:12:35Z","published":"2021-08-05T10:15:17Z","title":"How to avoid machine learning pitfalls: a guide for academic researchers","summary":" Mistakes in machine learning practice are commonplace, and can result in a\nloss of confidence in the findings and products of machine learning. This guide\noutlines common mistakes that occur when using machine learning, and what can\nbe done to avoid them. Whilst it should be accessible to anyone with a basic\nunderstanding of machine learning techniques, it focuses on issues that are of\nparticular concern within academic research, such as the need to do rigorous\ncomparisons and reach valid conclusions. It covers five stages of the machine\nlearning process: what to do before model building, how to reliably build\nmodels, how to robustly evaluate models, how to compare models fairly, and how\nto report results.\n","authors":["Michael A. Lones"],"pdf_url":"https://arxiv.org/pdf/2108.02497v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16945v1","updated":"2024-08-29T23:51:51Z","published":"2024-08-29T23:51:51Z","title":"Different Victims, Same Layout: Email Visual Similarity Detection for\n Enhanced Email Protection","summary":" In the pursuit of an effective spam detection system, the focus has often\nbeen on identifying known spam patterns either through rule-based detection\nsystems or machine learning (ML) solutions. However, both systems are\nsusceptible to evasion techniques and zero-day attacks that can be achieved at\nlow cost. Therefore, an email that bypassed the defense system once can do it\nagain in the following days, even though rules are updated or the ML models are\nretrained. The recurrence of failures to detect emails that exhibit layout\nsimilarities to previously undetected spam is concerning for customers and can\nerode their trust in a company. Our observations show that threat actors reuse\nemail kits extensively and can bypass detection with little effort, for\nexample, by making changes to the content of emails. In this work, we propose\nan email visual similarity detection approach, named Pisco, to improve the\ndetection capabilities of an email threat defense system. We apply our proof of\nconcept to some real-world samples received from different sources. Our results\nshow that email kits are being reused extensively and visually similar emails\nare sent to our customers at various time intervals. 
Therefore, this method\ncould be very helpful in situations where detection features that rely on\ncontextual information and keywords are bypassed, an occurrence our\nobservations show happens frequently.\n","authors":["Sachin Shukla","Omid Mirzaei"],"pdf_url":"https://arxiv.org/pdf/2408.16945v1.pdf","comment":"To be published in the proceedings of the ACM Conference on Computer\n and Communications Security (ACM CCS 2024)"},{"id":"http://arxiv.org/abs/2408.16944v1","updated":"2024-08-29T23:48:08Z","published":"2024-08-29T23:48:08Z","title":"FlowRetrieval: Flow-Guided Data Retrieval for Few-Shot Imitation\n Learning","summary":" Few-shot imitation learning relies on only a small amount of task-specific\ndemonstrations to efficiently adapt a policy for a given downstream tasks.\nRetrieval-based methods come with a promise of retrieving relevant past\nexperiences to augment this target data when learning policies. However,\nexisting data retrieval methods fall under two extremes: they either rely on\nthe existence of exact behaviors with visually similar scenes in the prior\ndata, which is impractical to assume; or they retrieve based on semantic\nsimilarity of high-level language descriptions of the task, which might not be\nthat informative about the shared low-level behaviors or motions across tasks\nthat is often a more important factor for retrieving relevant data for policy\nlearning. In this work, we investigate how we can leverage motion similarity in\nthe vast amount of cross-task data to improve few-shot imitation learning of\nthe target task. Our key insight is that motion-similar data carries rich\ninformation about the effects of actions and object interactions that can be\nleveraged during few-shot adaptation. We propose FlowRetrieval, an approach\nthat leverages optical flow representations for both extracting similar motions\nto target tasks from prior data, and for guiding learning of a policy that can\nmaximally benefit from such data. Our results show FlowRetrieval significantly\noutperforms prior methods across simulated and real-world domains, achieving on\naverage 27% higher success rate than the best retrieval-based prior method. In\nthe Pen-in-Cup task with a real Franka Emika robot, FlowRetrieval achieves 3.7x\nthe performance of the baseline imitation learning technique that learns from\nall prior and target data. Website: https://flow-retrieval.github.io\n","authors":["Li-Heng Lin","Yuchen Cui","Amber Xie","Tianyu Hua","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2408.16944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16941v1","updated":"2024-08-29T23:36:10Z","published":"2024-08-29T23:36:10Z","title":"Efficient Transonic Aeroelastic Model Reduction Using Optimized Sparse\n Multi-Input Polynomial Functionals","summary":" Nonlinear aeroelastic reduced-order models (ROMs) based on machine learning\nor artificial intelligence algorithms can be complex and computationally\ndemanding to train, meaning that for practical aeroelastic applications, the\nconservative nature of linearization is often favored. Therefore, there is a\nrequirement for novel nonlinear aeroelastic model reduction approaches that are\naccurate, simple and, most importantly, efficient to generate. This paper\nproposes a novel formulation for the identification of a compact multi-input\nVolterra series, where Orthogonal Matching Pursuit is used to obtain a set of\noptimally sparse nonlinear multi-input ROM coefficients from unsteady\naerodynamic training data. 
The framework is exemplified using the Benchmark\nSupercritical Wing, considering; forced response, flutter and limit cycle\noscillation. The simple and efficient Optimal Sparsity Multi-Input ROM\n(OSM-ROM) framework performs with high accuracy compared to the full-order\naeroelastic model, requiring only a fraction of the tens-of-thousands of\npossible multi-input terms to be identified and allowing a 96% reduction in the\nnumber of training samples.\n","authors":["Michael Candon","Maciej Balajewicz","Arturo Delgado-Gutierrez","Pier Marzocca","Earl H. Dowell"],"pdf_url":"https://arxiv.org/pdf/2408.16941v1.pdf","comment":"24 pages, preprint, under review"},{"id":"http://arxiv.org/abs/2408.16939v1","updated":"2024-08-29T23:22:40Z","published":"2024-08-29T23:22:40Z","title":"Theoretical Insights into Overparameterized Models in Multi-Task and\n Replay-Based Continual Learning","summary":" Multi-task learning (MTL) is a machine learning paradigm that aims to improve\nthe generalization performance of a model on multiple related tasks by training\nit simultaneously on those tasks. Unlike MTL, where the model has instant\naccess to the training data of all tasks, continual learning (CL) involves\nadapting to new sequentially arriving tasks over time without forgetting the\npreviously acquired knowledge. Despite the wide practical adoption of CL and\nMTL and extensive literature on both areas, there remains a gap in the\ntheoretical understanding of these methods when used with overparameterized\nmodels such as deep neural networks. This paper studies the overparameterized\nlinear models as a proxy for more complex models. We develop theoretical\nresults describing the effect of various system parameters on the model's\nperformance in an MTL setup. Specifically, we study the impact of model size,\ndataset size, and task similarity on the generalization error and knowledge\ntransfer. Additionally, we present theoretical results to characterize the\nperformance of replay-based CL models. Our results reveal the impact of buffer\nsize and model capacity on the forgetting rate in a CL setup and help shed\nlight on some of the state-of-the-art CL methods. Finally, through extensive\nempirical evaluations, we demonstrate that our theoretical findings are also\napplicable to deep neural networks, offering valuable guidance for designing\nMTL and CL models in practice.\n","authors":["Mohammadamin Banayeeanzade","Mahdi Soltanolkotabi","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2408.16939v1.pdf","comment":"41 pages, 21 figures"},{"id":"http://arxiv.org/abs/2405.08174v2","updated":"2024-08-29T23:21:03Z","published":"2024-05-13T20:39:27Z","title":"Estimating Direct and Indirect Causal Effects of Spatiotemporal\n Interventions in Presence of Spatial Interference","summary":" Spatial interference (SI) occurs when the treatment at one location affects\nthe outcomes at other locations. Accounting for spatial interference in\nspatiotemporal settings poses further challenges as interference violates the\nstable unit treatment value assumption, making it infeasible for standard\ncausal inference methods to quantify the effects of time-varying treatment at\nspatially varying outcomes. In this paper, we first formalize the concept of\nspatial interference in case of time-varying treatment assignments by extending\nthe potential outcome framework under the assumption of no unmeasured\nconfounding. We then propose our deep learning based potential outcome model\nfor spatiotemporal causal inference. 
We utilize latent factor modeling to\nreduce the bias due to time-varying confounding while leveraging the power of\nU-Net architecture to capture global and local spatial interference in data\nover time. Our causal estimators are an extension of average treatment effect\n(ATE) for estimating direct (DATE) and indirect effects (IATE) of spatial\ninterference on treated and untreated data. Being the first of its kind deep\nlearning based spatiotemporal causal inference technique, our approach shows\nadvantages over several baseline methods based on the experiment results on two\nsynthetic datasets, with and without spatial interference. Our results on\nreal-world climate dataset also align with domain knowledge, further\ndemonstrating the effectiveness of our proposed method.\n","authors":["Sahara Ali","Omar Faruque","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2405.08174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16929v1","updated":"2024-08-29T22:08:07Z","published":"2024-08-29T22:08:07Z","title":"AI-driven Reverse Engineering of QML Models","summary":" Quantum machine learning (QML) is a rapidly emerging area of research, driven\nby the capabilities of Noisy Intermediate-Scale Quantum (NISQ) devices. With\nthe progress in the research of QML models, there is a rise in third-party\nquantum cloud services to cater to the increasing demand for resources. New\nsecurity concerns surface, specifically regarding the protection of\nintellectual property (IP) from untrustworthy service providers. One of the\nmost pressing risks is the potential for reverse engineering (RE) by malicious\nactors who may steal proprietary quantum IPs such as trained parameters and QML\narchitecture, modify them to remove additional watermarks or signatures and\nre-transpile them for other quantum hardware. Prior work presents a brute force\napproach to RE the QML parameters which takes exponential time overhead. In\nthis paper, we introduce an autoencoder-based approach to extract the\nparameters from transpiled QML models deployed on untrusted third-party\nvendors. We experiment on multi-qubit classifiers and note that they can be\nreverse-engineered under restricted conditions with a mean error of order\n10^-1. The amount of time taken to prepare the dataset and train the model to\nreverse engineer the QML circuit being of the order 10^3 seconds (which is\n10^2x better than the previously reported value for 4-layered 4-qubit\nclassifiers) makes the threat of RE highly potent, underscoring the need for\ncontinued development of effective defenses.\n","authors":["Archisman Ghosh","Swaroop Ghosh"],"pdf_url":"https://arxiv.org/pdf/2408.16929v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.16913v1","updated":"2024-08-29T21:21:53Z","published":"2024-08-29T21:21:53Z","title":"Analyzing Inference Privacy Risks Through Gradients in Machine Learning","summary":" In distributed learning settings, models are iteratively updated with shared\ngradients computed from potentially sensitive user data. While previous work\nhas studied various privacy risks of sharing gradients, our paper aims to\nprovide a systematic approach to analyze private information leakage from\ngradients. We present a unified game-based framework that encompasses a broad\nrange of attacks including attribute, property, distributional, and user\ndisclosures. 
We investigate how different uncertainties of the adversary affect\ntheir inferential power via extensive experiments on five datasets across\nvarious data modalities. Our results demonstrate the inefficacy of solely\nrelying on data aggregation to achieve privacy against inference attacks in\ndistributed learning. We further evaluate five types of defenses, namely,\ngradient pruning, signed gradient descent, adversarial perturbations,\nvariational information bottleneck, and differential privacy, under both static\nand adaptive adversary settings. We provide an information-theoretic view for\nanalyzing the effectiveness of these defenses against inference from gradients.\nFinally, we introduce a method for auditing attribute inference privacy,\nimproving the empirical estimation of worst-case privacy through crafting\nadversarial canary records.\n","authors":["Zhuohang Li","Andrew Lowy","Jing Liu","Toshiaki Koike-Akino","Kieran Parsons","Bradley Malin","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16896v1","updated":"2024-08-29T20:39:54Z","published":"2024-08-29T20:39:54Z","title":"DLFormer: Enhancing Explainability in Multivariate Time Series\n Forecasting using Distributed Lag Embedding","summary":" . Most real-world variables are multivariate time series influenced by past\nvalues and explanatory factors. Consequently, predicting these time series data\nusing artificial intelligence is ongoing. In particular, in fields such as\nhealthcare and finance, where reliability is crucial, having understandable\nexplanations for predictions is essential. However, achieving a balance between\nhigh prediction accuracy and intuitive explainability has proven challenging.\nAlthough attention-based models have limitations in representing the individual\ninfluences of each variable, these models can influence the temporal\ndependencies in time series prediction and the magnitude of the influence of\nindividual variables. To address this issue, this study introduced DLFormer, an\nattention-based architecture integrated with distributed lag embedding, to\ntemporally embed individual variables and capture their temporal influence.\nThrough validation against various real-world datasets, DLFormer showcased\nsuperior performance improvements compared to existing attention-based\nhigh-performance models. Furthermore, comparing the relationships between\nvariables enhanced the reliability of explainability.\n","authors":["Younghwi Kim","Dohee Kim","Sunghyun Sim"],"pdf_url":"https://arxiv.org/pdf/2408.16896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16893v1","updated":"2024-08-29T20:27:05Z","published":"2024-08-29T20:27:05Z","title":"Exploring Multiple Strategies to Improve Multilingual Coreference\n Resolution in CorefUD","summary":" Coreference resolution, the task of identifying expressions in text that\nrefer to the same entity, is a critical component in various natural language\nprocessing (NLP) applications. This paper presents our end-to-end neural\ncoreference resolution system, utilizing the CorefUD 1.1 dataset, which spans\n17 datasets across 12 languages. We first establish strong baseline models,\nincluding monolingual and cross-lingual variations, and then propose several\nextensions to enhance performance across diverse linguistic contexts. These\nextensions include cross-lingual training, incorporation of syntactic\ninformation, a Span2Head model for optimized headword prediction, and advanced\nsingleton modeling. 
We also experiment with headword span representation and\nlong-documents modeling through overlapping segments. The proposed extensions,\nparticularly the heads-only approach, singleton modeling, and long document\nprediction significantly improve performance across most datasets. We also\nperform zero-shot cross-lingual experiments, highlighting the potential and\nlimitations of cross-lingual transfer in coreference resolution. Our findings\ncontribute to the development of robust and scalable coreference systems for\nmultilingual coreference resolution. Finally, we evaluate our model on the CorefUD\n1.1 test set and surpass the best model from CRAC 2023 shared task of a\ncomparable size by a large margin. Our model is available on GitHub:\n\\url{https://github.com/ondfa/coref-multiling}\n","authors":["Ondřej Pražák","Miloslav Konopík"],"pdf_url":"https://arxiv.org/pdf/2408.16893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16892v1","updated":"2024-08-29T20:26:27Z","published":"2024-08-29T20:26:27Z","title":"Tex-ViT: A Generalizable, Robust, Texture-based dual-branch\n cross-attention deepfake detector","summary":" Deepfakes, which employ GAN to produce highly realistic facial modification,\nare widely regarded as the prevailing method. Traditional CNN have been able to\nidentify bogus media, but they struggle to perform well on different datasets\nand are vulnerable to adversarial attacks due to their lack of robustness.\nVision transformers have demonstrated potential in the realm of image\nclassification problems, but they require enough training data. Motivated by\nthese limitations, this publication introduces Tex-ViT (Texture-Vision\nTransformer), which enhances CNN features by combining ResNet with a vision\ntransformer. The model combines traditional ResNet features with a texture\nmodule that operates in parallel on sections of ResNet before each\ndown-sampling operation. The texture module then serves as an input to the dual\nbranch of the cross-attention vision transformer. It specifically focuses on\nimproving the global texture module, which extracts feature map correlation.\nEmpirical analysis reveals that fake images exhibit smooth textures that do not\nremain consistent over long distances in manipulations. Experiments were\nperformed on different categories of FF++, such as DF, f2f, FS, and NT,\ntogether with other types of GAN datasets in cross-domain scenarios.\nFurthermore, experiments were also conducted on the FF++, DFDCPreview, and Celeb-DF\ndatasets under several post-processing conditions, such as blurring,\ncompression, and noise. The model surpassed the most advanced models in terms\nof generalization, achieving a 98% accuracy in cross-domain scenarios. This\ndemonstrates its ability to learn the shared distinguishing textural\ncharacteristics in the manipulated samples. These experiments provide evidence\nthat the proposed model is capable of being applied to various situations and\nis resistant to many post-processing procedures.\n","authors":["Deepak Dagar","Dinesh Kumar Vishwakarma"],"pdf_url":"https://arxiv.org/pdf/2408.16892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06266v3","updated":"2024-08-29T20:26:19Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. 
The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16890v1","updated":"2024-08-29T20:22:22Z","published":"2024-08-29T20:22:22Z","title":"Robotic warehousing operations: a learn-then-optimize approach to\n large-scale neighborhood search","summary":" The rapid deployment of robotics technologies requires dedicated optimization\nalgorithms to manage large fleets of autonomous agents. This paper supports\nrobotic parts-to-picker operations in warehousing by optimizing\norder-workstation assignments, item-pod assignments and the schedule of order\nfulfillment at workstations. The model maximizes throughput, while managing\nhuman workload at the workstations and congestion in the facility. We solve it\nvia large-scale neighborhood search, with a novel learn-then-optimize approach\nto subproblem generation. The algorithm relies on an offline machine learning\nprocedure to predict objective improvements based on subproblem features, and\nan online optimization model to generate a new subproblem at each iteration. In\ncollaboration with Amazon Robotics, we show that our model and algorithm\ngenerate much stronger solutions for practical problems than state-of-the-art\napproaches. In particular, our solution enhances the utilization of robotic\nfleets by coordinating robotic tasks for human operators to pick multiple items\nat once, and by coordinating robotic routes to avoid congestion in the\nfacility.\n","authors":["Cynthia Barnhart","Alexandre Jacquillat","Alexandria Schmid"],"pdf_url":"https://arxiv.org/pdf/2408.16890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00303v3","updated":"2024-08-29T20:21:07Z","published":"2024-05-01T03:59:06Z","title":"Joint Optimization of Piecewise Linear Ensembles","summary":" Tree ensembles achieve state-of-the-art performance on numerous prediction\ntasks. We propose $\\textbf{J}$oint $\\textbf{O}$ptimization of\n$\\textbf{P}$iecewise $\\textbf{L}$inear $\\textbf{En}$sembles (JOPLEn), which\njointly fits piecewise linear models at all leaf nodes of an existing tree\nensemble. 
In addition to enhancing the ensemble expressiveness, JOPLEn allows\nseveral common penalties, including sparsity-promoting and subspace-norms, to\nbe applied to nonlinear prediction. For example, JOPLEn with a nuclear norm\npenalty learns subspace-aligned functions. Additionally, JOPLEn (combined with\na Dirty LASSO penalty) is an effective feature selection method for nonlinear\nprediction in multitask learning. Finally, we demonstrate the performance of\nJOPLEn on 153 regression and classification datasets and with a variety of\npenalties. JOPLEn leads to improved prediction performance relative to not only\nstandard random forest and boosted tree ensembles, but also other methods for\nenhancing tree ensembles.\n","authors":["Matt Raymond","Angela Violi","Clayton Scott"],"pdf_url":"https://arxiv.org/pdf/2405.00303v3.pdf","comment":"7 pages, 4 figures, accepted to IEEE MLSP 2024 While preparing the\n code release, we found minor bugs in the penalty gradient computation and the\n validation set preprocessing. Fixing these bugs provides the updated results\n shown in Figure 1 and Section 3.1. The conclusions of the paper remain the\n same"},{"id":"http://arxiv.org/abs/2408.16889v1","updated":"2024-08-29T20:20:49Z","published":"2024-08-29T20:20:49Z","title":"LLaVA-Chef: A Multi-modal Generative Model for Food Recipes","summary":" In the rapidly evolving landscape of online recipe sharing within a\nglobalized context, there has been a notable surge in research towards\ncomprehending and generating food recipes. Recent advancements in large\nlanguage models (LLMs) like GPT-2 and LLaVA have paved the way for Natural\nLanguage Processing (NLP) approaches to delve deeper into various facets of\nfood-related tasks, encompassing ingredient recognition and comprehensive\nrecipe generation. Despite impressive performance and multi-modal adaptability\nof LLMs, domain-specific training remains paramount for their effective\napplication. This work evaluates existing LLMs for recipe generation and\nproposes LLaVA-Chef, a novel model trained on a curated dataset of diverse\nrecipe prompts in a multi-stage approach. First, we refine the mapping of\nvisual food image embeddings to the language space. Second, we adapt LLaVA to\nthe food domain by fine-tuning it on relevant recipe data. Third, we utilize\ndiverse prompts to enhance the model's recipe comprehension. Finally, we\nimprove the linguistic quality of generated recipes by penalizing the model\nwith a custom loss function. LLaVA-Chef demonstrates impressive improvements\nover pretrained LLMs and prior works. A detailed qualitative analysis reveals\nthat LLaVA-Chef generates more detailed recipes with precise ingredient\nmentions, compared to existing approaches.\n","authors":["Fnu Mohbat","Mohammed J. Zaki"],"pdf_url":"https://arxiv.org/pdf/2408.16889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16883v1","updated":"2024-08-29T20:12:01Z","published":"2024-08-29T20:12:01Z","title":"Revising Multimodal VAEs with Diffusion Decoders","summary":" Multimodal VAEs often struggle with generating high-quality outputs, a\nchallenge that extends beyond the inherent limitations of the VAE framework.\nThe core issue lies in the restricted joint representation of the latent space,\nparticularly when complex modalities like images are involved. Feedforward\ndecoders, commonly used for these intricate modalities, inadvertently constrain\nthe joint latent space, leading to a degradation in the quality of the other\nmodalities as well. 
Although recent studies have shown improvement by\nintroducing modality-specific representations, the issue remains significant.\nIn this work, we demonstrate that incorporating a flexible diffusion decoder\nspecifically for the image modality not only enhances the generation quality of\nthe images but also positively impacts the performance of the other modalities\nthat rely on feedforward decoders. This approach addresses the limitations\nimposed by conventional joint representations and opens up new possibilities\nfor improving multimodal generation tasks using the multimodal VAE framework.\nOur model provides state-of-the-art results compared to other multimodal VAEs\nin different datasets with higher coherence and superior quality in the\ngenerated modalities.\n","authors":["Daniel Wesego","Amirmohammad Rooshenas"],"pdf_url":"https://arxiv.org/pdf/2408.16883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16882v1","updated":"2024-08-29T20:09:20Z","published":"2024-08-29T20:09:20Z","title":"Coverage Analysis of Multi-Environment Q-Learning Algorithms for\n Wireless Network Optimization","summary":" Q-learning is widely used to optimize wireless networks with unknown system\ndynamics. Recent advancements include ensemble multi-environment hybrid\nQ-learning algorithms, which utilize multiple Q-learning algorithms across\nstructurally related but distinct Markovian environments and outperform\nexisting Q-learning algorithms in terms of accuracy and complexity in\nlarge-scale wireless networks. We herein conduct a comprehensive coverage\nanalysis to ensure optimal data coverage conditions for these algorithms.\nInitially, we establish upper bounds on the expectation and variance of\ndifferent coverage coefficients. Leveraging these bounds, we present an\nalgorithm for efficient initialization of these algorithms. We test our\nalgorithm on two distinct real-world wireless networks. Numerical simulations\nshow that our algorithm can achieve 50% less policy error and 40% less runtime\ncomplexity than state-of-the-art reinforcement learning algorithms.\nFurthermore, our algorithm exhibits robustness to changes in network settings\nand parameters. We also numerically validate our theoretical results.\n","authors":["Talha Bozkus","Urbashi Mitra"],"pdf_url":"https://arxiv.org/pdf/2408.16882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16877v1","updated":"2024-08-29T19:58:46Z","published":"2024-08-29T19:58:46Z","title":"Longitudinal Modularity, a Modularity for Link Streams","summary":" Temporal networks are commonly used to model real-life phenomena. When these\nphenomena represent interactions and are captured at a fine-grained temporal\nresolution, they are modeled as link streams. Community detection is an\nessential network analysis task. Although many methods exist for static\nnetworks, and some methods have been developed for temporal networks\nrepresented as sequences of snapshots, few works can handle link streams. This\narticle introduces the first adaptation of the well-known Modularity quality\nfunction to link streams. Unlike existing methods, it is independent of the\ntime scale of analysis. 
After introducing the quality function, and its\nrelation to existing static and dynamic definitions of Modularity, we show\nexperimentally its relevance for dynamic community evaluation.\n","authors":["Victor Brabant","Yasaman Asgari","Pierre Borgnat","Angela Bonifati","Remy Cazabet"],"pdf_url":"https://arxiv.org/pdf/2408.16877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16875v1","updated":"2024-08-29T19:57:52Z","published":"2024-08-29T19:57:52Z","title":"Learning Multi-agent Multi-machine Tending by Mobile Robots","summary":" Robotics can help address the growing worker shortage challenge of the\nmanufacturing industry. As such, machine tending is a task collaborative robots\ncan tackle that can also highly boost productivity. Nevertheless, existing\nrobotics systems deployed in that sector rely on a fixed single-arm setup,\nwhereas mobile robots can provide more flexibility and scalability. In this\nwork, we introduce a multi-agent multi-machine tending learning framework by\nmobile robots based on Multi-agent Reinforcement Learning (MARL) techniques\nwith the design of a suitable observation and reward. Moreover, an\nattention-based encoding mechanism is developed and integrated into Multi-agent\nProximal Policy Optimization (MAPPO) algorithm to boost its performance for\nmachine tending scenarios. Our model (AB-MAPPO) outperformed MAPPO in this new\nchallenging scenario in terms of task success, safety, and resources\nutilization. Furthermore, we provided an extensive ablation study to support\nour various design decisions.\n","authors":["Abdalwhab Abdalwhab","Giovanni Beltrame","Samira Ebrahimi Kahou","David St-Onge"],"pdf_url":"https://arxiv.org/pdf/2408.16875v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.16871v1","updated":"2024-08-29T19:40:04Z","published":"2024-08-29T19:40:04Z","title":"GSTAM: Efficient Graph Distillation with Structural Attention-Matching","summary":" Graph distillation has emerged as a solution for reducing large graph\ndatasets to smaller, more manageable, and informative ones. Existing methods\nprimarily target node classification, involve computationally intensive\nprocesses, and fail to capture the true distribution of the full graph dataset.\nTo address these issues, we introduce Graph Distillation with Structural\nAttention Matching (GSTAM), a novel method for condensing graph classification\ndatasets. GSTAM leverages the attention maps of GNNs to distill structural\ninformation from the original dataset into synthetic graphs. The structural\nattention-matching mechanism exploits the areas of the input graph that GNNs\nprioritize for classification, effectively distilling such information into the\nsynthetic graphs and improving overall distillation performance. Comprehensive\nexperiments demonstrate GSTAM's superiority over existing methods, achieving\n0.45% to 6.5% better performance in extreme condensation ratios, highlighting\nits potential use in advancing distillation for graph classification tasks\n(Code available at https://github.com/arashrasti96/GSTAM).\n","authors":["Arash Rasti-Meymandi","Ahmad Sajedi","Zhaopan Xu","Konstantinos N. 
Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2408.16871v1.pdf","comment":"Accepted at ECCV-DD 2024"},{"id":"http://arxiv.org/abs/2406.05464v2","updated":"2024-08-29T19:30:55Z","published":"2024-06-08T12:58:13Z","title":"DAISY: Data Adaptive Self-Supervised Early Exit for Speech\n Representation Models","summary":" Self-supervised speech models have shown to be useful for various tasks, but\ntheir large size limits the use in devices with low computing power and memory.\nIn this work, we explore early exit, an approach for reducing latency by\nexiting the forward process of a network early. Most approaches of early exit\nneed a separate early exit model for each task, with some even requiring\nfine-tuning of the entire pretrained model. We introduce Data Adaptive\nSelf-Supervised Early Exit (DAISY), an approach that decides when to exit based\non the self-supervised loss, eliminating the need for multiple round of\ntraining and fine-tuning. DAISY matches the performance of HuBERT on the\nMiniSUPERB benchmark, but with much faster inference times. Our analysis on the\nadaptivity of DAISY shows that the model exits early (using fewer layers) on\nclean data while exits late (using more layers) on noisy data, dynamically\nadjusting the computational cost of inference based on the noise level of each\nsample.\n","authors":["Tzu-Quan Lin","Hung-yi Lee","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2406.05464v2.pdf","comment":"Accepted by Interspeech 2024"},{"id":"http://arxiv.org/abs/2207.07174v2","updated":"2024-08-29T19:27:49Z","published":"2022-07-14T19:20:30Z","title":"Attribute Graphs Underlying Molecular Generative Models: Path to\n Learning with Limited Data","summary":" Training generative models that capture rich semantics of the data and\ninterpreting the latent representations encoded by such models are very\nimportant problems in un-/self-supervised learning. In this work, we provide a\nsimple algorithm that relies on perturbation experiments on latent codes of a\npre-trained generative autoencoder to uncover an attribute graph that is\nimplied by the generative model. We perform perturbation experiments to check\nfor influence of a given latent variable on a subset of attributes. Given this,\nwe show that one can fit an effective graphical model that models a structural\nequation model between latent codes taken as exogenous variables and attributes\ntaken as observed variables. One interesting aspect is that a single latent\nvariable controls multiple overlapping subsets of attributes unlike\nconventional approaches that try to impose full independence. Using a\npre-trained generative autoencoder trained on a large dataset of small\nmolecules, we demonstrate that the graphical model between various molecular\nattributes and latent codes learned by our algorithm can be used to predict a\nspecific property for molecules which are drawn from a different distribution.\nWe compare prediction models trained on various feature subsets chosen by\nsimple baselines, as well as existing causal discovery and sparse\nlearning/feature selection methods, with the ones in the derived Markov blanket\nfrom our method. Results show empirically that the predictor that relies on our\nMarkov blanket attributes is robust to distribution shifts when transferred or\nfine-tuned with a few samples from the new distribution, especially when\ntraining data is limited.\n","authors":["Samuel C. 
Hoffman","Payel Das","Karthikeyan Shanmugam","Kahini Wadhawan","Prasanna Sattigeri"],"pdf_url":"https://arxiv.org/pdf/2207.07174v2.pdf","comment":"New experiments; reframed contributions"},{"id":"http://arxiv.org/abs/2211.09944v3","updated":"2024-08-29T19:25:59Z","published":"2022-11-17T23:38:29Z","title":"MelHuBERT: A simplified HuBERT on Mel spectrograms","summary":" Self-supervised models have had great success in learning speech\nrepresentations that can generalize to various downstream tasks. However, most\nself-supervised models require a large amount of compute and multiple GPUs to\ntrain, significantly hampering the development of self-supervised learning. In\nan attempt to reduce the computation of training, we revisit the training of\nHuBERT, a highly successful self-supervised model. We improve and simplify\nseveral key components, including the loss function, input representation, and\ntraining in multiple stages. Our model, MelHuBERT, is able to achieve favorable\nperformance on phone recognition, speaker identification, and automatic speech\nrecognition against HuBERT, while saving 31.2% of the pre-training time, or\nequivalently 33.5% MACs per one second speech. The code and pre-trained models\nare available in https://github.com/nervjack2/MelHuBERT.\n","authors":["Tzu-Quan Lin","Hung-yi Lee","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2211.09944v3.pdf","comment":"ASRU 2023"},{"id":"http://arxiv.org/abs/2406.11402v2","updated":"2024-08-29T19:24:29Z","published":"2024-06-17T10:45:36Z","title":"Are Small Language Models Ready to Compete with Large Language Models\n for Practical Applications?","summary":" The rapid rise of Language Models (LMs) has expanded their use in several\napplications. Yet, due to constraints of model size, associated cost, or\nproprietary restrictions, utilizing state-of-the-art (SOTA) LLMs is not always\nfeasible. With open, smaller LMs emerging, more applications can leverage their\ncapabilities, but selecting the right LM can be challenging as smaller LMs\ndon't perform well universally. This work tries to bridge this gap by proposing\na framework to experimentally evaluate small, open LMs in practical settings\nthrough measuring semantic correctness of outputs across three practical\naspects: task types, application domains and reasoning types, using diverse\nprompt styles. It also conducts an in-depth comparison of 10 small, open LMs to\nidentify best LM and prompt style depending on specific application requirement\nusing the proposed framework. We also show that if selected appropriately, they\ncan outperform SOTA LLMs like DeepSeek-v2, GPT-4o-mini, Gemini-1.5-Pro, and\neven compete with GPT-4o.\n","authors":["Neelabh Sinha","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2406.11402v2.pdf","comment":"Submitted to ARR"},{"id":"http://arxiv.org/abs/2408.16868v1","updated":"2024-08-29T19:22:37Z","published":"2024-08-29T19:22:37Z","title":"Characterization of point-source transient events with a rolling-shutter\n compressed sensing system","summary":" Point-source transient events (PSTEs) - optical events that are both\nextremely fast and extremely small - pose several challenges to an imaging\nsystem. Due to their speed, accurately characterizing such events often\nrequires detectors with very high frame rates. Due to their size, accurately\ndetecting such events requires maintaining coverage over an extended\nfield-of-view, often through the use of imaging focal plane arrays (FPA) with a\nglobal shutter readout. 
Traditional imaging systems that meet these\nrequirements are costly in terms of price, size, weight, power consumption, and\ndata bandwidth, and there is a need for cheaper solutions with adequate\ntemporal and spatial coverage. To address these issues, we develop a novel\ncompressed sensing algorithm adapted to the rolling shutter readout of an\nimaging system. This approach enables reconstruction of a PSTE signature at the\nsampling rate of the rolling shutter, offering a 1-2 order of magnitude\ntemporal speedup and a proportional reduction in data bandwidth. We present\nempirical results demonstrating accurate recovery of PSTEs using measurements\nthat are spatially undersampled by a factor of 25, and our simulations show\nthat, relative to other compressed sensing algorithms, our algorithm is both\nfaster and yields higher quality reconstructions. We also present theoretical\nresults characterizing our algorithm and corroborating simulations. The\npotential impact of our work includes the development of much faster, cheaper\nsensor solutions for PSTE detection and characterization.\n","authors":["Frank Qiu","Joshua Michalenko","Lilian K. Casias","Cameron J. Radosevich","Jon Slater","Eric A. Shields"],"pdf_url":"https://arxiv.org/pdf/2408.16868v1.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2406.03886v2","updated":"2024-08-29T19:11:21Z","published":"2024-06-06T09:24:21Z","title":"BiomedBench: A benchmark suite of TinyML biomedical applications for\n low-power wearables","summary":" The design of low-power wearables for the biomedical domain has received a\nlot of attention in recent decades, as technological advances in chip\nmanufacturing have allowed real-time monitoring of patients using\nlow-complexity ML within the mW range. Despite advances in application and\nhardware design research, the domain lacks a systematic approach to hardware\nevaluation. In this work, we propose BiomedBench, a new benchmark suite\ncomposed of complete end-to-end TinyML biomedical applications for real-time\nmonitoring of patients using wearable devices. Each application presents\ndifferent requirements during typical signal acquisition and processing phases,\nincluding varying computational workloads and relations between active and idle\ntimes. Furthermore, our evaluation of five state-of-the-art low-power platforms\nin terms of energy efficiency shows that modern platforms cannot effectively\ntarget all types of biomedical applications. BiomedBench is released as an\nopen-source suite to standardize hardware evaluation and guide hardware and\napplication design in the TinyML wearable domain.\n","authors":["Dimitrios Samakovlis","Stefano Albini","Rubén Rodríguez Álvarez","Denisa-Andreea Constantinescu","Pasquale Davide Schiavone","Miguel Peón Quirós","David Atienza"],"pdf_url":"https://arxiv.org/pdf/2406.03886v2.pdf","comment":"7 pages, 5 figures. Sumbitted to Design & Test Special Issue TinyML"},{"id":"http://arxiv.org/abs/2310.12404v2","updated":"2024-08-29T19:08:54Z","published":"2023-10-19T01:20:12Z","title":"Loop Copilot: Conducting AI Ensembles for Music Generation and Iterative\n Editing","summary":" Creating music is iterative, requiring varied methods at each stage. However,\nexisting AI music systems fall short in orchestrating multiple subsystems for\ndiverse needs. To address this gap, we introduce Loop Copilot, a novel system\nthat enables users to generate and iteratively refine music through an\ninteractive, multi-round dialogue interface. 
The system uses a large language\nmodel to interpret user intentions and select appropriate AI models for task\nexecution. Each backend model is specialized for a specific task, and their\noutputs are aggregated to meet the user's requirements. To ensure musical\ncoherence, essential attributes are maintained in a centralized table. We\nevaluate the effectiveness of the proposed system through semi-structured\ninterviews and questionnaires, highlighting its utility not only in\nfacilitating music creation but also its potential for broader applications.\n","authors":["Yixiao Zhang","Akira Maezawa","Gus Xia","Kazuhiko Yamamoto","Simon Dixon"],"pdf_url":"https://arxiv.org/pdf/2310.12404v2.pdf","comment":"Source code and demo video are available at\n \\url{https://sites.google.com/view/loop-copilot}"},{"id":"http://arxiv.org/abs/2408.16862v1","updated":"2024-08-29T18:58:39Z","published":"2024-08-29T18:58:39Z","title":"Probabilistic Decomposed Linear Dynamical Systems for Robust Discovery\n of Latent Neural Dynamics","summary":" Time-varying linear state-space models are powerful tools for obtaining\nmathematically interpretable representations of neural signals. For example,\nswitching and decomposed models describe complex systems using latent variables\nthat evolve according to simple locally linear dynamics. However, existing\nmethods for latent variable estimation are not robust to dynamical noise and\nsystem nonlinearity due to noise-sensitive inference procedures and limited\nmodel formulations. This can lead to inconsistent results on signals with\nsimilar dynamics, limiting the model's ability to provide scientific insight.\nIn this work, we address these limitations and propose a probabilistic approach\nto latent variable estimation in decomposed models that improves robustness\nagainst dynamical noise. Additionally, we introduce an extended latent dynamics\nmodel to improve robustness against system nonlinearities. We evaluate our\napproach on several synthetic dynamical systems, including an\nempirically-derived brain-computer interface experiment, and demonstrate more\naccurate latent variable inference in nonlinear systems with diverse noise\nconditions. Furthermore, we apply our method to a real-world clinical\nneurophysiology dataset, illustrating the ability to identify interpretable and\ncoherent structure where previous models cannot.\n","authors":["Yenho Chen","Noga Mudrik","Kyle A. Johnsen","Sankaraleengam Alagapan","Adam S. Charles","Christopher J. Rozell"],"pdf_url":"https://arxiv.org/pdf/2408.16862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16852v1","updated":"2024-08-29T18:34:59Z","published":"2024-08-29T18:34:59Z","title":"The Star Geometry of Critic-Based Regularizer Learning","summary":" Variational regularization is a classical technique to solve statistical\ninference tasks and inverse problems, with modern data-driven approaches\nparameterizing regularizers via deep neural networks showcasing impressive\nempirical performance. Recent works along these lines learn task-dependent\nregularizers. This is done by integrating information about the measurements\nand ground-truth data in an unsupervised, critic-based loss function, where the\nregularizer attributes low values to likely data and high values to unlikely\ndata. However, there is little theory about the structure of regularizers\nlearned via this process and how it relates to the two data distributions. 
To\nmake progress on this challenge, we initiate a study of optimizing critic-based\nloss functions to learn regularizers over a particular family of regularizers:\ngauges (or Minkowski functionals) of star-shaped bodies. This family contains\nregularizers that are commonly employed in practice and shares properties with\nregularizers parameterized by deep neural networks. We specifically investigate\ncritic-based losses derived from variational representations of statistical\ndistances between probability measures. By leveraging tools from star geometry\nand dual Brunn-Minkowski theory, we illustrate how these losses can be\ninterpreted as dual mixed volumes that depend on the data distribution. This\nallows us to derive exact expressions for the optimal regularizer in certain\ncases. Finally, we identify which neural network architectures give rise to\nsuch star body gauges and when do such regularizers have favorable properties\nfor optimization. More broadly, this work highlights how the tools of star\ngeometry can aid in understanding the geometry of unsupervised regularizer\nlearning.\n","authors":["Oscar Leong","Eliza O'Reilly","Yong Sheng Soh"],"pdf_url":"https://arxiv.org/pdf/2408.16852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16849v1","updated":"2024-08-29T18:27:32Z","published":"2024-08-29T18:27:32Z","title":"Machine Learning-Based Research on the Adaptability of Adolescents to\n Online Education","summary":" With the rapid advancement of internet technology, the adaptability of\nadolescents to online learning has emerged as a focal point of interest within\nthe educational sphere. However, the academic community's efforts to develop\npredictive models for adolescent online learning adaptability require further\nrefinement and expansion. Utilizing data from the \"Chinese Adolescent Online\nEducation Survey\" spanning the years 2014 to 2016, this study implements five\nmachine learning algorithms - logistic regression, K-nearest neighbors, random\nforest, XGBoost, and CatBoost - to analyze the factors influencing adolescent\nonline learning adaptability and to determine the model best suited for\nprediction. The research reveals that the duration of courses, the financial\nstatus of the family, and age are the primary factors affecting students'\nadaptability in online learning environments. Additionally, age significantly\nimpacts students' adaptive capacities. Among the predictive models, the random\nforest, XGBoost, and CatBoost algorithms demonstrate superior forecasting\ncapabilities, with the random forest model being particularly adept at\ncapturing the characteristics of students' adaptability.\n","authors":["Mingwei Wang","Sitong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16849v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.16625v1","updated":"2024-08-29T15:34:25Z","published":"2024-08-29T15:34:25Z","title":"MultiMediate'24: Multi-Domain Engagement Estimation","summary":" Estimating the momentary level of participant's engagement is an important\nprerequisite for assistive systems that support human interactions. Previous\nwork has addressed this task in within-domain evaluation scenarios, i.e.\ntraining and testing on the same dataset. This is in contrast to real-life\nscenarios where domain shifts between training and testing data frequently\noccur. With MultiMediate'24, we present the first challenge addressing\nmulti-domain engagement estimation. 
As training data, we utilise the NOXI\ndatabase of dyadic novice-expert interactions. In addition to within-domain\ntest data, we add two new test domains. First, we introduce recordings\nfollowing the NOXI protocol but covering languages that are not present in the\nNOXI training data. Second, we collected novel engagement annotations on the\nMPIIGroupInteraction dataset which consists of group discussions between three\nto four people. In this way, MultiMediate'24 evaluates the ability of\napproaches to generalise across factors such as language and cultural\nbackground, group size, task, and screen-mediated vs. face-to-face interaction.\nThis paper describes the MultiMediate'24 challenge and presents baseline\nresults. In addition, we discuss selected challenge solutions.\n","authors":["Philipp Müller","Michal Balazia","Tobias Baur","Michael Dietz","Alexander Heimerl","Anna Penzkofer","Dominik Schiller","François Brémond","Jan Alexandersson","Elisabeth André","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2408.16625v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2308.08256"},{"id":"http://arxiv.org/abs/2408.16564v1","updated":"2024-08-29T14:30:56Z","published":"2024-08-29T14:30:56Z","title":"Human-Inspired Audio-Visual Speech Recognition: Spike Activity, Cueing\n Interaction and Causal Processing","summary":" Humans naturally perform audiovisual speech recognition (AVSR), enhancing the\naccuracy and robustness by integrating auditory and visual information. Spiking\nneural networks (SNNs), which mimic the brain's information-processing\nmechanisms, are well-suited for emulating the human capability of AVSR. Despite\ntheir potential, research on SNNs for AVSR is scarce, with most existing\naudio-visual multimodal methods focused on object or digit recognition. These\nmodels simply integrate features from both modalities, neglecting their unique\ncharacteristics and interactions. Additionally, they often rely on future\ninformation for current processing, which increases recognition latency and\nlimits real-time applicability. Inspired by human speech perception, this paper\nproposes a novel human-inspired SNN named HI-AVSNN for AVSR, incorporating\nthree key characteristics: cueing interaction, causal processing and spike\nactivity. For cueing interaction, we propose a visual-cued auditory attention\nmodule (VCA2M) that leverages visual cues to guide attention to auditory\nfeatures. We achieve causal processing by aligning the SNN's temporal dimension\nwith that of visual and auditory features and applying temporal masking to\nutilize only past and current information. To implement spike activity, in\naddition to using SNNs, we leverage the event camera to capture lip movement as\nspikes, mimicking the human retina and providing efficient visual data. We\nevaluate HI-AVSNN on an audiovisual speech recognition dataset combining the\nDVS-Lip dataset with its corresponding audio samples. 
Experimental results\ndemonstrate the superiority of our proposed fusion method, outperforming\nexisting audio-visual SNN fusion methods and achieving a 2.27% improvement in\naccuracy over the only existing SNN-based AVSR method.\n","authors":["Qianhui Liu","Jiadong Wang","Yang Wang","Xin Yang","Gang Pan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2408.16564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16532v1","updated":"2024-08-29T13:43:36Z","published":"2024-08-29T13:43:36Z","title":"WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio\n Language Modeling","summary":" Language models have been effectively applied to modeling natural signals,\nsuch as images, video, speech, and audio. A crucial component of these models\nis the codec tokenizer, which compresses high-dimensional natural signals into\nlower-dimensional discrete tokens. In this paper, we introduce WavTokenizer,\nwhich offers several advantages over previous SOTA acoustic codec models in the\naudio domain: 1)extreme compression. By compressing the layers of quantizers\nand the temporal dimension of the discrete codec, one-second audio of 24kHz\nsampling rate requires only a single quantizer with 40 or 75 tokens. 2)improved\nsubjective quality. Despite the reduced number of tokens, WavTokenizer achieves\nstate-of-the-art reconstruction quality with outstanding UTMOS scores and\ninherently contains richer semantic information. Specifically, we achieve these\nresults by designing a broader VQ space, extended contextual windows, and\nimproved attention networks, as well as introducing a powerful multi-scale\ndiscriminator and an inverse Fourier transform structure. We conducted\nextensive reconstruction experiments in the domains of speech, audio, and\nmusic. WavTokenizer exhibited strong performance across various objective and\nsubjective metrics compared to state-of-the-art models. We also tested semantic\ninformation, VQ utilization, and adaptability to generative models.\nComprehensive ablation studies confirm the necessity of each module in\nWavTokenizer. The related code, demos, and pre-trained models are available at\nhttps://github.com/jishengpeng/WavTokenizer.\n","authors":["Shengpeng Ji","Ziyue Jiang","Xize Cheng","Yifu Chen","Minghui Fang","Jialong Zuo","Qian Yang","Ruiqi Li","Ziang Zhang","Xiaoda Yang","Rongjie Huang","Yidi Jiang","Qian Chen","Siqi Zheng","Wen Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.16532v1.pdf","comment":"Working in progress. arXiv admin note: text overlap with\n arXiv:2402.12208"},{"id":"http://arxiv.org/abs/2408.16879v1","updated":"2024-08-29T20:05:02Z","published":"2024-08-29T20:05:02Z","title":"MSLIQA: Enhancing Learning Representations for Image Quality Assessment\n through Multi-Scale Learning","summary":" No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due\nto the diversity of distortions and the lack of large annotated datasets. Many\nstudies have attempted to tackle these challenges by developing more accurate\nNR-IQA models, often employing complex and computationally expensive networks,\nor by bridging the domain gap between various distortions to enhance\nperformance on test datasets. In our work, we improve the performance of a\ngeneric lightweight NR-IQA model by introducing a novel augmentation strategy\nthat boosts its performance by almost 28\\%. This augmentation strategy enables\nthe network to better discriminate between different distortions in various\nparts of the image by zooming in and out. 
Additionally, the inclusion of\ntest-time augmentation further enhances performance, making our lightweight\nnetwork's results comparable to the current state-of-the-art models, simply\nthrough the use of augmentations.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildiyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.16879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16809v1","updated":"2024-08-29T17:59:57Z","published":"2024-08-29T17:59:57Z","title":"See or Guess: Counterfactually Regularized Image Captioning","summary":" Image captioning, which generates natural language descriptions of the visual\ninformation in an image, is a crucial task in vision-language research.\nPrevious models have typically addressed this task by aligning the generative\ncapabilities of machines with human intelligence through statistical fitting of\nexisting datasets. While effective for normal images, they may struggle to\naccurately describe those where certain parts of the image are obscured or\nedited, unlike humans who excel in such cases. These weaknesses they exhibit,\nincluding hallucinations and limited interpretability, often hinder performance\nin scenarios with shifted association patterns. In this paper, we present a\ngeneric image captioning framework that employs causal inference to make\nexisting models more capable of interventional tasks, and counterfactually\nexplainable. Our approach includes two variants leveraging either total effect\nor natural direct effect. Integrating them into the training process enables\nmodels to handle counterfactual scenarios, increasing their generalizability.\nExtensive experiments on various datasets show that our method effectively\nreduces hallucinations and improves the model's faithfulness to images,\ndemonstrating high portability across both small-scale and large-scale\nimage-to-text models. The code is available at\nhttps://github.com/Aman-4-Real/See-or-Guess.\n","authors":["Qian Cao","Xu Chen","Ruihua Song","Xiting Wang","Xinting Huang","Yuchen Ren"],"pdf_url":"https://arxiv.org/pdf/2408.16809v1.pdf","comment":"Accepted by ACM MM 2024"}]},"2024-08-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.17443v1","updated":"2024-08-30T17:52:55Z","published":"2024-08-30T17:52:55Z","title":"Bridging Episodes and Semantics: A Novel Framework for Long-Form Video\n Understanding","summary":" While existing research often treats long-form videos as extended short\nvideos, we propose a novel approach that more accurately reflects human\ncognition. This paper introduces BREASE: BRidging Episodes And SEmantics for\nLong-Form Video Understanding, a model that simulates episodic memory\naccumulation to capture action sequences and reinforces them with semantic\nknowledge dispersed throughout the video. Our work makes two key contributions:\nFirst, we develop an Episodic COmpressor (ECO) that efficiently aggregates\ncrucial representations from micro to semi-macro levels. Second, we propose a\nSemantics reTRiever (SeTR) that enhances these aggregated representations with\nsemantic information by focusing on the broader context, dramatically reducing\nfeature dimensionality while preserving relevant macro-level information.\nExtensive experiments demonstrate that BREASE achieves state-of-the-art\nperformance across multiple long video understanding benchmarks in both\nzero-shot and fully-supervised settings. 
The project page and code are at:\nhttps://joslefaure.github.io/assets/html/hermes.html.\n","authors":["Gueter Josmy Faure","Jia-Fong Yeh","Min-Hung Chen","Hung-Ting Su","Winston H. Hsu","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.17443v1.pdf","comment":"Accepted to the EVAL-FoMo Workshop at ECCV'24. Project page:\n https://joslefaure.github.io/assets/html/hermes.html"},{"id":"http://arxiv.org/abs/2408.17437v1","updated":"2024-08-30T17:41:30Z","published":"2024-08-30T17:41:30Z","title":"SYNTHEVAL: Hybrid Behavioral Testing of NLP Models with Synthetic\n CheckLists","summary":" Traditional benchmarking in NLP typically involves using static held-out test\nsets. However, this approach often results in an overestimation of performance\nand lacks the ability to offer comprehensive, interpretable, and dynamic\nassessments of NLP models. Recently, works like DynaBench (Kiela et al., 2021)\nand CheckList (Ribeiro et al., 2020) have addressed these limitations through\nbehavioral testing of NLP models with test types generated by a multistep\nhuman-annotated pipeline. Unfortunately, manually creating a variety of test\ntypes requires much human labor, often at prohibitive cost. In this work, we\npropose SYNTHEVAL, a hybrid behavioral testing framework that leverages large\nlanguage models (LLMs) to generate a wide range of test types for a\ncomprehensive evaluation of NLP models. SYNTHEVAL first generates sentences via\nLLMs using controlled generation, and then identifies challenging examples by\ncomparing the predictions made by LLMs with task-specific NLP models. In the\nlast stage, human experts investigate the challenging examples, manually design\ntemplates, and identify the types of failures the taskspecific models\nconsistently exhibit. We apply SYNTHEVAL to two classification tasks, sentiment\nanalysis and toxic language detection, and show that our framework is effective\nin identifying weaknesses of strong models on these tasks. We share our code in\nhttps://github.com/Loreley99/SynthEval_CheckList.\n","authors":["Raoyuan Zhao","Abdullatif Köksal","Yihong Liu","Leonie Weissweiler","Anna Korhonen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2408.17437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17428v1","updated":"2024-08-30T17:26:05Z","published":"2024-08-30T17:26:05Z","title":"CLOCR-C: Context Leveraging OCR Correction with Pre-trained Language\n Models","summary":" The digitisation of historical print media archives is crucial for increasing\naccessibility to contemporary records. However, the process of Optical\nCharacter Recognition (OCR) used to convert physical records to digital text is\nprone to errors, particularly in the case of newspapers and periodicals due to\ntheir complex layouts. This paper introduces Context Leveraging OCR Correction\n(CLOCR-C), which utilises the infilling and context-adaptive abilities of\ntransformer-based language models (LMs) to improve OCR quality. The study aims\nto determine if LMs can perform post-OCR correction, improve downstream NLP\ntasks, and the value of providing the socio-cultural context as part of the\ncorrection process. Experiments were conducted using seven LMs on three\ndatasets: the 19th Century Serials Edition (NCSE) and two datasets from the\nOverproof collection. The results demonstrate that some LMs can significantly\nreduce error rates, with the top-performing model achieving over a 60%\nreduction in character error rate on the NCSE dataset. 
The OCR improvements\nextend to downstream tasks, such as Named Entity Recognition, with increased\nCosine Named Entity Similarity. Furthermore, the study shows that providing\nsocio-cultural context in the prompts improves performance, while misleading\nprompts lower performance. In addition to the findings, this study releases a\ndataset of 91 transcribed articles from the NCSE, containing a total of 40\nthousand words, to support further research in this area. The findings suggest\nthat CLOCR-C is a promising approach for enhancing the quality of existing\ndigital archives by leveraging the socio-cultural information embedded in the\nLMs and the text requiring correction.\n","authors":["Jonathan Bourne"],"pdf_url":"https://arxiv.org/pdf/2408.17428v1.pdf","comment":"13 pages, 3 figures, currently under peer review"},{"id":"http://arxiv.org/abs/2403.12212v2","updated":"2024-08-30T17:02:11Z","published":"2024-03-18T19:53:56Z","title":"Evaluating Named Entity Recognition: A comparative analysis of mono- and\n multilingual transformer models on a novel Brazilian corporate earnings call\n transcripts dataset","summary":" Since 2018, when the Transformer architecture was introduced, Natural\nLanguage Processing has gained significant momentum with pre-trained\nTransformer-based models that can be fine-tuned for various tasks. Most models\nare pre-trained on large English corpora, making them less applicable to other\nlanguages, such as Brazilian Portuguese. In our research, we identified two\nmodels pre-trained in Brazilian Portuguese (BERTimbau and PTT5) and two\nmultilingual models (mBERT and mT5). BERTimbau and mBERT use only the Encoder\nmodule, while PTT5 and mT5 use both the Encoder and Decoder. Our study aimed to\nevaluate their performance on a financial Named Entity Recognition (NER) task\nand determine the computational requirements for fine-tuning and inference. To\nthis end, we developed the Brazilian Financial NER (BraFiNER) dataset,\ncomprising sentences from Brazilian banks' earnings calls transcripts annotated\nusing a weakly supervised approach. Additionally, we introduced a novel\napproach that reframes the token classification task as a text generation\nproblem. After fine-tuning the models, we evaluated them using performance and\nerror metrics. Our findings reveal that BERT-based models consistently\noutperform T5-based models. While the multilingual models exhibit comparable\nmacro F1-scores, BERTimbau demonstrates superior performance over PTT5. In\nterms of error metrics, BERTimbau outperforms the other models. We also\nobserved that PTT5 and mT5 generated sentences with changes in monetary and\npercentage values, highlighting the importance of accuracy and consistency in\nthe financial domain. Our findings provide insights into the differing\nperformance of BERT- and T5-based models for the NER task.\n","authors":["Ramon Abilio","Guilherme Palermo Coelho","Ana Estela Antunes da Silva"],"pdf_url":"https://arxiv.org/pdf/2403.12212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06120v2","updated":"2024-08-30T16:42:05Z","published":"2024-02-09T01:10:25Z","title":"Exploring Group and Symmetry Principles in Large Language Models","summary":" Large Language Models (LLMs) have demonstrated impressive performance across\na wide range of applications; however, assessing their reasoning capabilities\nremains a significant challenge. 
In this paper, we introduce a framework\ngrounded in group and symmetry principles, which have played a crucial role in\nfields such as physics and mathematics, and offer another way to evaluate their\ncapabilities. While the proposed framework is general, to showcase the benefits\nof employing these properties, we focus on arithmetic reasoning and investigate\nthe performance of these models on four group properties: closure, identity,\ninverse, and associativity. Our findings reveal that LLMs studied in this work\nstruggle to preserve group properties across different test regimes. In the\nclosure test, we observe biases towards specific outputs and an abrupt\ndegradation in their performance from 100% to 0% after a specific sequence\nlength. They also perform poorly in the identity test, which represents adding\nirrelevant information in the context, and show sensitivity when subjected to\ninverse test, which examines the robustness of the model with respect to\nnegation. In addition, we demonstrate that breaking down problems into smaller\nsteps helps LLMs in the associativity test that we have conducted. To support\nthese tests we have developed a synthetic dataset which will be released.\n","authors":["Shima Imani","Hamid Palangi"],"pdf_url":"https://arxiv.org/pdf/2402.06120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02175v3","updated":"2024-08-30T16:40:15Z","published":"2024-05-03T15:25:48Z","title":"Hoaxpedia: A Unified Wikipedia Hoax Articles Dataset","summary":" Hoaxes are a recognised form of disinformation created deliberately, with\npotential serious implications in the credibility of reference knowledge\nresources such as Wikipedia. What makes detecting Wikipedia hoaxes hard is that\nthey often are written according to the official style guidelines. In this\nwork, we first provide a systematic analysis of similarities and discrepancies\nbetween legitimate and hoax Wikipedia articles, and introduce Hoaxpedia, a\ncollection of 311 hoax articles (from existing literature and official\nWikipedia lists), together with semantically similar legitimate articles, which\ntogether form a binary text classification dataset aimed at fostering research\nin automated hoax detection. In this paper, We report results after analyzing\nseveral language models, hoax-to-legit ratios, and the amount of text\nclassifiers are exposed to (full article vs the article's definition alone).\nOur results suggest that detecting deceitful content in Wikipedia based on\ncontent alone is hard but feasible, and complement our analysis with a study on\nthe differences in distributions in edit histories, and find that looking at\nthis feature yields better classification results than context.\n","authors":["Hsuvas Borkakoty","Luis Espinosa-Anke"],"pdf_url":"https://arxiv.org/pdf/2405.02175v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15379v2","updated":"2024-08-30T16:30:39Z","published":"2024-08-27T19:33:15Z","title":"DualKanbaFormer: Kolmogorov-Arnold Networks and State Space Model\n Transformer for Multimodal Aspect-based Sentiment Analysis","summary":" Multimodal aspect-based sentiment analysis (MABSA) enhances sentiment\ndetection by combining text with other data types like images. However, despite\nsetting significant benchmarks, attention mechanisms exhibit limitations in\nefficiently modelling long-range dependencies between aspect and opinion\ntargets within the text. They also face challenges in capturing global-context\ndependencies for visual representations. 
To this end, we propose\nKolmogorov-Arnold Networks (KANs) and Selective State Space model (Mamba)\ntransformer (DualKanbaFormer), a novel architecture to address the above\nissues. We leverage the power of Mamba to capture global context dependencies,\nMulti-head Attention (MHA) to capture local context dependencies, and KANs to\ncapture non-linear modelling patterns for both textual representations (textual\nKanbaFormer) and visual representations (visual KanbaFormer). Furthermore, we\nfuse the textual KanbaFormer and visual KanbaFomer with a gated fusion layer to\ncapture the inter-modality dynamics. According to extensive experimental\nresults, our model outperforms some state-of-the-art (SOTA) studies on two\npublic datasets.\n","authors":["Adamu Lawan","Juhua Pu","Haruna Yunusa","Muhammad Lawan","Aliyu Umar","Adamu Sani Yahya"],"pdf_url":"https://arxiv.org/pdf/2408.15379v2.pdf","comment":"10 pages, 2 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2405.12363v2","updated":"2024-08-30T16:23:13Z","published":"2024-05-20T20:27:00Z","title":"Question-Based Retrieval using Atomic Units for Enterprise RAG","summary":" Enterprise retrieval augmented generation (RAG) offers a highly flexible\nframework for combining powerful large language models (LLMs) with internal,\npossibly temporally changing, documents. In RAG, documents are first chunked.\nRelevant chunks are then retrieved for a user query, which are passed as\ncontext to a synthesizer LLM to generate the query response. However, the\nretrieval step can limit performance, as incorrect chunks can lead the\nsynthesizer LLM to generate a false response. This work applies a zero-shot\nadaptation of standard dense retrieval steps for more accurate chunk recall.\nSpecifically, a chunk is first decomposed into atomic statements. A set of\nsynthetic questions are then generated on these atoms (with the chunk as the\ncontext). Dense retrieval involves finding the closest set of synthetic\nquestions, and associated chunks, to the user query. It is found that retrieval\nwith the atoms leads to higher recall than retrieval with chunks. Further\nperformance gain is observed with retrieval using the synthetic questions\ngenerated over the atoms. Higher recall at the retrieval step enables higher\nperformance of the enterprise LLM using the RAG pipeline.\n","authors":["Vatsal Raina","Mark Gales"],"pdf_url":"https://arxiv.org/pdf/2405.12363v2.pdf","comment":"14 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.17377v1","updated":"2024-08-30T16:13:49Z","published":"2024-08-30T16:13:49Z","title":"NDP: Next Distribution Prediction as a More Broad Target","summary":" Large language models (LLMs) trained on next-token prediction (NTP) paradigm\nhave demonstrated powerful capabilities. However, the existing NTP paradigm\ncontains several limitations, particularly related to planned task\ncomplications and error propagation during inference. In our work, we extend\nthe critique of NTP, highlighting its limitation also due to training with a\nnarrow objective: the prediction of a sub-optimal one-hot distribution. To\nsupport this critique, we conducted a pre-experiment treating the output\ndistribution from powerful LLMs as efficient world data compression. By\nevaluating the similarity between the $n$-gram distribution and the one-hot\ndistribution with LLMs, we observed that the $n$-gram distributions align more\nclosely with the output distribution of LLMs. 
Based on this insight, we\nintroduce Next Distribution Prediction (NDP), which uses $n$-gram distributions\nto replace the one-hot targets, enhancing learning without extra online\ntraining time. We conducted experiments across translation, general task,\nlanguage transfer, and medical domain adaptation. Compared to NTP, NDP can\nachieve up to +2.97 COMET improvement in translation tasks, +0.61 average\nimprovement in general tasks, and incredible +10.75 average improvement in the\nmedical domain. This demonstrates the concrete benefits of addressing the\ntarget narrowing problem, pointing to a new direction for future work on\nimproving NTP.\n","authors":["Junhao Ruan","Abudukeyumu Abudula","Xinyu Liu","Bei Li","Yinqiao Li","Chenglong Wang","Yuchun Fan","Yuan Ge","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.17377v1.pdf","comment":"8 pages,5 figures"},{"id":"http://arxiv.org/abs/2404.00458v2","updated":"2024-08-30T15:59:46Z","published":"2024-03-30T19:45:04Z","title":"Beyond One-Size-Fits-All: Multi-Domain, Multi-Task Framework for\n Embedding Model Selection","summary":" This position paper proposes a systematic approach towards developing a\nframework to help select the most effective embedding models for natural\nlanguage processing (NLP) tasks, addressing the challenge posed by the\nproliferation of both proprietary and open-source encoder models.\n","authors":["Vivek Khetan"],"pdf_url":"https://arxiv.org/pdf/2404.00458v2.pdf","comment":"It was an initial idea - we plan to work on a detailed version"},{"id":"http://arxiv.org/abs/2408.17362v1","updated":"2024-08-30T15:52:41Z","published":"2024-08-30T15:52:41Z","title":"Assessing Generative Language Models in Classification Tasks:\n Performance and Self-Evaluation Capabilities in the Environmental and Climate\n Change Domain","summary":" This paper examines the performance of two Large Language Models (LLMs),\nGPT3.5 and Llama2 and one Small Language Model (SLM) Gemma, across three\ndifferent classification tasks within the climate change (CC) and environmental\ndomain. Employing BERT-based models as a baseline, we compare their efficacy\nagainst these transformer-based models. Additionally, we assess the models'\nself-evaluation capabilities by analyzing the calibration of verbalized\nconfidence scores in these text classification tasks. Our findings reveal that\nwhile BERT-based models generally outperform both the LLMs and SLM, the\nperformance of the large generative models is still noteworthy. Furthermore,\nour calibration analysis reveals that although Gemma is well-calibrated in\ninitial tasks, it thereafter produces inconsistent results; Llama is reasonably\ncalibrated, and GPT consistently exhibits strong calibration. Through this\nresearch, we aim to contribute to the ongoing discussion on the utility and\neffectiveness of generative LMs in addressing some of the planet's most urgent\nissues, highlighting their strengths and limitations in the context of ecology\nand CC.\n","authors":["Francesca Grasso","Stefano Locci"],"pdf_url":"https://arxiv.org/pdf/2408.17362v1.pdf","comment":"11 pages, to be published in NLDB 2024"},{"id":"http://arxiv.org/abs/2408.09869v3","updated":"2024-08-30T15:05:58Z","published":"2024-08-19T10:20:06Z","title":"Docling Technical Report","summary":" This technical report introduces Docling, an easy to use, self-contained,\nMIT-licensed open-source package for PDF document conversion. 
It is powered by\nstate-of-the-art specialized AI models for layout analysis (DocLayNet) and\ntable structure recognition (TableFormer), and runs efficiently on commodity\nhardware in a small resource budget. The code interface allows for easy\nextensibility and addition of new features and models.\n","authors":["Christoph Auer","Maksym Lysak","Ahmed Nassar","Michele Dolfi","Nikolaos Livathinos","Panos Vagenas","Cesar Berrospi Ramis","Matteo Omenetti","Fabian Lindlbauer","Kasper Dinkla","Lokesh Mishra","Yusik Kim","Shubham Gupta","Rafael Teixeira de Lima","Valery Weber","Lucas Morin","Ingmar Meijer","Viktor Kuropiatnyk","Peter W. J. Staar"],"pdf_url":"https://arxiv.org/pdf/2408.09869v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15569v2","updated":"2024-08-30T14:52:24Z","published":"2024-07-22T11:55:14Z","title":"An Empirical Study of Retrieval Augmented Generation with\n Chain-of-Thought","summary":" Since the launch of ChatGPT at the end of 2022, generative dialogue models\nrepresented by ChatGPT have quickly become essential tools in daily life. As\nuser expectations increase, enhancing the capability of generative dialogue\nmodels to solve complex problems has become a focal point of current research.\nThis paper delves into the effectiveness of the RAFT (Retrieval Augmented\nFine-Tuning) method in improving the performance of Generative dialogue models.\nRAFT combines chain-of-thought with model supervised fine-tuning (SFT) and\nretrieval augmented generation (RAG), which significantly enhanced the model's\ninformation extraction and logical reasoning abilities. We evaluated the RAFT\nmethod across multiple datasets and analysed its performance in various\nreasoning tasks, including long-form QA and short-form QA tasks, tasks in both\nChinese and English, and supportive and comparison reasoning tasks. Notably, it\naddresses the gaps in previous research regarding long-form QA tasks and\nChinese datasets. Moreover, we also evaluate the benefit of the\nchain-of-thought (CoT) in the RAFT method. This work offers valuable insights\nfor studies focused on enhancing the performance of generative dialogue models.\n","authors":["Yuetong Zhao","Hongyu Cao","Xianyu Zhao","Zhijian Ou"],"pdf_url":"https://arxiv.org/pdf/2407.15569v2.pdf","comment":"Accepted by ISCSLP 2024"},{"id":"http://arxiv.org/abs/2402.01676v2","updated":"2024-08-30T14:43:22Z","published":"2024-01-19T19:36:54Z","title":"Language models align with human judgments on key grammatical\n constructions","summary":" Do large language models (LLMs) make human-like linguistic generalizations?\nDentella et al. (2023) (\"DGL\") prompt several LLMs (\"Is the following sentence\ngrammatically correct in English?\") to elicit grammaticality judgments of 80\nEnglish sentences, concluding that LLMs demonstrate a \"yes-response bias\" and a\n\"failure to distinguish grammatical from ungrammatical sentences\". We\nre-evaluate LLM performance using well-established practices and find that\nDGL's data in fact provide evidence for just how well LLMs capture human\nbehaviors. Models not only achieve high accuracy overall, but also capture\nfine-grained variation in human linguistic judgments.\n","authors":["Jennifer Hu","Kyle Mahowald","Gary Lupyan","Anna Ivanova","Roger Levy"],"pdf_url":"https://arxiv.org/pdf/2402.01676v2.pdf","comment":"Published in PNAS at https://www.pnas.org/doi/10.1073/pnas.2400917121\n as response to Dentella et al. 
(2023)"},{"id":"http://arxiv.org/abs/2408.17325v1","updated":"2024-08-30T14:37:10Z","published":"2024-08-30T14:37:10Z","title":"Impact of ChatGPT on the writing style of condensed matter physicists","summary":" We apply a state-of-the-art difference-in-differences approach to estimate\nthe impact of ChatGPT's release on the writing style of condensed matter papers\non arXiv. Our analysis reveals a statistically significant improvement in the\nEnglish quality of abstracts written by non-native English speakers.\nImportantly, this improvement remains robust even after accounting for other\npotential factors, confirming that it can be attributed to the release of\nChatGPT. This indicates widespread adoption of the tool. Following the release\nof ChatGPT, there is a significant increase in the use of unique words, while\nthe frequency of rare words decreases. Across language families, the changes in\nwriting style are significant for authors from the Latin and Ural-Altaic\ngroups, but not for those from the Germanic or other Indo-European groups.\n","authors":["Shaojun Xu","Xiaohui Ye","Mengqi Zhang","Pei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.17325v1.pdf","comment":"9 pages, 1 figure, 7 tables"},{"id":"http://arxiv.org/abs/2408.17324v1","updated":"2024-08-30T14:35:01Z","published":"2024-08-30T14:35:01Z","title":"Modularity in Transformers: Investigating Neuron Separability &\n Specialization","summary":" Transformer models are increasingly prevalent in various applications, yet\nour understanding of their internal workings remains limited. This paper\ninvestigates the modularity and task specialization of neurons within\ntransformer architectures, focusing on both vision (ViT) and language (Mistral\n7B) models. Using a combination of selective pruning and MoEfication clustering\ntechniques, we analyze the overlap and specialization of neurons across\ndifferent tasks and data subsets. Our findings reveal evidence of task-specific\nneuron clusters, with varying degrees of overlap between related tasks. We\nobserve that neuron importance patterns persist to some extent even in randomly\ninitialized models, suggesting an inherent structure that training refines.\nAdditionally, we find that neuron clusters identified through MoEfication\ncorrespond more strongly to task-specific neurons in earlier and later layers\nof the models. This work contributes to a more nuanced understanding of\ntransformer internals and offers insights into potential avenues for improving\nmodel interpretability and efficiency.\n","authors":["Nicholas Pochinkov","Thomas Jones","Mohammed Rashidur Rahman"],"pdf_url":"https://arxiv.org/pdf/2408.17324v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.17322v1","updated":"2024-08-30T14:32:25Z","published":"2024-08-30T14:32:25Z","title":"Investigating Neuron Ablation in Attention Heads: The Case for Peak\n Activation Centering","summary":" The use of transformer-based models is growing rapidly throughout society.\nWith this growth, it is important to understand how they work, and in\nparticular, how the attention mechanisms represent concepts. Though there are\nmany interpretability methods, many look at models through their neuronal\nactivations, which are poorly understood. 
We describe different lenses through\nwhich to view neuron activations, and investigate the effectiveness in language\nmodels and vision transformers through various methods of neural ablation: zero\nablation, mean ablation, activation resampling, and a novel approach we term\n'peak ablation'. Through experimental analysis, we find that in different\nregimes and models, each method can offer the lowest degradation of model\nperformance compared to other methods, with resampling usually causing the most\nsignificant performance deterioration. We make our code available at\nhttps://github.com/nickypro/investigating-ablation.\n","authors":["Nicholas Pochinkov","Ben Pasero","Skylar Shibayama"],"pdf_url":"https://arxiv.org/pdf/2408.17322v1.pdf","comment":"9 pages, 2 figures, XAI World Conference 2024 Late-Breaking Work"},{"id":"http://arxiv.org/abs/2408.17316v1","updated":"2024-08-30T14:23:40Z","published":"2024-08-30T14:23:40Z","title":"Bridging Domain Knowledge and Process Discovery Using Large Language\n Models","summary":" Discovering good process models is essential for different process analysis\ntasks such as conformance checking and process improvements. Automated process\ndiscovery methods often overlook valuable domain knowledge. This knowledge,\nincluding insights from domain experts and detailed process documentation,\nremains largely untapped during process discovery. This paper leverages Large\nLanguage Models (LLMs) to integrate such knowledge directly into process\ndiscovery. We use rules derived from LLMs to guide model construction, ensuring\nalignment with both domain knowledge and actual process executions. By\nintegrating LLMs, we create a bridge between process knowledge expressed in\nnatural language and the discovery of robust process models, advancing process\ndiscovery methodologies significantly. To showcase the usability of our\nframework, we conducted a case study with the UWV employee insurance agency,\ndemonstrating its practical benefits and effectiveness.\n","authors":["Ali Norouzifar","Humam Kourani","Marcus Dees","Wil van der Aalst"],"pdf_url":"https://arxiv.org/pdf/2408.17316v1.pdf","comment":"This paper is accepted at the AI4BPM 2024 workshop and to be\n published in their proceedings"},{"id":"http://arxiv.org/abs/2408.17308v1","updated":"2024-08-30T14:12:04Z","published":"2024-08-30T14:12:04Z","title":"Towards Tailored Recovery of Lexical Diversity in Literary Machine\n Translation","summary":" Machine translations are found to be lexically poorer than human\ntranslations. The loss of lexical diversity through MT poses an issue in the\nautomatic translation of literature, where it matters not only what is written,\nbut also how it is written. Current methods for increasing lexical diversity in\nMT are rigid. Yet, as we demonstrate, the degree of lexical diversity can vary\nconsiderably across different novels. Thus, rather than aiming for the rigid\nincrease of lexical diversity, we reframe the task as recovering what is lost\nin the machine translation process. We propose a novel approach that consists\nof reranking translation candidates with a classifier that distinguishes\nbetween original and translated text. 
We evaluate our approach on 31\nEnglish-to-Dutch book translations, and find that, for certain books, our\napproach retrieves lexical diversity scores that are close to human\ntranslation.\n","authors":["Esther Ploeger","Huiyuan Lai","Rik van Noord","Antonio Toral"],"pdf_url":"https://arxiv.org/pdf/2408.17308v1.pdf","comment":"Accepted to EAMT 2024"},{"id":"http://arxiv.org/abs/2310.09762v2","updated":"2024-08-30T13:39:56Z","published":"2023-10-15T07:20:28Z","title":"Diversifying the Mixture-of-Experts Representation for Language Models\n with Orthogonal Optimizer","summary":" The Mixture of Experts (MoE) has emerged as a highly successful technique in\ndeep learning, based on the principle of divide-and-conquer to maximize model\ncapacity without significant additional computational cost. Even in the era of\nlarge-scale language models (LLMs), MoE continues to play a crucial role, as\nsome researchers have indicated that GPT-4 adopts the MoE structure to ensure\ndiverse inference results. However, MoE is susceptible to performance\ndegeneracy, particularly evident in the issues of imbalance and homogeneous\nrepresentation among experts. While previous studies have extensively addressed\nthe problem of imbalance, the challenge of homogeneous representation remains\nunresolved. In this study, we shed light on the homogeneous representation\nproblem, wherein experts in the MoE fail to specialize and lack diversity,\nleading to frustratingly high similarities in their representations (up to 99\\%\nin a well-performed MoE model). This problem restricts the expressive power of\nthe MoE and, we argue, contradicts its original intention. To tackle this\nissue, we propose a straightforward yet highly effective solution: OMoE, an\northogonal expert optimizer. Additionally, we introduce an alternating training\nstrategy that encourages each expert to update in a direction orthogonal to the\nsubspace spanned by other experts. Our algorithm facilitates MoE training in\ntwo key ways: firstly, it explicitly enhances representation diversity, and\nsecondly, it implicitly fosters interaction between experts during orthogonal\nweights computation. Through extensive experiments, we demonstrate that our\nproposed optimization algorithm significantly improves the performance of\nfine-tuning the MoE model on the GLUE benchmark, SuperGLUE benchmark,\nquestion-answering task, and name entity recognition tasks.\n","authors":["Boan Liu","Liang Ding","Li Shen","Keqin Peng","Yu Cao","Dazhao Cheng","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2310.09762v2.pdf","comment":"ECAI 2024"},{"id":"http://arxiv.org/abs/2408.17280v1","updated":"2024-08-30T13:28:45Z","published":"2024-08-30T13:28:45Z","title":"Flexible and Effective Mixing of Large Language Models into a Mixture of\n Domain Experts","summary":" We present a toolkit for creating low-cost Mixture-of-Domain-Experts (MOE)\nfrom trained models. The toolkit can be used for creating a mixture from models\nor from adapters. We perform extensive tests and offer guidance on defining the\narchitecture of the resulting MOE using the toolkit. 
A public repository is\navailable.\n","authors":["Rhui Dih Lee","Laura Wynter","Raghu Kiran Ganti"],"pdf_url":"https://arxiv.org/pdf/2408.17280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12614v4","updated":"2024-08-30T12:40:04Z","published":"2024-06-18T13:43:22Z","title":"EUvsDisinfo: A Dataset for Multilingual Detection of Pro-Kremlin\n Disinformation in News Articles","summary":" This work introduces EUvsDisinfo, a multilingual dataset of disinformation\narticles originating from pro-Kremlin outlets, along with trustworthy articles\nfrom credible / less biased sources. It is sourced directly from the debunk\narticles written by experts leading the EUvsDisinfo project. Our dataset is the\nlargest to-date resource in terms of the overall number of articles and\ndistinct languages. It also provides the largest topical and temporal coverage.\nUsing this dataset, we investigate the dissemination of pro-Kremlin\ndisinformation across different languages, uncovering language-specific\npatterns targeting certain disinformation topics. We further analyse the\nevolution of topic distribution over an eight-year period, noting a significant\nsurge in disinformation content before the full-scale invasion of Ukraine in\n2022. Lastly, we demonstrate the dataset's applicability in training models to\neffectively distinguish between disinformation and trustworthy content in\nmultilingual settings.\n","authors":["João A. Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2406.12614v4.pdf","comment":"Published at CIKM 2024"},{"id":"http://arxiv.org/abs/2407.04295v2","updated":"2024-08-30T11:57:47Z","published":"2024-07-05T06:57:30Z","title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey","summary":" Large Language Models (LLMs) have performed exceptionally in various\ntext-generative tasks, including question answering, translation, code\ncompletion, etc. However, the over-assistance of LLMs has raised the challenge\nof \"jailbreaking\", which induces the model to generate malicious responses\nagainst the usage policy and society by designing adversarial prompts. With the\nemergence of jailbreak attack methods exploiting different vulnerabilities in\nLLMs, the corresponding safety alignment measures are also evolving. In this\npaper, we propose a comprehensive and detailed taxonomy of jailbreak attack and\ndefense methods. For instance, the attack methods are divided into black-box\nand white-box attacks based on the transparency of the target model. Meanwhile,\nwe classify defense methods into prompt-level and model-level defenses.\nAdditionally, we further subdivide these attack and defense methods into\ndistinct sub-classes and present a coherent diagram illustrating their\nrelationships. We also conduct an investigation into the current evaluation\nmethods and compare them from different perspectives. Our findings aim to\ninspire future research and practical implementations in safeguarding LLMs\nagainst adversarial attacks. 
Above all, although jailbreak remains a\nsignificant concern within the community, we believe that our work enhances the\nunderstanding of this domain and provides a foundation for developing more\nsecure LLMs.\n","authors":["Sibo Yi","Yule Liu","Zhen Sun","Tianshuo Cong","Xinlei He","Jiaxing Song","Ke Xu","Qi Li"],"pdf_url":"https://arxiv.org/pdf/2407.04295v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00023v2","updated":"2024-08-30T11:32:48Z","published":"2024-05-24T02:50:44Z","title":"Expert-Token Resonance: Redefining MoE Routing through Affinity-Driven\n Active Selection","summary":" Mixture-of-Experts (MoE) architectures have emerged as a paradigm-shifting\napproach for large language models (LLMs), offering unprecedented computational\nefficiency. However, these architectures grapple with challenges of token\ndistribution imbalance and expert homogenization, impeding optimal semantic\ngeneralization. We introduce a novel framework that redefines MoE routing\nthrough affinity-driven active selection. The innovations for the framework\nencompass: (1) A rigorous formulation of expert-token affinity metrics. (2) An\nadaptive bidirectional selection mechanism leveraging resonance between experts\nand tokens. (3) Theoretical derivation and experimental evidence of reduced\nexpert capacity bounds under dynamic token distribution evolution. It is also\nintegrated with orthogonal feature extraction module and an optimized loss\nfunction for expert localization. Our theoretical analysis demonstrates that\nthis approach mitigates expert homogenization while enabling substantial\ncapacity boundary reduction. Experimental validation corroborates these\nfindings: it achieves a 40% reduction in token processed by each expert without\ncompromising model convergence or efficacy. When coupled with communication\noptimizations, the training efficiency improvements of 5.4% to 46.6% can be\nobserved. After supervised fine-tuning, it exhibits performance gains of 9.7%\nto 14.1% across GDAD, C-Eval, and TeleQnA benchmarks.\n","authors":["Jing Li","Zhijie Sun","Dachao Lin","Xuan He","Yi Lin","Binfan Zheng","Li Zeng","Rongqian Zhao","Xin Chen"],"pdf_url":"https://arxiv.org/pdf/2406.00023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05200v2","updated":"2024-08-30T11:14:17Z","published":"2024-08-09T17:44:45Z","title":"TaSL: Task Skill Localization and Consolidation for Language Model\n Continual Learning","summary":" Language model continual learning (CL) has recently attracted significant\ninterest for its ability to adapt large language models (LLMs) to dynamic\nreal-world scenarios without retraining. A major challenge in this domain is\ncatastrophic forgetting, where models lose previously acquired knowledge upon\nlearning new tasks. Existing approaches commonly utilize multiple\nparameter-efficient fine-tuning (PEFT) blocks to acquire task-specific\nknowledge, yet these methods are inefficient and fail to leverage potential\nknowledge transfer across tasks. In this paper, we introduce a novel CL\nframework for language models, named Task Skill Localization and Consolidation\n(TaSL), which boosts knowledge transfer without depending on memory replay.\nTaSL initially segregates the model into 'skill units' based on parameter\ndependencies, allowing for more precise control. Subsequently, it employs a\nnovel group-wise skill localization technique to ascertain the importance\ndistribution of skill units for a new task. 
By comparing this importance\ndistribution with those from previous tasks, we implement a fine-grained skill\nconsolidation strategy that retains task-specific knowledge, thereby preventing\nforgetting, and updates task-shared knowledge, which facilitates bi-directional\nknowledge transfer. As a result, TaSL achieves an optimal balance between\nretaining prior knowledge and excelling in new tasks. TaSL also demonstrates\nstrong generalizability, making it suitable for various base models and\nadaptable to PEFT methods like LoRA. Furthermore, it offers notable\nextensibility, supporting enhancements through integration with memory replay\ntechniques. Comprehensive experiments conducted on two CL benchmarks, involving\nmodels ranging from 220M to 7B parameters, affirm the effectiveness of TaSL and\nits variants across different settings.\n","authors":["Yujie Feng","Xu Chu","Yongxin Xu","Zexin Lu","Bo Liu","Philip S. Yu","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2408.05200v2.pdf","comment":"Extension of ACL 2024 paper titled: Continual Dialog State Tracking\n via Task Skill Localization and Consolidation"},{"id":"http://arxiv.org/abs/2408.17181v1","updated":"2024-08-30T10:28:49Z","published":"2024-08-30T10:28:49Z","title":"Improving Extraction of Clinical Event Contextual Properties from\n Electronic Health Records: A Comparative Study","summary":" Electronic Health Records are large repositories of valuable clinical data,\nwith a significant portion stored in unstructured text format. This textual\ndata includes clinical events (e.g., disorders, symptoms, findings, medications\nand procedures) in context that if extracted accurately at scale can unlock\nvaluable downstream applications such as disease prediction. Using an existing\nNamed Entity Recognition and Linking methodology, MedCAT, these identified\nconcepts need to be further classified (contextualised) for their relevance to\nthe patient, and their temporal and negated status for example, to be useful\ndownstream. This study performs a comparative analysis of various natural\nlanguage models for medical text classification. Extensive experimentation\nreveals the effectiveness of transformer-based language models, particularly\nBERT. When combined with class imbalance mitigation techniques, BERT\noutperforms Bi-LSTM models by up to 28% and the baseline BERT model by up to\n16% for recall of the minority classes. The method has been implemented as part\nof CogStack/MedCAT framework and made available to the community for further\nresearch.\n","authors":["Shubham Agarwal","Thomas Searle","Mart Ratas","Anthony Shek","James Teo","Richard Dobson"],"pdf_url":"https://arxiv.org/pdf/2408.17181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17175v1","updated":"2024-08-30T10:24:07Z","published":"2024-08-30T10:24:07Z","title":"Codec Does Matter: Exploring the Semantic Shortcoming of Codec for Audio\n Language Model","summary":" Recent advancements in audio generation have been significantly propelled by\nthe capabilities of Large Language Models (LLMs). The existing research on\naudio LLM has primarily focused on enhancing the architecture and scale of\naudio language models, as well as leveraging larger datasets, and generally,\nacoustic codecs, such as EnCodec, are used for audio tokenization. However,\nthese codecs were originally designed for audio compression, which may lead to\nsuboptimal performance in the context of audio LLM. 
Our research aims to\naddress the shortcomings of current audio LLM codecs, particularly their\nchallenges in maintaining semantic integrity in generated audio. For instance,\nexisting methods like VALL-E, which condition acoustic token generation on text\ntranscriptions, often suffer from content inaccuracies and elevated word error\nrates (WER) due to semantic misinterpretations of acoustic tokens, resulting in\nword skipping and errors. To overcome these issues, we propose a\nstraightforward yet effective approach called X-Codec. X-Codec incorporates\nsemantic features from a pre-trained semantic encoder before the Residual\nVector Quantization (RVQ) stage and introduces a semantic reconstruction loss\nafter RVQ. By enhancing the semantic ability of the codec, X-Codec\nsignificantly reduces WER in speech synthesis tasks and extends these benefits\nto non-speech applications, including music and sound generation. Our\nexperiments in text-to-speech, music continuation, and text-to-sound tasks\ndemonstrate that integrating semantic information substantially improves the\noverall performance of language models in audio generation. Our code and demo\nare available (Demo: https://x-codec-audio.github.io Code:\nhttps://github.com/zhenye234/xcodec)\n","authors":["Zhen Ye","Peiwen Sun","Jiahe Lei","Hongzhan Lin","Xu Tan","Zheqi Dai","Qiuqiang Kong","Jianyi Chen","Jiahao Pan","Qifeng Liu","Yike Guo","Wei Xue"],"pdf_url":"https://arxiv.org/pdf/2408.17175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03387v2","updated":"2024-08-30T09:13:50Z","published":"2024-07-03T08:36:13Z","title":"ConCodeEval: Evaluating Large Language Models for Code Constraints in\n Domain-Specific Languages","summary":" Recent work shows Large Language Models (LLMs) struggle to understand natural\nlanguage constraints for various text generation tasks in zero- and few-shot\nsettings. While, in the code domain, there is wide usage of constraints in code\nformat to maintain the integrity of code written in Domain-Specific Languages\n(DSLs) like JSON and YAML which are widely used for system-level programming\ntasks in enterprises. Given that LLMs are increasingly used for system-level\ncode tasks, evaluating if they can comprehend these code constraints is\ncrucial. However, no work has been done to evaluate their controllability over\ncode constraints. Hence, we introduce ConCodeEval, a first-of-its-kind\nbenchmark having two novel tasks for code constraints across five\nrepresentations. Our findings suggest that language models struggle with code\nconstraints. Code languages that perform excellently for normal code tasks do\nnot perform well when the same languages represent fine-grained constraints.\n","authors":["Mehant Kammakomati","Sameer Pimparkhede","Srikanth Tamilselvam","Prince Kumar","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2407.03387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17072v1","updated":"2024-08-30T07:57:30Z","published":"2024-08-30T07:57:30Z","title":"MaFeRw: Query Rewriting with Multi-Aspect Feedbacks for\n Retrieval-Augmented Large Language Models","summary":" In a real-world RAG system, the current query often involves spoken ellipses\nand ambiguous references from dialogue contexts, necessitating query rewriting\nto better describe user's information needs. However, traditional context-based\nrewriting has minimal enhancement on downstream generation tasks due to the\nlengthy process from query rewriting to response generation. 
Some researchers\ntry to utilize reinforcement learning with generation feedback to assist the\nrewriter, but these sparse rewards provide little guidance in most cases,\nleading to unstable training and generation results. We find that user's needs\nare also reflected in the gold document, retrieved documents and ground truth.\nTherefore, by feeding back these multi-aspect dense rewards to query rewriting,\nmore stable and satisfactory responses can be achieved. In this paper, we\npropose a novel query rewriting method MaFeRw, which improves RAG performance\nby integrating multi-aspect feedback from both the retrieval process and\ngenerated results. Specifically, we first use manual data to train a T5 model\nfor the rewriter initialization. Next, we design three metrics as reinforcement\nlearning feedback: the similarity between the rewritten query and the gold\ndocument, the ranking metrics, and ROUGE between the generation and the ground\ntruth. Inspired by RLAIF, we train three kinds of reward models for the above\nmetrics to achieve more efficient training. Finally, we combine the scores of\nthese reward models as feedback, and use PPO algorithm to explore the optimal\nquery rewriting strategy. Experimental results on two conversational RAG\ndatasets demonstrate that MaFeRw achieves superior generation metrics and more\nstable training compared to baselines.\n","authors":["Yujing Wang","Hainan Zhang","Liang Pang","Liang Pang","Hongwei Zheng","Zhiming Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.17072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17070v1","updated":"2024-08-30T07:54:50Z","published":"2024-08-30T07:54:50Z","title":"Novel-WD: Exploring acquisition of Novel World Knowledge in LLMs Using\n Prefix-Tuning","summary":" Teaching new information to pre-trained large language models (PLM) is a\ncrucial but challenging task. Model adaptation techniques, such as fine-tuning\nand parameter-efficient training have been shown to store new facts at a slow\nrate; continual learning is an option but is costly and prone to catastrophic\nforgetting. This work studies and quantifies how PLM may learn and remember new\nworld knowledge facts that do not occur in their pre-training corpus, which\nonly contains world knowledge up to a certain date. To that purpose, we first\npropose Novel-WD, a new dataset consisting of sentences containing novel facts\nextracted from recent Wikidata updates, along with two evaluation tasks in the\nform of causal language modeling and multiple choice questions (MCQ). We make\nthis dataset freely available to the community, and release a procedure to\nlater build new versions of similar datasets with up-to-date information. We\nalso explore the use of prefix-tuning for novel information learning, and\nanalyze how much information can be stored within a given prefix. We show that\na single fact can reliably be encoded within a single prefix, and that the\nprefix capacity increases with its length and with the base model size.\n","authors":["Maxime Méloux","Christophe Cerisara"],"pdf_url":"https://arxiv.org/pdf/2408.17070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13344v2","updated":"2024-08-30T07:43:00Z","published":"2024-05-22T05:03:39Z","title":"Contextualized Automatic Speech Recognition with Dynamic Vocabulary","summary":" Deep biasing (DB) enhances the performance of end-to-end automatic speech\nrecognition (E2E-ASR) models for rare words or contextual phrases using a bias\nlist. 
However, most existing methods treat bias phrases as sequences of\nsubwords in a predefined static vocabulary. This naive sequence decomposition\nproduces unnatural token patterns, significantly lowering their occurrence\nprobability. More advanced techniques address this problem by expanding the\nvocabulary with additional modules, including the external language model\nshallow fusion or rescoring. However, they result in increasing the workload\ndue to the additional modules. This paper proposes a dynamic vocabulary where\nbias tokens can be added during inference. Each entry in a bias list is\nrepresented as a single token, unlike a sequence of existing subword tokens.\nThis approach eliminates the need to learn subword dependencies within the bias\nphrases. This method is easily applied to various architectures because it only\nexpands the embedding and output layers in common E2E-ASR architectures.\nExperimental results demonstrate that the proposed method improves the bias\nphrase WER on English and Japanese datasets by 3.1 -- 4.9 points compared with\nthe conventional DB method.\n","authors":["Yui Sudo","Yosuke Fukumoto","Muhammad Shakeel","Yifan Peng","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2405.13344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12942v2","updated":"2024-08-30T07:30:13Z","published":"2024-08-23T09:46:15Z","title":"Causal-Guided Active Learning for Debiasing Large Language Models","summary":" Although achieving promising performance, recent analyses show that current\ngenerative large language models (LLMs) may still capture dataset biases and\nutilize them for generation, leading to poor generalizability and harmfulness\nof LLMs. However, due to the diversity of dataset biases and the\nover-optimization problem, previous prior-knowledge-based debiasing methods and\nfine-tuning-based debiasing methods may not be suitable for current LLMs. To\naddress this issue, we explore combining active learning with the causal\nmechanisms and propose a causal-guided active learning (CAL) framework, which\nutilizes LLMs themselves to automatically and autonomously identify informative\nbiased samples and induce the bias patterns. Then a cost-effective and\nefficient in-context learning based method is employed to prevent LLMs from\nutilizing dataset biases during generation. Experimental results show that CAL\ncan effectively recognize typical biased instances and induce various bias\npatterns for debiasing LLMs.\n","authors":["Li Du","Zhouhao Sun","Xiao Ding","Yixuan Ma","Yang Zhao","Kaitao Qiu","Ting Liu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2408.12942v2.pdf","comment":"Accepted as ACL 2024 main conference & Rewarded as Outstanding Paper"},{"id":"http://arxiv.org/abs/2407.21646v2","updated":"2024-08-30T06:50:51Z","published":"2024-07-31T14:48:27Z","title":"Towards Achieving Human Parity on End-to-end Simultaneous Speech\n Translation via LLM Agent","summary":" In this paper, we present Cross Language Agent -- Simultaneous\nInterpretation, CLASI, a high-quality and human-like Simultaneous Speech\nTranslation (SiST) System. Inspired by professional human interpreters, we\nutilize a novel data-driven read-write strategy to balance the translation\nquality and latency. To address the challenge of translating in-domain\nterminologies, CLASI employs a multi-modal retrieving module to obtain relevant\ninformation to augment the translation.
Supported by LLMs, our approach can\ngenerate error-tolerated translation by considering the input audio, historical\ncontext, and retrieved information. Experimental results show that our system\noutperforms other systems by significant margins. Aligned with professional\nhuman interpreters, we evaluate CLASI with a better human evaluation metric,\nvalid information proportion (VIP), which measures the amount of information\nthat can be successfully conveyed to the listeners. In the real-world\nscenarios, where the speeches are often disfluent, informal, and unclear, CLASI\nachieves VIP of 81.3% and 78.0% for Chinese-to-English and English-to-Chinese\ntranslation directions, respectively. In contrast, state-of-the-art commercial\nor open-source systems only achieve 35.4% and 41.6%. On the extremely hard\ndataset, where other systems achieve under 13% VIP, CLASI can still achieve 70%\nVIP.\n","authors":["Shanbo Cheng","Zhichao Huang","Tom Ko","Hang Li","Ningxin Peng","Lu Xu","Qini Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.21646v2.pdf","comment":"Authors are listed in alphabetical order by last name. Demonstrations\n and human-annotated test sets are available at\n https://byteresearchcla.github.io/clasi"},{"id":"http://arxiv.org/abs/2408.15545v2","updated":"2024-08-30T06:42:36Z","published":"2024-08-28T05:41:52Z","title":"SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding","summary":" Scientific literature understanding is crucial for extracting targeted\ninformation and garnering insights, thereby significantly advancing scientific\ndiscovery. Despite the remarkable success of Large Language Models (LLMs), they\nface challenges in scientific literature understanding, primarily due to (1) a\nlack of scientific knowledge and (2) unfamiliarity with specialized scientific\ntasks.\n To develop an LLM specialized in scientific literature understanding, we\npropose a hybrid strategy that integrates continual pre-training (CPT) and\nsupervised fine-tuning (SFT), to simultaneously infuse scientific domain\nknowledge and enhance instruction-following capabilities for domain-specific\ntasks. In this process, we identify two key challenges: (1) constructing\nhigh-quality CPT corpora, and (2) generating diverse SFT instructions. We\naddress these challenges through a meticulous pipeline, including PDF text\nextraction, parsing content error correction, quality filtering, and synthetic\ninstruction creation. Applying this strategy, we present a suite of LLMs:\nSciLitLLM, specialized in scientific literature understanding. These models\ndemonstrate promising performance on scientific literature understanding\nbenchmarks.\n Our contributions are threefold: (1) We present an effective framework that\nintegrates CPT and SFT to adapt LLMs to scientific literature understanding,\nwhich can also be easily adapted to other domains. (2) We propose an LLM-based\nsynthesis method to generate diverse and high-quality scientific instructions,\nresulting in a new instruction set -- SciLitIns -- for supervised fine-tuning\nin less-represented scientific domains.
(3) SciLitLLM achieves promising\nperformance improvements on scientific literature understanding benchmarks.\n","authors":["Sihang Li","Jin Huang","Jiaxi Zhuang","Yaorui Shi","Xiaochen Cai","Mingjun Xu","Xiang Wang","Linfeng Zhang","Guolin Ke","Hengxing Cai"],"pdf_url":"https://arxiv.org/pdf/2408.15545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02959v2","updated":"2024-08-30T05:54:15Z","published":"2024-03-05T13:30:02Z","title":"AgentsCourt: Building Judicial Decision-Making Agents with Court Debate\n Simulation and Legal Knowledge Augmentation","summary":" With the development of deep learning, natural language processing technology\nhas effectively improved the efficiency of various aspects of the traditional\njudicial industry. However, most current efforts focus on tasks within\nindividual judicial stages, making it difficult to handle complex tasks that\nspan multiple stages. As autonomous agents powered by large language models\nbecome increasingly smart and able to make complex decisions in\nreal-world settings, they offer new insights for judicial intelligence. In this\npaper, (1) we propose a novel multi-agent framework, AgentsCourt, for judicial\ndecision-making. Our framework follows the classic court trial process,\nconsisting of court debate simulation, legal resources retrieval and\ndecision-making refinement to simulate the decision-making of a judge. (2) we\nintroduce SimuCourt, a judicial benchmark that encompasses 420 Chinese judgment\ndocuments, spanning the three most common types of judicial cases. Furthermore,\nto support this task, we construct a large-scale legal knowledge base,\nLegal-KB, with multi-resource legal knowledge. (3) Extensive experiments show\nthat our framework outperforms the existing advanced methods in various\naspects, especially in generating legal articles, where our model achieves\nsignificant improvements of 8.6% and 9.1% F1 score in the first and second\ninstance settings, respectively.\n","authors":["Zhitao He","Pengfei Cao","Chenhao Wang","Zhuoran Jin","Yubo Chen","Jiexin Xu","Huaijun Li","Xiaojian Jiang","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.02959v2.pdf","comment":"This paper was first submitted to ACL ARR 2024 April (Under review)"},{"id":"http://arxiv.org/abs/2408.17026v1","updated":"2024-08-30T05:50:15Z","published":"2024-08-30T05:50:15Z","title":"From Text to Emotion: Unveiling the Emotion Annotation Capabilities of\n LLMs","summary":" Training emotion recognition models has relied heavily on human annotated\ndata, which present diversity, quality, and cost challenges. In this paper, we\nexplore the potential of Large Language Models (LLMs), specifically GPT-4, in\nautomating or assisting emotion annotation. We compare GPT-4 with supervised\nmodels and/or humans in three aspects: agreement with human annotations,\nalignment with human perception, and impact on model training. We find that\ncommon metrics that use aggregated human annotations as ground truth can\nunderestimate the performance of GPT-4, and our human evaluation experiment\nreveals a consistent preference for GPT-4 annotations over humans across\nmultiple datasets and evaluators. Further, we investigate the impact of using\nGPT-4 as an annotation filtering process to improve model training.
Together,\nour findings highlight the great potential of LLMs in emotion annotation tasks\nand underscore the need for refined evaluation methodologies.\n","authors":["Minxue Niu","Mimansa Jaiswal","Emily Mower Provost"],"pdf_url":"https://arxiv.org/pdf/2408.17026v1.pdf","comment":"to be published in Interspeech 2024"},{"id":"http://arxiv.org/abs/2408.17024v1","updated":"2024-08-30T05:42:31Z","published":"2024-08-30T05:42:31Z","title":"InkubaLM: A small language model for low-resource African languages","summary":" High-resource language models often fall short in the African context, where\nthere is a critical need for models that are efficient, accessible, and locally\nrelevant, even amidst significant computing and data constraints. This paper\nintroduces InkubaLM, a small language model with 0.4 billion parameters, which\nachieves performance comparable to models with significantly larger parameter\ncounts and more extensive training data on tasks such as machine translation,\nquestion-answering, AfriMMLU, and the AfriXnli task. Notably, InkubaLM\noutperforms many larger models in sentiment analysis and demonstrates\nremarkable consistency across multiple languages. This work represents a\npivotal advancement in challenging the conventional paradigm that effective\nlanguage models must rely on substantial resources. Our model and datasets are\npublicly available \\footnote{\\url{https://huggingface.co/lelapa}} to encourage\nresearch and development on low-resource languages.\n","authors":["Atnafu Lambebo Tonja","Bonaventure F. P. Dossou","Jessica Ojo","Jenalea Rajab","Fadel Thior","Eric Peter Wairagala","Aremu Anuoluwapo","Pelonomi Moiloa","Jade Abbott","Vukosi Marivate","Benjamin Rosman"],"pdf_url":"https://arxiv.org/pdf/2408.17024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17017v1","updated":"2024-08-30T05:14:59Z","published":"2024-08-30T05:14:59Z","title":"Dynamic Self-Consistency: Leveraging Reasoning Paths for Efficient LLM\n Sampling","summary":" Self-Consistency (SC) is a widely used method to mitigate hallucinations in\nLarge Language Models (LLMs) by sampling the LLM multiple times and outputting\nthe most frequent solution. Despite its benefits, SC results in significant\ncomputational costs proportional to the number of samples generated. Previous\nearly-stopping approaches, such as Early Stopping Self Consistency and Adaptive\nConsistency, have aimed to reduce these costs by considering output\nconsistency, but they do not analyze the quality of the reasoning paths (RPs)\nthemselves. To address this issue, we propose Reasoning-Aware Self-Consistency\n(RASC), an innovative early-stopping framework that dynamically adjusts the\nnumber of sample generations by considering both the output answer and the RPs\nfrom Chain of Thought (CoT) prompting. RASC assigns confidence scores\nsequentially to the generated samples, stops when certain criteria are met, and\nthen employs weighted majority voting to optimize sample usage and enhance\nanswer reliability. We comprehensively test RASC with multiple LLMs across\nvaried QA datasets. RASC outperformed existing methods and significantly\nreduces sample usage by an average of 80% while maintaining or improving\naccuracy up to 5% compared to the original SC\n","authors":["Guangya Wan","Yuqi Wu","Jie Chen","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2408.17017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10537v3","updated":"2024-08-30T04:51:28Z","published":"2022-12-20T18:46:28Z","title":"Does CLIP Bind Concepts? 
Probing Compositionality in Large Image Models","summary":" Large-scale neural network models combining text and images have made\nincredible progress in recent years. However, it remains an open question to\nwhat extent such models encode compositional representations of the concepts\nover which they operate, such as correctly identifying \"red cube\" by reasoning\nover the constituents \"red\" and \"cube\". In this work, we focus on the ability\nof a large pretrained vision and language model (CLIP) to encode compositional\nconcepts and to bind variables in a structure-sensitive way (e.g.,\ndifferentiating \"cube behind sphere\" from \"sphere behind cube\"). To inspect the\nperformance of CLIP, we compare several architectures from research on\ncompositional distributional semantics models (CDSMs), a line of research that\nattempts to implement traditional compositional linguistic structures within\nembedding spaces. We benchmark them on three synthetic datasets -\nsingle-object, two-object, and relational - designed to test concept binding.\nWe find that CLIP can compose concepts in a single-object setting, but in\nsituations where concept binding is needed, performance drops dramatically. At\nthe same time, CDSMs also perform poorly, with best performance at chance\nlevel.\n","authors":["Martha Lewis","Nihal V. Nayak","Peilin Yu","Qinan Yu","Jack Merullo","Stephen H. Bach","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2212.10537v3.pdf","comment":"Lewis and Nayak contributed equally"},{"id":"http://arxiv.org/abs/2305.09548v3","updated":"2024-08-30T04:18:01Z","published":"2023-05-16T15:45:59Z","title":"Measuring Dimensions of Self-Presentation in Twitter Bios and their\n Links to Misinformation Sharing","summary":" Social media platforms provide users with a profile description field,\ncommonly known as a ``bio,\" where they can present themselves to the world. A\ngrowing literature shows that text in these bios can improve our understanding\nof online self-presentation and behavior, but existing work relies exclusively\non keyword-based approaches to do so. We here propose and evaluate a suite of\nsimple, effective, and theoretically motivated approaches to embed bios in\nspaces that capture salient dimensions of social meaning, such as age and\npartisanship. We evaluate our methods on four tasks, showing that the\nstrongest one out-performs several practical baselines. We then show the\nutility of our method in helping understand associations between\nself-presentation and the sharing of URLs from low-quality news sites on\nTwitter, with a particular focus on exploring the interactions between age\nand partisanship, and exploring the effects of self-presentations of\nreligiosity. Our work provides new tools to help computational social\nscientists make use of information in bios, and provides new insights into how\nmisinformation sharing may be perceived on Twitter.\n","authors":["Navid Madani","Rabiraj Bandyopadhyay","Briony Swire-Thompson","Michael Miller Yoder","Kenneth Joseph"],"pdf_url":"https://arxiv.org/pdf/2305.09548v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11999v5","updated":"2024-08-30T03:39:57Z","published":"2024-04-18T08:49:38Z","title":"Token-level Direct Preference Optimization","summary":" Fine-tuning pre-trained Large Language Models (LLMs) is essential to align\nthem with human values and intentions.
This process often utilizes methods like\npairwise comparisons and KL divergence against a reference LLM, focusing on the\nevaluation of full answers generated by the models. However, the generation of\nthese responses occurs in a token level, following a sequential,\nauto-regressive fashion. In this paper, we introduce Token-level Direct\nPreference Optimization (TDPO), a novel approach to align LLMs with human\npreferences by optimizing policy at the token level. Unlike previous methods,\nwhich face challenges in divergence efficiency, TDPO incorporates forward KL\ndivergence constraints for each token, improving alignment and diversity.\nUtilizing the Bradley-Terry model for a token-based reward system, TDPO\nenhances the regulation of KL divergence, while preserving simplicity without\nthe need for explicit reward modeling. Experimental results across various text\ntasks demonstrate TDPO's superior performance in balancing alignment with\ngeneration diversity. Notably, fine-tuning with TDPO strikes a better balance\nthan DPO in the controlled sentiment generation and single-turn dialogue\ndatasets, and significantly improves the quality of generated responses\ncompared to both DPO and PPO-based RLHF methods. Our code is open-sourced at\nhttps://github.com/Vance0124/Token-level-Direct-Preference-Optimization.\n","authors":["Yongcheng Zeng","Guoqing Liu","Weiyu Ma","Ning Yang","Haifeng Zhang","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11999v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16991v1","updated":"2024-08-30T03:38:37Z","published":"2024-08-30T03:38:37Z","title":"Tool-Assisted Agent on SQL Inspection and Refinement in Real-World\n Scenarios","summary":" Recent Text-to-SQL methods leverage large language models (LLMs) by\nincorporating feedback from the database management system. While these methods\neffectively address execution errors in SQL queries, they struggle with\ndatabase mismatches -- errors that do not trigger execution exceptions.\nDatabase mismatches include issues such as condition mismatches and stricter\nconstraint mismatches, both of which are more prevalent in real-world\nscenarios. To address these challenges, we propose a tool-assisted agent\nframework for SQL inspection and refinement, equipping the LLM-based agent with\ntwo specialized tools: a retriever and a detector, designed to diagnose and\ncorrect SQL queries with database mismatches. These tools enhance the\ncapability of LLMs to handle real-world queries more effectively. We also\nintroduce Spider-Mismatch, a new dataset specifically constructed to reflect\nthe condition mismatch problems encountered in real-world scenarios.\nExperimental results demonstrate that our method achieves the highest\nperformance on the averaged results of the Spider and Spider-Realistic datasets\nin few-shot settings, and it significantly outperforms baseline methods on the\nmore realistic dataset, Spider-Mismatch.\n","authors":["Zhongyuan Wang","Richong Zhang","Zhijie Nie","Jaein Kim"],"pdf_url":"https://arxiv.org/pdf/2408.16991v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2408.16725v2","updated":"2024-08-30T02:53:48Z","published":"2024-08-29T17:18:53Z","title":"Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming","summary":" Recent advances in language models have achieved significant progress.\nGPT-4o, as a new milestone, has enabled real-time conversations with humans,\ndemonstrating near-human natural fluency. 
Such human-computer interaction\nnecessitates models with the capability to perform reasoning directly with the\naudio modality and generate output in streaming. However, this remains beyond\nthe reach of current academic models, as they typically depend on extra TTS\nsystems for speech synthesis, resulting in undesirable latency. This paper\nintroduces the Mini-Omni, an audio-based end-to-end conversational model,\ncapable of real-time speech interaction. To achieve this capability, we propose\na text-instructed speech generation method, along with batch-parallel\nstrategies during inference to further boost the performance. Our method also\nhelps to retain the original model's language capabilities with minimal\ndegradation, enabling other works to establish real-time interaction\ncapabilities. We call this training method \"Any Model Can Talk\". We also\nintroduce the VoiceAssistant-400K dataset to fine-tune models optimized for\nspeech output. To our best knowledge, Mini-Omni is the first fully end-to-end,\nopen-source model for real-time speech interaction, offering valuable potential\nfor future research.\n","authors":["Zhifei Xie","Changqiao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16725v2.pdf","comment":"Technical report, work in progress. Demo and code:\n https://github.com/gpt-omni/mini-omni"},{"id":"http://arxiv.org/abs/2403.04261v2","updated":"2024-08-30T02:47:43Z","published":"2024-03-07T06:52:51Z","title":"Advancing Chinese biomedical text mining with community challenges","summary":" Objective: This study aims to review the recent advances in community\nchallenges for biomedical text mining in China. Methods: We collected\ninformation of evaluation tasks released in community challenges of biomedical\ntext mining, including task description, dataset description, data source, task\ntype and related links. A systematic summary and comparative analysis were\nconducted on various biomedical natural language processing tasks, such as\nnamed entity recognition, entity normalization, attribute extraction, relation\nextraction, event extraction, text classification, text similarity, knowledge\ngraph construction, question answering, text generation, and large language\nmodel evaluation. Results: We identified 39 evaluation tasks from 6 community\nchallenges that spanned from 2017 to 2023. Our analysis revealed the diverse\nrange of evaluation task types and data sources in biomedical text mining. We\nexplored the potential clinical applications of these community challenge tasks\nfrom a translational biomedical informatics perspective. We compared with their\nEnglish counterparts, and discussed the contributions, limitations, lessons and\nguidelines of these community challenges, while highlighting future directions\nin the era of large language models. Conclusion: Community challenge evaluation\ncompetitions have played a crucial role in promoting technology innovation and\nfostering interdisciplinary collaboration in the field of biomedical text\nmining. 
These challenges provide valuable platforms for researchers to develop\nstate-of-the-art solutions.\n","authors":["Hui Zong","Rongrong Wu","Jiaxue Cha","Weizhe Feng","Erman Wu","Jiakun Li","Aibin Shao","Liang Tao","Zuofeng Li","Buzhou Tang","Bairong Shen"],"pdf_url":"https://arxiv.org/pdf/2403.04261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16967v1","updated":"2024-08-30T02:01:56Z","published":"2024-08-30T02:01:56Z","title":"MemLong: Memory-Augmented Retrieval for Long Text Modeling","summary":" Recent advancements in Large Language Models (LLMs) have yielded remarkable\nsuccess across diverse fields. However, handling long contexts remains a\nsignificant challenge for LLMs due to the quadratic time and space complexity\nof attention mechanisms and the growing memory consumption of the key-value\ncache during generation. This work introduces MemLong: Memory-Augmented\nRetrieval for Long Text Generation, a method designed to enhance the\ncapabilities of long-context language modeling by utilizing an external\nretriever for historical information retrieval. MemLong combines a\nnon-differentiable ``ret-mem'' module with a partially trainable decoder-only\nlanguage model and introduces a fine-grained, controllable retrieval attention\nmechanism that leverages semantic-level relevant chunks. Comprehensive\nevaluations on multiple long-context language modeling benchmarks demonstrate\nthat MemLong consistently outperforms other state-of-the-art LLMs. More\nimportantly, MemLong can extend the context length on a single 3090 GPU from 4k\nup to 80k. Our code is available at https://github.com/Bui1dMySea/MemLong\n","authors":["Weijie Liu","Zecheng Tang","Juntao Li","Kehai Chen","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.16967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16966v1","updated":"2024-08-30T01:56:57Z","published":"2024-08-30T01:56:57Z","title":"UserSumBench: A Benchmark Framework for Evaluating User Summarization\n Approaches","summary":" Large language models (LLMs) have shown remarkable capabilities in generating\nuser summaries from a long list of raw user activity data. These summaries\ncapture essential user information such as preferences and interests, and\ntherefore are invaluable for LLM-based personalization applications, such as\nexplainable recommender systems. However, the development of new summarization\ntechniques is hindered by the lack of ground-truth labels, the inherent\nsubjectivity of user summaries, and human evaluation which is often costly and\ntime-consuming. To address these challenges, we introduce \\UserSumBench, a\nbenchmark framework designed to facilitate iterative development of LLM-based\nsummarization approaches. This framework offers two key components: (1) A\nreference-free summary quality metric. We show that this metric is effective\nand aligned with human preferences across three diverse datasets (MovieLens,\nYelp and Amazon Review). (2) A novel robust summarization method that leverages\ntime-hierarchical summarizer and self-critique verifier to produce high-quality\nsummaries while eliminating hallucination. 
This method serves as a strong\nbaseline for further innovation in summarization techniques.\n","authors":["Chao Wang","Neo Wu","Lin Ning","Luyang Liu","Jun Xie","Shawn O'Banion","Bradley Green"],"pdf_url":"https://arxiv.org/pdf/2408.16966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07000v2","updated":"2024-08-30T01:19:42Z","published":"2024-07-09T16:13:26Z","title":"Etalon: Holistic Performance Evaluation Framework for LLM Inference\n Systems","summary":" Serving large language models (LLMs) in production can incur substantial\ncosts, which has prompted recent advances in inference system optimizations.\nToday, these systems are evaluated against conventional latency and throughput\nmetrics (eg. TTFT, TBT, Normalised Latency and TPOT). However, these metrics\nfail to fully capture the nuances of LLM inference, leading to an incomplete\nassessment of user-facing performance crucial for real-time applications such\nas chat and translation. In this paper, we first identify the pitfalls of\ncurrent performance metrics in evaluating LLM inference systems. We then\npropose Etalon, a comprehensive performance evaluation framework that includes\nfluidity-index -- a novel metric designed to reflect the intricacies of the LLM\ninference process and its impact on real-time user experience. Finally, we\nevaluate various existing open-source platforms and model-as-a-service\nofferings using Etalon, discussing their strengths and weaknesses. Etalon is\navailable at https://github.com/project-etalon/etalon.\n","authors":["Amey Agrawal","Anmol Agarwal","Nitin Kedia","Jayashree Mohan","Souvik Kundu","Nipun Kwatra","Ramachandran Ramjee","Alexey Tumanov"],"pdf_url":"https://arxiv.org/pdf/2407.07000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09625v3","updated":"2024-08-30T01:07:08Z","published":"2023-12-15T09:08:14Z","title":"Weakly-Supervised 3D Visual Grounding based on Visual Linguistic\n Alignment","summary":" Learning to ground natural language queries to target objects or regions in\n3D point clouds is quite essential for 3D scene understanding. Nevertheless,\nexisting 3D visual grounding approaches require a substantial number of\nbounding box annotations for text queries, which is time-consuming and\nlabor-intensive to obtain. In this paper, we propose 3D-VLA, a weakly\nsupervised approach for 3D visual grounding based on Visual Linguistic\nAlignment. Our 3D-VLA exploits the superior ability of current large-scale\nvision-language models (VLMs) on aligning the semantics between texts and 2D\nimages, as well as the naturally existing correspondences between 2D images and\n3D point clouds, and thus implicitly constructs correspondences between texts\nand 3D point clouds with no need for fine-grained box annotations in the\ntraining procedure. During the inference stage, the learned text-3D\ncorrespondence will help us ground the text queries to the 3D target objects\neven without 2D images. 
To the best of our knowledge, this is the first work to\ninvestigate 3D visual grounding in a weakly supervised manner by involving\nlarge scale vision-language models, and extensive experiments on ReferIt3D and\nScanRefer datasets demonstrate that our 3D-VLA achieves comparable and even\nsuperior results over the fully supervised methods.\n","authors":["Xiaoxu Xu","Yitian Yuan","Qiudan Zhang","Wenhui Wu","Zequn Jie","Lin Ma","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2312.09625v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04691v2","updated":"2024-08-30T23:09:46Z","published":"2024-08-08T13:10:51Z","title":"Improving Relational Database Interactions with Large Language Models:\n Column Descriptions and Their Impact on Text-to-SQL Performance","summary":" Relational databases often suffer from uninformative descriptors of table\ncontents, such as ambiguous columns and hard-to-interpret values, impacting\nboth human users and Text-to-SQL models. This paper explores the use of large\nlanguage models (LLMs) to generate informative column descriptions as a\nsemantic layer for relational databases. Using the BIRD-Bench development set,\nwe created ColSQL, a dataset with gold-standard column descriptions generated\nand refined by LLMs and human annotators. We evaluated several\ninstruction-tuned models, finding that GPT-4o and Command R+ excelled in\ngenerating high-quality descriptions. Additionally, we applied an\nLLM-as-a-judge to evaluate model performance. Although this method does not\nalign well with human evaluations, we included it to explore its potential and\nto identify areas for improvement. More work is needed to improve the\nreliability of automatic evaluations for this task. We also find that detailed\ncolumn descriptions significantly improve Text-to-SQL execution accuracy,\nespecially when columns are uninformative. This study establishes LLMs as\neffective tools for generating detailed metadata, enhancing the usability of\nrelational databases.\n","authors":["Niklas Wretblad","Oskar Holmström","Erik Larsson","Axel Wiksäter","Oscar Söderlund","Hjalmar Öhman","Ture Pontén","Martin Forsberg","Martin Sörme","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2408.04691v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14056v2","updated":"2024-08-30T22:46:55Z","published":"2024-07-19T06:33:10Z","title":"Rasa: Building Expressive Speech Synthesis Systems for Indian Languages\n in Low-resource Settings","summary":" We release Rasa, the first multilingual expressive TTS dataset for any Indian\nlanguage, which contains 10 hours of neutral speech and 1-3 hours of expressive\nspeech for each of the 6 Ekman emotions covering 3 languages: Assamese,\nBengali, & Tamil. Our ablation studies reveal that just 1 hour of neutral and\n30 minutes of expressive data can yield a Fair system as indicated by MUSHRA\nscores. Increasing neutral data to 10 hours, with minimal expressive data,\nsignificantly enhances expressiveness. This offers a practical recipe for\nresource-constrained languages, prioritizing easily obtainable neutral data\nalongside smaller amounts of expressive data. We show the importance of\nsyllabically balanced data and pooling emotions to enhance expressiveness. We\nalso highlight challenges in generating specific emotions, e.g., fear and\nsurprise.\n","authors":["Praveen Srinivasa Varadhan","Ashwin Sankar","Giri Raju","Mitesh M. Khapra"],"pdf_url":"https://arxiv.org/pdf/2407.14056v2.pdf","comment":"Accepted at INTERSPEECH 2024. 
First two authors listed contributed\n equally"},{"id":"http://arxiv.org/abs/2404.14219v4","updated":"2024-08-30T21:17:17Z","published":"2024-04-22T14:32:33Z","title":"Phi-3 Technical Report: A Highly Capable Language Model Locally on Your\n Phone","summary":" We introduce phi-3-mini, a 3.8 billion parameter language model trained on\n3.3 trillion tokens, whose overall performance, as measured by both academic\nbenchmarks and internal testing, rivals that of models such as Mixtral 8x7B and\nGPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38 on MT-bench), despite\nbeing small enough to be deployed on a phone. Our training dataset is a\nscaled-up version of the one used for phi-2, composed of heavily filtered\npublicly available web data and synthetic data. The model is also further\naligned for robustness, safety, and chat format. We also provide\nparameter-scaling results with a 7B, 14B models trained for 4.8T tokens, called\nphi-3-small, phi-3-medium, both significantly more capable than phi-3-mini\n(e.g., respectively 75%, 78% on MMLU, and 8.7, 8.9 on MT-bench). To enhance\nmultilingual, multimodal, and long-context capabilities, we introduce three\nmodels in the phi-3.5 series: phi-3.5-mini, phi-3.5-MoE, and phi-3.5-Vision.\nThe phi-3.5-MoE, a 16 x 3.8B MoE model with 6.6 billion active parameters,\nachieves superior performance in language reasoning, math, and code tasks\ncompared to other open-source models of similar scale, such as Llama 3.1 and\nthe Mixtral series, and on par with Gemini-1.5-Flash and GPT-4o-mini.\nMeanwhile, phi-3.5-Vision, a 4.2 billion parameter model derived from\nphi-3.5-mini, excels in reasoning tasks and is adept at handling both\nsingle-image and text prompts, as well as multi-image and text prompts.\n","authors":["Marah Abdin","Jyoti Aneja","Hany Awadalla","Ahmed Awadallah","Ammar Ahmad Awan","Nguyen Bach","Amit Bahree","Arash Bakhtiari","Jianmin Bao","Harkirat Behl","Alon Benhaim","Misha Bilenko","Johan Bjorck","Sébastien Bubeck","Martin Cai","Qin Cai","Vishrav Chaudhary","Dong Chen","Dongdong Chen","Weizhu Chen","Yen-Chun Chen","Yi-Ling Chen","Hao Cheng","Parul Chopra","Xiyang Dai","Matthew Dixon","Ronen Eldan","Victor Fragoso","Jianfeng Gao","Mei Gao","Min Gao","Amit Garg","Allie Del Giorno","Abhishek Goswami","Suriya Gunasekar","Emman Haider","Junheng Hao","Russell J. Hewett","Wenxiang Hu","Jamie Huynh","Dan Iter","Sam Ade Jacobs","Mojan Javaheripi","Xin Jin","Nikos Karampatziakis","Piero Kauffmann","Mahoud Khademi","Dongwoo Kim","Young Jin Kim","Lev Kurilenko","James R. 
Lee","Yin Tat Lee","Yuanzhi Li","Yunsheng Li","Chen Liang","Lars Liden","Xihui Lin","Zeqi Lin","Ce Liu","Liyuan Liu","Mengchen Liu","Weishung Liu","Xiaodong Liu","Chong Luo","Piyush Madan","Ali Mahmoudzadeh","David Majercak","Matt Mazzola","Caio César Teodoro Mendes","Arindam Mitra","Hardik Modi","Anh Nguyen","Brandon Norick","Barun Patra","Daniel Perez-Becker","Thomas Portet","Reid Pryzant","Heyang Qin","Marko Radmilac","Liliang Ren","Gustavo de Rosa","Corby Rosset","Sambudha Roy","Olatunji Ruwase","Olli Saarikivi","Amin Saied","Adil Salim","Michael Santacroce","Shital Shah","Ning Shang","Hiteshi Sharma","Yelong Shen","Swadheen Shukla","Xia Song","Masahiro Tanaka","Andrea Tupini","Praneetha Vaddamanu","Chunyu Wang","Guanhua Wang","Lijuan Wang","Shuohang Wang","Xin Wang","Yu Wang","Rachel Ward","Wen Wen","Philipp Witte","Haiping Wu","Xiaoxia Wu","Michael Wyatt","Bin Xiao","Can Xu","Jiahang Xu","Weijian Xu","Jilong Xue","Sonali Yadav","Fan Yang","Jianwei Yang","Yifan Yang","Ziyi Yang","Donghan Yu","Lu Yuan","Chenruidong Zhang","Cyril Zhang","Jianwen Zhang","Li Lyna Zhang","Yi Zhang","Yue Zhang","Yunan Zhang","Xiren Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.14219v4.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2402.10311v7","updated":"2024-08-30T20:18:00Z","published":"2024-02-15T20:24:39Z","title":"The optimal placement of the head in the noun phrase. The case of\n demonstrative, numeral, adjective and noun","summary":" The word order of a sentence is shaped by multiple principles. The principle\nof syntactic dependency distance minimization is in conflict with the principle\nof surprisal minimization (or predictability maximization) in single head\nsyntactic dependency structures: while the former predicts that the head should\nbe placed at the center of the linear arrangement, the latter predicts that the\nhead should be placed at one of the ends (either first or last). A critical\nquestion is when surprisal minimization (or predictability maximization) should\nsurpass syntactic dependency distance minimization. In the context of single\nhead structures, it has been predicted that this is more likely to happen when\ntwo conditions are met, i.e. (a) fewer words are involved and (b) words are\nshorter. Here we test the prediction on the noun phrase when it is composed of\na demonstrative, a numeral, an adjective and a noun. We find that, across\npreferred orders in languages, the noun tends to be placed at one of the ends,\nconfirming the theoretical prediction. We also show evidence of anti locality\neffects: syntactic dependency distances in preferred orders are longer than\nexpected by chance.\n","authors":["Ramon Ferrer-i-Cancho"],"pdf_url":"https://arxiv.org/pdf/2402.10311v7.pdf","comment":"typos corrected"},{"id":"http://arxiv.org/abs/2402.04315v3","updated":"2024-08-30T18:24:27Z","published":"2024-02-06T19:00:40Z","title":"Training Language Models to Generate Text with Citations via\n Fine-grained Rewards","summary":" While recent Large Language Models (LLMs) have proven useful in answering\nuser queries, they are prone to hallucination, and their responses often lack\ncredibility due to missing references to reliable sources. An intuitive\nsolution to these issues would be to include in-text citations referring to\nexternal documents as evidence. While previous works have directly prompted\nLLMs to generate in-text citations, their performances are far from\nsatisfactory, especially when it comes to smaller LLMs. 
In this work, we\npropose an effective training framework using fine-grained rewards to teach\nLLMs to generate highly supportive and relevant citations, while ensuring the\ncorrectness of their responses. We also conduct a systematic analysis of\napplying these fine-grained rewards to common LLM training strategies,\ndemonstrating its advantage over conventional practices. We conduct extensive\nexperiments on Question Answering (QA) datasets taken from the ALCE benchmark\nand validate the model's generalizability using EXPERTQA. On LLaMA-2-7B, the\nincorporation of fine-grained rewards achieves the best performance among the\nbaselines, even surpassing that of GPT-3.5-turbo.\n","authors":["Chengyu Huang","Zeqiu Wu","Yushi Hu","Wenya Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04315v3.pdf","comment":"Accepted by ACL 2024"},{"id":"http://arxiv.org/abs/2210.15265v2","updated":"2024-08-30T18:22:23Z","published":"2022-10-27T08:41:46Z","title":"Conversation Disentanglement with Bi-Level Contrastive Learning","summary":" Conversation disentanglement aims to group utterances into detached sessions,\nwhich is a fundamental task in processing multi-party conversations. Existing\nmethods have two main drawbacks. First, they overemphasize pairwise utterance\nrelations but pay inadequate attention to the utterance-to-context relation\nmodeling. Second, huge amount of human annotated data is required for training,\nwhich is expensive to obtain in practice. To address these issues, we propose a\ngeneral disentangle model based on bi-level contrastive learning. It brings\ncloser utterances in the same session while encourages each utterance to be\nnear its clustered session prototypes in the representation space. Unlike\nexisting approaches, our disentangle model works in both supervised setting\nwith labeled data and unsupervised setting when no such data is available. The\nproposed method achieves new state-of-the-art performance on both settings\nacross several public datasets.\n","authors":["Chengyu Huang","Zheng Zhang","Hao Fei","Lizi Liao"],"pdf_url":"https://arxiv.org/pdf/2210.15265v2.pdf","comment":"Accepted by EMNLP 2022 Findings"},{"id":"http://arxiv.org/abs/2408.16672v2","updated":"2024-08-30T18:14:24Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. In\nthis paper, we introduce a novel architecture and a training framework to\nsupport long context window and multilingual retrieval. Leveraging Matryoshka\nRepresentation Loss, we further demonstrate that the reducing the embedding\ndimensionality from 128 to 64 has insignificant impact on the model's retrieval\nperformance and cut storage requirements by up to 50%. 
Our new model,\nJina-ColBERT-v2, demonstrates strong performance across a range of English and\nmultilingual retrieval tasks,\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Georgios Mastrapas","Saba Sturua","Isabelle Mohr","Andreas Koukounas","Mohammad Kalim Akram","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.17443v1","updated":"2024-08-30T17:52:55Z","published":"2024-08-30T17:52:55Z","title":"Bridging Episodes and Semantics: A Novel Framework for Long-Form Video\n Understanding","summary":" While existing research often treats long-form videos as extended short\nvideos, we propose a novel approach that more accurately reflects human\ncognition. This paper introduces BREASE: BRidging Episodes And SEmantics for\nLong-Form Video Understanding, a model that simulates episodic memory\naccumulation to capture action sequences and reinforces them with semantic\nknowledge dispersed throughout the video. Our work makes two key contributions:\nFirst, we develop an Episodic COmpressor (ECO) that efficiently aggregates\ncrucial representations from micro to semi-macro levels. Second, we propose a\nSemantics reTRiever (SeTR) that enhances these aggregated representations with\nsemantic information by focusing on the broader context, dramatically reducing\nfeature dimensionality while preserving relevant macro-level information.\nExtensive experiments demonstrate that BREASE achieves state-of-the-art\nperformance across multiple long video understanding benchmarks in both\nzero-shot and fully-supervised settings. The project page and code are at:\nhttps://joslefaure.github.io/assets/html/hermes.html.\n","authors":["Gueter Josmy Faure","Jia-Fong Yeh","Min-Hung Chen","Hung-Ting Su","Winston H. Hsu","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.17443v1.pdf","comment":"Accepted to the EVAL-FoMo Workshop at ECCV'24. Project page:\n https://joslefaure.github.io/assets/html/hermes.html"},{"id":"http://arxiv.org/abs/2403.16210v2","updated":"2024-08-30T17:39:50Z","published":"2024-03-24T16:09:21Z","title":"Frankenstein: Generating Semantic-Compositional 3D Scenes in One\n Tri-Plane","summary":" We present Frankenstein, a diffusion-based framework that can generate\nsemantic-compositional 3D scenes in a single pass. Unlike existing methods that\noutput a single, unified 3D shape, Frankenstein simultaneously generates\nmultiple separated shapes, each corresponding to a semantically meaningful\npart. The 3D scene information is encoded in one single tri-plane tensor, from\nwhich multiple Signed Distance Function (SDF) fields can be decoded to\nrepresent the compositional shapes. During training, an auto-encoder compresses\ntri-planes into a latent space, and then the denoising diffusion process is\nemployed to approximate the distribution of the compositional scenes.\nFrankenstein demonstrates promising results in generating room interiors as\nwell as human avatars with automatically separated parts. The generated scenes\nfacilitate many downstream applications, such as part-wise re-texturing, object\nrearrangement in the room or avatar cloth re-targeting. 
Our project page is\navailable at: https://wolfball.github.io/frankenstein/.\n","authors":["Han Yan","Yang Li","Zhennan Wu","Shenzhou Chen","Weixuan Sun","Taizhang Shang","Weizhe Liu","Tian Chen","Xiaqiang Dai","Chao Ma","Hongdong Li","Pan Ji"],"pdf_url":"https://arxiv.org/pdf/2403.16210v2.pdf","comment":"SIGGRAPH Asia 2024 Conference Paper"},{"id":"http://arxiv.org/abs/2408.17433v1","updated":"2024-08-30T17:35:06Z","published":"2024-08-30T17:35:06Z","title":"DARES: Depth Anything in Robotic Endoscopic Surgery with Self-supervised\n Vector-LoRA of the Foundation Model","summary":" Robotic-assisted surgery (RAS) relies on accurate depth estimation for 3D\nreconstruction and visualization. While foundation models like Depth Anything\nModels (DAM) show promise, directly applying them to surgery often yields\nsuboptimal results. Fully fine-tuning on limited surgical data can cause\noverfitting and catastrophic forgetting, compromising model robustness and\ngeneralization. Although Low-Rank Adaptation (LoRA) addresses some adaptation\nissues, its uniform parameter distribution neglects the inherent feature\nhierarchy, where earlier layers, learning more general features, require more\nparameters than later ones. To tackle this issue, we introduce Depth Anything\nin Robotic Endoscopic Surgery (DARES), a novel approach that employs a new\nadaptation technique, Vector Low-Rank Adaptation (Vector-LoRA) on the DAM V2 to\nperform self-supervised monocular depth estimation in RAS scenes. To enhance\nlearning efficiency, we introduce Vector-LoRA by integrating more parameters in\nearlier layers and gradually decreasing parameters in later layers. We also\ndesign a reprojection loss based on the multi-scale SSIM error to enhance depth\nperception by better tailoring the foundation model to the specific\nrequirements of the surgical environment. The proposed method is validated on\nthe SCARED dataset and demonstrates superior performance over recent\nstate-of-the-art self-supervised monocular depth estimation techniques,\nachieving an improvement of 13.3% in the absolute relative error metric. The\ncode and pre-trained weights are available at\nhttps://github.com/mobarakol/DARES.\n","authors":["Mona Sheikh Zeinoddin","Chiara Lena","Jiongqi Qu","Luca Carlini","Mattia Magro","Seunghoi Kim","Elena De Momi","Sophia Bano","Matthew Grech-Sollars","Evangelos Mazomenos","Daniel C. Alexander","Danail Stoyanov","Matthew J. Clarkson","Mobarakol Islam"],"pdf_url":"https://arxiv.org/pdf/2408.17433v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.17424v1","updated":"2024-08-30T17:16:18Z","published":"2024-08-30T17:16:18Z","title":"CinePreGen: Camera Controllable Video Previsualization via\n Engine-powered Diffusion","summary":" With advancements in video generative AI models (e.g., SORA), creators are\nincreasingly using these techniques to enhance video previsualization. However,\nthey face challenges with incomplete and mismatched AI workflows. Existing\nmethods mainly rely on text descriptions and struggle with camera placement, a\nkey component of previsualization. To address these issues, we introduce\nCinePreGen, a visual previsualization system enhanced with engine-powered\ndiffusion. It features a novel camera and storyboard interface that offers\ndynamic control, from global to local camera adjustments. This is combined with\na user-friendly AI rendering workflow, which aims to achieve consistent results\nthrough multi-masked IP-Adapter and engine simulation guidelines. 
In our\ncomprehensive evaluation study, we demonstrate that our system reduces\ndevelopment viscosity (i.e., the complexity and challenges in the development\nprocess), meets users' needs for extensive control and iteration in the design\nprocess, and outperforms other AI video production workflows in cinematic\ncamera movement, as shown by our experiments and a within-subjects user study.\nWith its intuitive camera controls and realistic rendering of camera motion,\nCinePreGen shows great potential for improving video production for both\nindividual creators and industry professionals.\n","authors":["Yiran Chen","Anyi Rao","Xuekun Jiang","Shishi Xiao","Ruiqing Ma","Zeyu Wang","Hui Xiong","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2408.17424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17422v1","updated":"2024-08-30T17:12:14Z","published":"2024-08-30T17:12:14Z","title":"Open-vocabulary Temporal Action Localization using VLMs","summary":" Video action localization aims to find timings of a specific action from a\nlong video. Although existing learning-based approaches have been successful,\nthose require annotating videos that come with a considerable labor cost. This\npaper proposes a learning-free, open-vocabulary approach based on emerging\nvision-language models (VLM). The challenge stems from the fact that VLMs are\nneither designed to process long videos nor tailored for finding actions. We\novercome these problems by extending an iterative visual prompting technique.\nSpecifically, we sample video frames into a concatenated image with frame index\nlabels, making a VLM guess a frame that is considered to be closest to the\nstart/end of the action. Iterating this process by narrowing a sampling time\nwindow results in finding a specific frame of start and end of an action. We\ndemonstrate that this sampling technique yields reasonable results,\nillustrating a practical extension of VLMs for understanding videos.\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2408.17422v1.pdf","comment":"7 pages, 5 figures, 4 tables. Last updated on August 30th, 2024"},{"id":"http://arxiv.org/abs/2408.17421v1","updated":"2024-08-30T17:11:36Z","published":"2024-08-30T17:11:36Z","title":"Generative AI Enables Medical Image Segmentation in Ultra Low-Data\n Regimes","summary":" Semantic segmentation of medical images is pivotal in applications like\ndisease diagnosis and treatment planning. While deep learning has excelled in\nautomating this task, a major hurdle is the need for numerous annotated\nsegmentation masks, which are resource-intensive to produce due to the required\nexpertise and time. This scenario often leads to ultra low-data regimes, where\nannotated images are extremely limited, posing significant challenges for the\ngeneralization of conventional deep learning methods on test images. To address\nthis, we introduce a generative deep learning framework, which uniquely\ngenerates high-quality paired segmentation masks and medical images, serving as\nauxiliary data for training robust models in data-scarce environments. Unlike\ntraditional generative models that treat data generation and segmentation model\ntraining as separate processes, our method employs multi-level optimization for\nend-to-end data generation. 
This approach allows segmentation performance to\ndirectly influence the data generation process, ensuring that the generated\ndata is specifically tailored to enhance the performance of the segmentation\nmodel. Our method demonstrated strong generalization performance across 9\ndiverse medical image segmentation tasks and on 16 datasets, in ultra-low data\nregimes, spanning various diseases, organs, and imaging modalities. When\napplied to various segmentation models, it achieved performance improvements of\n10-20\\% (absolute), in both same-domain and out-of-domain scenarios. Notably,\nit requires 8 to 20 times less training data than existing methods to achieve\ncomparable results. This advancement significantly improves the feasibility and\ncost-effectiveness of applying deep learning in medical imaging, particularly\nin scenarios with limited data availability.\n","authors":["Li Zhang","Basu Jindal","Ahmed Alaa","Robert Weinreb","David Wilson","Eran Segal","James Zou","Pengtao Xie"],"pdf_url":"https://arxiv.org/pdf/2408.17421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17399v1","updated":"2024-08-30T16:35:28Z","published":"2024-08-30T16:35:28Z","title":"How Knowledge Distillation Mitigates the Synthetic Gap in Fair Face\n Recognition","summary":" Leveraging the capabilities of Knowledge Distillation (KD) strategies, we\ndevise a strategy to fight the recent retraction of face recognition datasets.\nGiven a pretrained Teacher model trained on a real dataset, we show that\ncarefully utilising synthetic datasets, or a mix between real and synthetic\ndatasets to distil knowledge from this teacher to smaller students can yield\nsurprising results. In this sense, we trained 33 different models with and\nwithout KD, on different datasets, with different architectures and losses. And\nour findings are consistent, using KD leads to performance gains across all\nethnicities and decreased bias. In addition, it helps to mitigate the\nperformance gap between real and synthetic datasets. This approach addresses\nthe limitations of synthetic data training, improving both the accuracy and\nfairness of face recognition models.\n","authors":["Pedro C. Neto","Ivona Colakovic","Sašo Karakatič","Ana F. Sequeira"],"pdf_url":"https://arxiv.org/pdf/2408.17399v1.pdf","comment":"Accepted at ECCV 2024 Workshops"},{"id":"http://arxiv.org/abs/2405.18033v2","updated":"2024-08-30T16:14:57Z","published":"2024-05-28T10:34:28Z","title":"RT-GS2: Real-Time Generalizable Semantic Segmentation for 3D Gaussian\n Representations of Radiance Fields","summary":" Gaussian Splatting has revolutionized the world of novel view synthesis by\nachieving high rendering performance in real-time. Recently, studies have\nfocused on enriching these 3D representations with semantic information for\ndownstream tasks. In this paper, we introduce RT-GS2, the first generalizable\nsemantic segmentation method employing Gaussian Splatting. While existing\nGaussian Splatting-based approaches rely on scene-specific training, RT-GS2\ndemonstrates the ability to generalize to unseen scenes. 
Our method adopts a\nnew approach by first extracting view-independent 3D Gaussian features in a\nself-supervised manner, followed by a novel View-Dependent / View-Independent\n(VDVI) feature fusion to enhance semantic consistency over different views.\nExtensive experimentation on three different datasets showcases RT-GS2's\nsuperiority over the state-of-the-art methods in semantic segmentation quality,\nexemplified by a 8.01% increase in mIoU on the Replica dataset. Moreover, our\nmethod achieves real-time performance of 27.03 FPS, marking an astonishing 901\ntimes speedup compared to existing approaches. This work represents a\nsignificant advancement in the field by introducing, to the best of our\nknowledge, the first real-time generalizable semantic segmentation method for\n3D Gaussian representations of radiance fields.\n","authors":["Mihnea-Bogdan Jurca","Remco Royen","Ion Giosan","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2405.18033v2.pdf","comment":"Accepted paper at BMVC 2024"},{"id":"http://arxiv.org/abs/2408.17363v1","updated":"2024-08-30T15:53:48Z","published":"2024-08-30T15:53:48Z","title":"Look, Learn and Leverage (L$^3$): Mitigating Visual-Domain Shift and\n Discovering Intrinsic Relations via Symbolic Alignment","summary":" Modern deep learning models have demonstrated outstanding performance on\ndiscovering the underlying mechanisms when both visual appearance and intrinsic\nrelations (e.g., causal structure) data are sufficient, such as Disentangled\nRepresentation Learning (DRL), Causal Representation Learning (CRL) and Visual\nQuestion Answering (VQA) methods. However, generalization ability of these\nmodels is challenged when the visual domain shifts and the relations data is\nabsent during finetuning. To address this challenge, we propose a novel\nlearning framework, Look, Learn and Leverage (L$^3$), which decomposes the\nlearning process into three distinct phases and systematically utilize the\nclass-agnostic segmentation masks as the common symbolic space to align visual\ndomains. Thus, a relations discovery model can be trained on the source domain,\nand when the visual domain shifts and the intrinsic relations are absent, the\npretrained relations discovery model can be directly reused and maintain a\nsatisfactory performance. Extensive performance evaluations are conducted on\nthree different tasks: DRL, CRL and VQA, and show outstanding results on all\nthree tasks, which reveals the advantages of L$^3$.\n","authors":["Hanchen Xie","Jiageng Zhu","Mahyar Khayatkhoei","Jiazhi Li","Wael AbdAlmageed"],"pdf_url":"https://arxiv.org/pdf/2408.17363v1.pdf","comment":"17 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.15119v3","updated":"2024-08-30T15:29:08Z","published":"2024-08-27T14:58:13Z","title":"A Permuted Autoregressive Approach to Word-Level Recognition for Urdu\n Digital Text","summary":" This research paper introduces a novel word-level Optical Character\nRecognition (OCR) model specifically designed for digital Urdu text, leveraging\ntransformer-based architectures and attention mechanisms to address the\ndistinct challenges of Urdu script recognition, including its diverse text\nstyles, fonts, and variations. The model employs a permuted autoregressive\nsequence (PARSeq) architecture, which enhances its performance by enabling\ncontext-aware inference and iterative refinement through the training of\nmultiple token permutations. 
This method allows the model to adeptly manage\ncharacter reordering and overlapping characters, commonly encountered in Urdu\nscript. Trained on a dataset comprising approximately 160,000 Urdu text images,\nthe model demonstrates a high level of accuracy in capturing the intricacies of\nUrdu script, achieving a CER of 0.178. Despite ongoing challenges in handling\ncertain text variations, the model exhibits superior accuracy and effectiveness\nin practical applications. Future work will focus on refining the model through\nadvanced data augmentation techniques and the integration of context-aware\nlanguage models to further enhance its performance and robustness in Urdu text\nrecognition.\n","authors":["Ahmed Mustafa","Muhammad Tahir Rafique","Muhammad Ijlal Baig","Hasan Sajid","Muhammad Jawad Khan","Karam Dad Kallu"],"pdf_url":"https://arxiv.org/pdf/2408.15119v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17347v1","updated":"2024-08-30T15:22:13Z","published":"2024-08-30T15:22:13Z","title":"LSMS: Language-guided Scale-aware MedSegmentor for Medical Image\n Referring Segmentation","summary":" Conventional medical image segmentation methods have been found inadequate in\nfacilitating physicians with the identification of specific lesions for\ndiagnosis and treatment. Given the utility of text as an instructional format,\nwe introduce a novel task termed Medical Image Referring Segmentation (MIRS),\nwhich requires segmenting specified lesions in images based on the given\nlanguage expressions. Due to the varying object scales in medical images, MIRS\ndemands robust vision-language modeling and comprehensive multi-scale\ninteraction for precise localization and segmentation under linguistic\nguidance. However, existing medical image segmentation methods fall short in\nmeeting these demands, resulting in insufficient segmentation accuracy. In\nresponse, we propose an approach named Language-guided Scale-aware MedSegmentor\n(LSMS), incorporating two appealing designs: (1)~a Scale-aware Vision-Language\nAttention module that leverages diverse convolutional kernels to acquire rich\nvisual knowledge and interact closely with linguistic features, thereby\nenhancing lesion localization capability; (2)~a Full-Scale Decoder that\nglobally models multi-modal features across various scales, capturing\ncomplementary information between scales to accurately outline lesion\nboundaries. Addressing the lack of suitable datasets for MIRS, we constructed a\nvision-language medical dataset called Reference Hepatic Lesion Segmentation\n(RefHL-Seg). This dataset comprises 2,283 abdominal CT slices from 231 cases,\nwith corresponding textual annotations and segmentation masks for various liver\nlesions in images. We validated the performance of LSMS for MIRS and\nconventional medical image segmentation tasks across various datasets. Our LSMS\nconsistently outperforms on all datasets with lower computational costs. The\ncode and datasets will be released.\n","authors":["Shuyi Ouyang","Jinyang Zhang","Xiangye Lin","Xilai Wang","Qingqing Chen","Yen-Wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2408.17347v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.00583v2","updated":"2024-08-30T15:16:43Z","published":"2023-11-30T18:53:03Z","title":"DeformGS: Scene Flow in Highly Deformable Scenes for Deformable Object\n Manipulation","summary":" Teaching robots to fold, drape, or reposition deformable objects such as\ncloth will unlock a variety of automation applications. 
While remarkable\nprogress has been made for rigid object manipulation, manipulating deformable\nobjects poses unique challenges, including frequent occlusions,\ninfinite-dimensional state spaces and complex dynamics. Just as object pose\nestimation and tracking have aided robots for rigid manipulation, dense 3D\ntracking (scene flow) of highly deformable objects will enable new applications\nin robotics while aiding existing approaches, such as imitation learning or\ncreating digital twins with real2sim transfer. We propose DeformGS, an approach\nto recover scene flow in highly deformable scenes, using simultaneous video\ncaptures of a dynamic scene from multiple cameras. DeformGS builds on recent\nadvances in Gaussian splatting, a method that learns the properties of a large\nnumber of Gaussians for state-of-the-art and fast novel-view synthesis.\nDeformGS learns a deformation function to project a set of Gaussians with\ncanonical properties into world space. The deformation function uses a\nneural-voxel encoding and a multilayer perceptron (MLP) to infer Gaussian\nposition, rotation, and a shadow scalar. We enforce physics-inspired\nregularization terms based on conservation of momentum and isometry, which\nleads to trajectories with smaller trajectory errors. We also leverage existing\nfoundation models SAM and XMEM to produce noisy masks, and learn a per-Gaussian\nmask for better physics-inspired regularization. DeformGS achieves high-quality\n3D tracking on highly deformable scenes with shadows and occlusions. In\nexperiments, DeformGS improves 3D tracking by an average of 55.8% compared to\nthe state-of-the-art. With sufficient texture, DeformGS achieves a median\ntracking error of 3.3 mm on a cloth of 1.5 x 1.5 m in area. Website:\nhttps://deformgs.github.io\n","authors":["Bardienus P. Duisterhof","Zhao Mandi","Yunchao Yao","Jia-Wei Liu","Jenny Seidenschwarz","Mike Zheng Shou","Deva Ramanan","Shuran Song","Stan Birchfield","Bowen Wen","Jeffrey Ichnowski"],"pdf_url":"https://arxiv.org/pdf/2312.00583v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11933v3","updated":"2024-08-30T15:08:13Z","published":"2024-06-17T15:41:57Z","title":"OpticalRS-4M: Scaling Efficient Masked Autoencoder Learning on Large\n Remote Sensing Dataset","summary":" Masked Image Modeling (MIM) has become an essential method for building\nfoundational visual models in remote sensing (RS). However, the limitations in\nsize and diversity of existing RS datasets restrict the ability of MIM methods\nto learn generalizable representations. Additionally, conventional MIM\ntechniques, which require reconstructing all tokens, introduce unnecessary\ncomputational overhead. To address these issues, we present a new pre-training\npipeline for RS models, featuring the creation of a large-scale RS dataset and\nan efficient MIM approach. We curated a high-quality dataset named OpticalRS-4M\nby collecting publicly available RS datasets and processing them through\nexclusion, slicing, and deduplication. OpticalRS-4M comprises 4 million optical\nimages covering various RS tasks, such as object detection and pixel\nsegmentation. To enhance efficiency, we propose SelectiveMAE, a pre-training\nmethod that dynamically encodes and reconstructs semantically rich patch\ntokens, thereby reducing the inefficiencies of traditional MIM models caused by\nredundant background pixels in RS images. 
Extensive experiments demonstrate\nthat OpticalRS-4M significantly improves classification, detection, and\nsegmentation performance, while SelectiveMAE increases training efficiency over\n2 times. This highlights the effectiveness and scalability of our pipeline in\ndeveloping RS foundational models.\n","authors":["Fengxiang Wang","Hongzhen Wang","Di Wang","Zonghao Guo","Zhenyu Zhong","Long Lan","Jing Zhang","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2406.11933v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17339v1","updated":"2024-08-30T15:06:45Z","published":"2024-08-30T15:06:45Z","title":"Enhancing Underwater Imaging with 4-D Light Fields: Dataset and Method","summary":" In this paper, we delve into the realm of 4-D light fields (LFs) to enhance\nunderwater imaging plagued by light absorption, scattering, and other\nchallenges. Contrasting with conventional 2-D RGB imaging, 4-D LF imaging\nexcels in capturing scenes from multiple perspectives, thereby indirectly\nembedding geometric information. This intrinsic property is anticipated to\neffectively address the challenges associated with underwater imaging. By\nleveraging both explicit and implicit depth cues present in 4-D LF images, we\npropose a progressive, mutually reinforcing framework for underwater 4-D LF\nimage enhancement and depth estimation. Specifically, our framework explicitly\nutilizes estimated depth information alongside implicit depth-related dynamic\nconvolutional kernels to modulate output features. The entire framework\ndecomposes this complex task, iteratively optimizing the enhanced image and\ndepth information to progressively achieve optimal enhancement results. More\nimportantly, we construct the first 4-D LF-based underwater image dataset for\nquantitative evaluation and supervised training of learning-based methods,\ncomprising 75 underwater scenes and 3675 high-resolution 2K pairs. To craft\nvibrant and varied underwater scenes, we build underwater environments with\nvarious objects and adopt several types of degradation. Through extensive\nexperimentation, we showcase the potential and superiority of 4-D LF-based\nunderwater imaging vis-a-vis traditional 2-D RGB-based approaches. Moreover,\nour method effectively corrects color bias and achieves state-of-the-art\nperformance. The dataset and code will be publicly available at\nhttps://github.com/linlos1234/LFUIE.\n","authors":["Yuji Lin","Xianqiang Lyu","Junhui Hou","Qian Zhao","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2408.17339v1.pdf","comment":"14 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.09869v3","updated":"2024-08-30T15:05:58Z","published":"2024-08-19T10:20:06Z","title":"Docling Technical Report","summary":" This technical report introduces Docling, an easy to use, self-contained,\nMIT-licensed open-source package for PDF document conversion. It is powered by\nstate-of-the-art specialized AI models for layout analysis (DocLayNet) and\ntable structure recognition (TableFormer), and runs efficiently on commodity\nhardware in a small resource budget. The code interface allows for easy\nextensibility and addition of new features and models.\n","authors":["Christoph Auer","Maksym Lysak","Ahmed Nassar","Michele Dolfi","Nikolaos Livathinos","Panos Vagenas","Cesar Berrospi Ramis","Matteo Omenetti","Fabian Lindlbauer","Kasper Dinkla","Lokesh Mishra","Yusik Kim","Shubham Gupta","Rafael Teixeira de Lima","Valery Weber","Lucas Morin","Ingmar Meijer","Viktor Kuropiatnyk","Peter W. J. 
Staar"],"pdf_url":"https://arxiv.org/pdf/2408.09869v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17337v1","updated":"2024-08-30T15:02:22Z","published":"2024-08-30T15:02:22Z","title":"Evaluating Reliability in Medical DNNs: A Critical Analysis of Feature\n and Confidence-Based OOD Detection","summary":" Reliable use of deep neural networks (DNNs) for medical image analysis\nrequires methods to identify inputs that differ significantly from the training\ndata, called out-of-distribution (OOD), to prevent erroneous predictions. OOD\ndetection methods can be categorised as either confidence-based (using the\nmodel's output layer for OOD detection) or feature-based (not using the output\nlayer). We created two new OOD benchmarks by dividing the D7P (dermatology) and\nBreastMNIST (ultrasound) datasets into subsets which either contain or don't\ncontain an artefact (rulers or annotations respectively). Models were trained\nwith artefact-free images, and images with the artefacts were used as OOD test\nsets. For each OOD image, we created a counterfactual by manually removing the\nartefact via image processing, to assess the artefact's impact on the model's\npredictions. We show that OOD artefacts can boost a model's softmax confidence\nin its predictions, due to correlations in training data among other factors.\nThis contradicts the common assumption that OOD artefacts should lead to more\nuncertain outputs, an assumption on which most confidence-based methods rely.\nWe use this to explain why feature-based methods (e.g. Mahalanobis score)\ntypically have greater OOD detection performance than confidence-based methods\n(e.g. MCP). However, we also show that feature-based methods typically perform\nworse at distinguishing between inputs that lead to correct and incorrect\npredictions (for both OOD and ID data). Following from these insights, we argue\nthat a combination of feature-based and confidence-based methods should be used\nwithin DNN pipelines to mitigate their respective weaknesses. These project's\ncode and OOD benchmarks are available at:\nhttps://github.com/HarryAnthony/Evaluating_OOD_detection.\n","authors":["Harry Anthony","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2408.17337v1.pdf","comment":"Accepted for the Uncertainty for Safe Utilization of Machine Learning\n in Medical Imaging (UNSURE 2024) workshop at the MICCAI 2023"},{"id":"http://arxiv.org/abs/2406.18249v2","updated":"2024-08-30T14:36:08Z","published":"2024-06-26T10:51:44Z","title":"Foundational Models for Pathology and Endoscopy Images: Application for\n Gastric Inflammation","summary":" The integration of artificial intelligence (AI) in medical diagnostics\nrepresents a significant advancement in managing upper gastrointestinal (GI)\ncancer, a major cause of global cancer mortality. Specifically for gastric\ncancer (GC), chronic inflammation causes changes in the mucosa such as atrophy,\nintestinal metaplasia (IM), dysplasia and ultimately cancer. Early detection\nthrough endoscopic regular surveillance is essential for better outcomes.\nFoundation models (FM), which are machine or deep learning models trained on\ndiverse data and applicable to broad use cases, offer a promising solution to\nenhance the accuracy of endoscopy and its subsequent pathology image analysis.\nThis review explores the recent advancements, applications, and challenges\nassociated with FM in endoscopy and pathology imaging. 
We started by\nelucidating the core principles and architectures underlying these models,\nincluding their training methodologies and the pivotal role of large-scale data\nin developing their predictive capabilities. Moreover, this work discusses\nemerging trends and future research directions, emphasizing the integration of\nmultimodal data, the development of more robust and equitable models, and the\npotential for real-time diagnostic support. This review aims to provide a\nroadmap for researchers and practitioners in navigating the complexities of\nincorporating FM into clinical practice for prevention/management of GC cases,\nthereby improving patient outcomes.\n","authors":["Hamideh Kerdegari","Kyle Higgins","Dennis Veselkov","Ivan Laponogov","Inese Polaka","Miguel Coimbra","Junior Andrea Pescino","Marcis Leja","Mario Dinis-Ribeiro","Tania Fleitas Kanonnikoff","Kirill Veselkov"],"pdf_url":"https://arxiv.org/pdf/2406.18249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17322v1","updated":"2024-08-30T14:32:25Z","published":"2024-08-30T14:32:25Z","title":"Investigating Neuron Ablation in Attention Heads: The Case for Peak\n Activation Centering","summary":" The use of transformer-based models is growing rapidly throughout society.\nWith this growth, it is important to understand how they work, and in\nparticular, how the attention mechanisms represent concepts. Though there are\nmany interpretability methods, many look at models through their neuronal\nactivations, which are poorly understood. We describe different lenses through\nwhich to view neuron activations, and investigate the effectiveness in language\nmodels and vision transformers through various methods of neural ablation: zero\nablation, mean ablation, activation resampling, and a novel approach we term\n'peak ablation'. Through experimental analysis, we find that in different\nregimes and models, each method can offer the lowest degradation of model\nperformance compared to other methods, with resampling usually causing the most\nsignificant performance deterioration. We make our code available at\nhttps://github.com/nickypro/investigating-ablation.\n","authors":["Nicholas Pochinkov","Ben Pasero","Skylar Shibayama"],"pdf_url":"https://arxiv.org/pdf/2408.17322v1.pdf","comment":"9 pages, 2 figures, XAI World Conference 2024 Late-Breaking Work"},{"id":"http://arxiv.org/abs/2408.17311v1","updated":"2024-08-30T14:15:48Z","published":"2024-08-30T14:15:48Z","title":"Structuring a Training Strategy to Robustify Perception Models with\n Realistic Image Augmentations","summary":" Advancing Machine Learning (ML)-based perception models for autonomous\nsystems necessitates addressing weak spots within the models, particularly in\nchallenging Operational Design Domains (ODDs). These are environmental\noperating conditions of an autonomous vehicle which can contain difficult\nconditions, e.g., lens flare at night or objects reflected in a wet street.\nThis report introduces a novel methodology for training with augmentations to\nenhance model robustness and performance in such conditions. The proposed\napproach leverages customized physics-based augmentation functions, to generate\nrealistic training data that simulates diverse ODD scenarios.\n We present a comprehensive framework that includes identifying weak spots in\nML models, selecting suitable augmentations, and devising effective training\nstrategies. 
The methodology integrates hyperparameter optimization and latent\nspace optimization to fine-tune augmentation parameters, ensuring they\nmaximally improve the ML models' performance. Experimental results demonstrate\nimprovements in model performance, as measured by commonly used metrics such as\nmean Average Precision (mAP) and mean Intersection over Union (mIoU) on\nopen-source object detection and semantic segmentation models and datasets.\n Our findings emphasize that optimal training strategies are model- and\ndata-specific and highlight the benefits of integrating augmentations into the\ntraining pipeline. By incorporating augmentations, we observe enhanced\nrobustness of ML-based perception models, making them more resilient to edge\ncases encountered in real-world ODDs. This work underlines the importance of\ncustomized augmentations and offers an effective solution for improving the\nsafety and reliability of autonomous driving functions.\n","authors":["Ahmed Hammam","Bharathwaj Krishnaswami Sreedhar","Nura Kawa","Tim Patzelt","Oliver De Candido"],"pdf_url":"https://arxiv.org/pdf/2408.17311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01476v2","updated":"2024-08-30T14:09:36Z","published":"2024-06-03T16:05:25Z","title":"DreamPhysics: Learning Physical Properties of Dynamic 3D Gaussians with\n Video Diffusion Priors","summary":" Dynamic 3D interaction has been attracting a lot of attention recently.\nHowever, creating such 4D content remains challenging. One solution is to\nanimate 3D scenes with physics-based simulation, which requires manually\nassigning precise physical properties to the object or the simulated results\nwould become unnatural. Another solution is to learn the deformation of 3D\nobjects with the distillation of video generative models, which, however, tends\nto produce 3D videos with small and discontinuous motions due to the\ninappropriate extraction and application of physical prior. In this work,\ncombining the strengths and complementing shortcomings of the above two\nsolutions, we propose to learn the physical properties of a material field with\nvideo diffusion priors, and then utilize a physics-based Material-Point-Method\n(MPM) simulator to generate 4D content with realistic motions. In particular,\nwe propose motion distillation sampling to emphasize video motion information\nduring distillation. Moreover, to facilitate the optimization, we further\npropose a KAN-based material field with frame boosting. Experimental results\ndemonstrate that our method enjoys more realistic motion than\nstate-of-the-arts. Codes are released at:\nhttps://github.com/tyhuang0428/DreamPhysics.\n","authors":["Tianyu Huang","Haoze Zhang","Yihan Zeng","Zhilu Zhang","Hui Li","Wangmeng Zuo","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2406.01476v2.pdf","comment":"Codes are released at: https://github.com/tyhuang0428/DreamPhysics"},{"id":"http://arxiv.org/abs/2408.03677v3","updated":"2024-08-30T13:55:50Z","published":"2024-08-07T10:36:26Z","title":"L4DR: LiDAR-4DRadar Fusion for Weather-Robust 3D Object Detection","summary":" LiDAR-based vision systems are integral for 3D object detection, which is\ncrucial for autonomous navigation. However, they suffer from performance\ndegradation in adverse weather conditions due to the quality deterioration of\nLiDAR point clouds. Fusing LiDAR with the weather-robust 4D radar sensor is\nexpected to solve this problem. 
However, the fusion of LiDAR and 4D radar is\nchallenging because they differ significantly in terms of data quality and the\ndegree of degradation in adverse weather. To address these issues, we introduce\nL4DR, a weather-robust 3D object detection method that effectively achieves\nLiDAR and 4D Radar fusion. Our L4DR includes Multi-Modal Encoding (MME) and\nForeground-Aware Denoising (FAD) technique to reconcile sensor gaps, which is\nthe first exploration of the complementarity of early fusion between LiDAR and\n4D radar. Additionally, we design an Inter-Modal and Intra-Modal ({IM}2 )\nparallel feature extraction backbone coupled with a Multi-Scale Gated Fusion\n(MSGF) module to counteract the varying degrees of sensor degradation under\nadverse weather conditions. Experimental evaluation on a VoD dataset with\nsimulated fog proves that L4DR is more adaptable to changing weather\nconditions. It delivers a significant performance increase under different fog\nlevels, improving the 3D mAP by up to 20.0% over the traditional LiDAR-only\napproach. Moreover, the results on the K-Radar dataset validate the consistent\nperformance improvement of L4DR in real-world adverse weather conditions.\n","authors":["Xun Huang","Ziyu Xu","Hai Wu","Jinlong Wang","Qiming Xia","Yan Xia","Jonathan Li","Kyle Gao","Chenglu Wen","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03677v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17297v1","updated":"2024-08-30T13:52:26Z","published":"2024-08-30T13:52:26Z","title":"BOP-D: Revisiting 6D Pose Estimation Benchmark for Better Evaluation\n under Visual Ambiguities","summary":" Currently, 6D pose estimation methods are benchmarked on datasets that\nconsider, for their ground truth annotations, visual ambiguities as only\nrelated to global object symmetries. However, as previously observed [26],\nvisual ambiguities can also happen depending on the viewpoint or the presence\nof occluding objects, when disambiguating parts become hidden. The visual\nambiguities are therefore actually different across images. We thus first\npropose an automatic method to re-annotate those datasets with a 6D pose\ndistribution specific to each image, taking into account the visibility of the\nobject surface in the image to correctly determine the visual ambiguities.\nGiven this improved ground truth, we re-evaluate the state-of-the-art methods\nand show this greatly modify the ranking of these methods. Our annotations also\nallow us to benchmark recent methods able to estimate a pose distribution on\nreal images for the first time. We will make our annotations for the T-LESS\ndataset and our code publicly available.\n","authors":["Boris Meden","Asma Brazi","Steve Bourgeois","Fabrice Mayran de Chamisso","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2408.17297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11748v3","updated":"2024-08-30T13:52:12Z","published":"2024-08-21T16:16:18Z","title":"GeoMeter: Probing Depth and Height Perception of Large Visual-Language\n Models","summary":" Geometric understanding is crucial for navigating and interacting with our\nenvironment. While large Vision Language Models (VLMs) demonstrate impressive\ncapabilities, deploying them in real-world scenarios necessitates a comparable\ngeometric understanding in visual perception. In this work, we focus on the\ngeometric comprehension of these models; specifically targeting the depths and\nheights of objects within a scene. 
Our observations reveal that, although VLMs\nexcel in basic geometric properties perception such as shape and size, they\nencounter significant challenges in reasoning about the depth and height of\nobjects. To address this, we introduce GeoMeter, a suite of benchmark datasets\nencompassing Synthetic 2D, Synthetic 3D, and Real-World scenarios to rigorously\nevaluate these aspects. We benchmark 17 state-of-the-art VLMs using these\ndatasets and find that they consistently struggle with both depth and height\nperception. Our key insights include detailed analyses of the shortcomings in\ndepth and height reasoning capabilities of VLMs and the inherent bias present\nin these models. This study aims to pave the way for the development of VLMs\nwith enhanced geometric understanding, crucial for real-world applications.\n","authors":["Shehreen Azad","Yash Jain","Rishit Garg","Yogesh S Rawat","Vibhav Vineet"],"pdf_url":"https://arxiv.org/pdf/2408.11748v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16227v2","updated":"2024-08-30T13:48:14Z","published":"2024-08-29T02:58:35Z","title":"Revisiting 360 Depth Estimation with PanoGabor: A New Fusion Perspective","summary":" Depth estimation from a monocular 360 image is important to the perception of\nthe entire 3D environment. However, the inherent distortion and large field of\nview (FoV) in 360 images pose great challenges for this task. To this end,\nexisting mainstream solutions typically introduce additional perspective-based\n360 representations (\\textit{e.g.}, Cubemap) to achieve effective feature\nextraction. Nevertheless, regardless of the introduced representations, they\neventually need to be unified into the equirectangular projection (ERP) format\nfor the subsequent depth estimation, which inevitably reintroduces the\ntroublesome distortions. In this work, we propose an oriented distortion-aware\nGabor Fusion framework (PGFuse) to address the above challenges. First, we\nintroduce Gabor filters that analyze texture in the frequency domain, thereby\nextending the receptive fields and enhancing depth cues. To address the\nreintroduced distortions, we design a linear latitude-aware distortion\nrepresentation method to generate customized, distortion-aware Gabor filters\n(PanoGabor filters). Furthermore, we design a channel-wise and spatial-wise\nunidirectional fusion module (CS-UFM) that integrates the proposed PanoGabor\nfilters to unify other representations into the ERP format, delivering\neffective and distortion-free features. Considering the orientation sensitivity\nof the Gabor transform, we introduce a spherical gradient constraint to\nstabilize this sensitivity. Experimental results on three popular indoor 360\nbenchmarks demonstrate the superiority of the proposed PGFuse to existing\nstate-of-the-art solutions. Code can be available upon acceptance.\n","authors":["Zhijie Shen","Chunyu Lin","Lang Nie","Kang Liao"],"pdf_url":"https://arxiv.org/pdf/2408.16227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17284v1","updated":"2024-08-30T13:31:15Z","published":"2024-08-30T13:31:15Z","title":"DCUDF2: Improving Efficiency and Accuracy in Extracting Zero Level Sets\n from Unsigned Distance Fields","summary":" Unsigned distance fields (UDFs) allow for the representation of models with\ncomplex topologies, but extracting accurate zero level sets from these fields\nposes significant challenges, particularly in preserving topological accuracy\nand capturing fine geometric details. 
To overcome these issues, we introduce\nDCUDF2, an enhancement over DCUDF--the current state-of-the-art method--for\nextracting zero level sets from UDFs. Our approach utilizes an accuracy-aware\nloss function, enhanced with self-adaptive weights, to improve geometric\nquality significantly. We also propose a topology correction strategy that\nreduces the dependence on hyper-parameter, increasing the robustness of our\nmethod. Furthermore, we develop new operations leveraging self-adaptive weights\nto boost runtime efficiency. Extensive experiments on surface extraction across\ndiverse datasets demonstrate that DCUDF2 outperforms DCUDF and existing methods\nin both geometric fidelity and topological accuracy. We will make the source\ncode publicly available.\n","authors":["Xuhui Chen","Fugang Yu","Fei Hou","Wencheng Wang","Zhebin Zhang","Ying He"],"pdf_url":"https://arxiv.org/pdf/2408.17284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05735v3","updated":"2024-08-30T13:28:38Z","published":"2024-01-11T08:36:15Z","title":"Object-Centric Diffusion for Efficient Video Editing","summary":" Diffusion-based video editing have reached impressive quality and can\ntransform either the global style, local structure, and attributes of given\nvideo inputs, following textual edit prompts. However, such solutions typically\nincur heavy memory and computational costs to generate temporally-coherent\nframes, either in the form of diffusion inversion and/or cross-frame attention.\nIn this paper, we conduct an analysis of such inefficiencies, and suggest\nsimple yet effective modifications that allow significant speed-ups whilst\nmaintaining quality. Moreover, we introduce Object-Centric Diffusion, to fix\ngeneration artifacts and further reduce latency by allocating more computations\ntowards foreground edited regions, arguably more important for perceptual\nquality. We achieve this by two novel proposals: i) Object-Centric Sampling,\ndecoupling the diffusion steps spent on salient or background regions and\nspending most on the former, and ii) Object-Centric Token Merging, which\nreduces cost of cross-frame attention by fusing redundant tokens in unimportant\nbackground regions. Both techniques are readily applicable to a given video\nediting model without retraining, and can drastically reduce its memory and\ncomputational cost. We evaluate our proposals on inversion-based and\ncontrol-signal-based editing pipelines, and show a latency reduction up to 10x\nfor a comparable synthesis quality. Project page:\nqualcomm-ai-research.github.io/object-centric-diffusion.\n","authors":["Kumara Kahatapitiya","Adil Karjauv","Davide Abati","Fatih Porikli","Yuki M. Asano","Amirhossein Habibian"],"pdf_url":"https://arxiv.org/pdf/2401.05735v3.pdf","comment":"ECCV24"},{"id":"http://arxiv.org/abs/2407.00697v3","updated":"2024-08-30T13:25:50Z","published":"2024-06-30T13:39:29Z","title":"CaFNet: A Confidence-Driven Framework for Radar Camera Depth Estimation","summary":" Depth estimation is critical in autonomous driving for interpreting 3D scenes\naccurately. Recently, radar-camera depth estimation has become of sufficient\ninterest due to the robustness and low-cost properties of radar. Thus, this\npaper introduces a two-stage, end-to-end trainable Confidence-aware Fusion Net\n(CaFNet) for dense depth estimation, combining RGB imagery with sparse and\nnoisy radar point cloud data. 
The first stage addresses radar-specific\nchallenges, such as ambiguous elevation and noisy measurements, by predicting a\nradar confidence map and a preliminary coarse depth map. A novel approach is\npresented for generating the ground truth for the confidence map, which\ninvolves associating each radar point with its corresponding object to identify\npotential projection surfaces. These maps, together with the initial radar\ninput, are processed by a second encoder. For the final depth estimation, we\ninnovate a confidence-aware gated fusion mechanism to integrate radar and image\nfeatures effectively, thereby enhancing the reliability of the depth map by\nfiltering out radar noise. Our methodology, evaluated on the nuScenes dataset,\ndemonstrates superior performance, improving upon the current leading model by\n3.2% in Mean Absolute Error (MAE) and 2.7% in Root Mean Square Error (RMSE).\nCode: https://github.com/harborsarah/CaFNet\n","authors":["Huawei Sun","Hao Feng","Julius Ott","Lorenzo Servadei","Robert Wille"],"pdf_url":"https://arxiv.org/pdf/2407.00697v3.pdf","comment":"Accepted by IROS 2024"},{"id":"http://arxiv.org/abs/2408.17267v1","updated":"2024-08-30T13:13:35Z","published":"2024-08-30T13:13:35Z","title":"UrBench: A Comprehensive Benchmark for Evaluating Large Multimodal\n Models in Multi-View Urban Scenarios","summary":" Recent evaluations of Large Multimodal Models (LMMs) have explored their\ncapabilities in various domains, with only few benchmarks specifically focusing\non urban environments. Moreover, existing urban benchmarks have been limited to\nevaluating LMMs with basic region-level urban tasks under singular views,\nleading to incomplete evaluations of LMMs' abilities in urban environments. To\naddress these issues, we present UrBench, a comprehensive benchmark designed\nfor evaluating LMMs in complex multi-view urban scenarios. UrBench contains\n11.6K meticulously curated questions at both region-level and role-level that\ncover 4 task dimensions: Geo-Localization, Scene Reasoning, Scene\nUnderstanding, and Object Understanding, totaling 14 task types. In\nconstructing UrBench, we utilize data from existing datasets and additionally\ncollect data from 11 cities, creating new annotations using a cross-view\ndetection-matching method. With these images and annotations, we then integrate\nLMM-based, rule-based, and human-based methods to construct large-scale\nhigh-quality questions. Our evaluations on 21 LMMs show that current LMMs\nstruggle in the urban environments in several aspects. Even the best performing\nGPT-4o lags behind humans in most tasks, ranging from simple tasks such as\ncounting to complex tasks such as orientation, localization and object\nattribute recognition, with an average performance gap of 17.4%. Our benchmark\nalso reveals that LMMs exhibit inconsistent behaviors with different urban\nviews, especially with respect to understanding cross-view relations. 
UrBench\ndatasets and benchmark results will be publicly available at\nhttps://opendatalab.github.io/UrBench/.\n","authors":["Baichuan Zhou","Haote Yang","Dairong Chen","Junyan Ye","Tianyi Bai","Jinhua Yu","Songyang Zhang","Dahua Lin","Conghui He","Weijia Li"],"pdf_url":"https://arxiv.org/pdf/2408.17267v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.15761v2","updated":"2024-08-30T13:13:04Z","published":"2024-08-28T12:56:00Z","title":"Addressing the challenges of loop detection in agricultural environments","summary":" While visual SLAM systems are well studied and achieve impressive results in\nindoor and urban settings, natural, outdoor and open-field environments are\nmuch less explored and still present relevant research challenges. Visual\nnavigation and local mapping have shown a relatively good performance in\nopen-field environments. However, globally consistent mapping and long-term\nlocalization still depend on the robustness of loop detection and closure, for\nwhich the literature is scarce. In this work we propose a novel method to pave\nthe way towards robust loop detection in open fields, particularly in\nagricultural settings, based on local feature search and stereo geometric\nrefinement, with a final stage of relative pose estimation. Our method\nconsistently achieves good loop detections, with a median error of 15cm. We aim\nto characterize open fields as a novel environment for loop detection,\nunderstanding the limitations and problems that arise when dealing with them.\n","authors":["Nicolás Soncini","Javier Civera","Taihú Pire"],"pdf_url":"https://arxiv.org/pdf/2408.15761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08981v2","updated":"2024-08-30T13:06:28Z","published":"2024-04-13T12:09:37Z","title":"Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active\n Image Classification","summary":" Deep active learning (AL) seeks to minimize the annotation costs for training\ndeep neural networks. BAIT, a recently proposed AL strategy based on the Fisher\nInformation, has demonstrated impressive performance across various datasets.\nHowever, BAIT's high computational and memory requirements hinder its\napplicability on large-scale classification tasks, resulting in current\nresearch neglecting BAIT in their evaluation. This paper introduces two methods\nto enhance BAIT's computational efficiency and scalability. Notably, we\nsignificantly reduce its time complexity by approximating the Fisher\nInformation. In particular, we adapt the original formulation by i) taking the\nexpectation over the most probable classes, and ii) constructing a binary\nclassification task, leading to an alternative likelihood for gradient\ncomputations. Consequently, this allows the efficient use of BAIT on\nlarge-scale datasets, including ImageNet. 
Our unified and comprehensive\nevaluation across a variety of datasets demonstrates that our approximations\nachieve strong performance with considerably reduced time complexity.\nFurthermore, we provide an extensive open-source toolbox that implements recent\nstate-of-the-art AL strategies, available at\nhttps://github.com/dhuseljic/dal-toolbox.\n","authors":["Denis Huseljic","Paul Hahn","Marek Herde","Lukas Rauch","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2404.08981v2.pdf","comment":"Accepted at ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2408.17253v1","updated":"2024-08-30T12:51:55Z","published":"2024-08-30T12:51:55Z","title":"VisionTS: Visual Masked Autoencoders Are Free-Lunch Zero-Shot Time\n Series Forecasters","summary":" Foundation models have emerged as a promising approach in time series\nforecasting (TSF). Existing approaches either fine-tune large language models\n(LLMs) or build large-scale time-series datasets to develop TSF foundation\nmodels. However, these methods face challenges due to the severe cross-domain\ngap or in-domain heterogeneity. In this paper, we explore a new road to\nbuilding a TSF foundation model from rich and high-quality natural images,\nbased on the intrinsic similarities between images and time series. To bridge\nthe gap between the two domains, we reformulate the TSF task as an image\nreconstruction task, which is further processed by a visual masked autoencoder\n(MAE) self-supervised pre-trained on the ImageNet dataset. Surprisingly,\nwithout further adaptation in the time-series domain, the proposed VisionTS\ncould achieve superior zero-shot forecasting performance compared to existing\nTSF foundation models. With minimal fine-tuning, VisionTS could further improve\nthe forecasting and achieve state-of-the-art performance in most cases. These\nfindings suggest that visual models could be a free lunch for TSF and highlight\nthe potential for future cross-domain research between computer vision and TSF.\nOur code is publicly available at https://github.com/Keytoyze/VisionTS.\n","authors":["Mouxiang Chen","Lefei Shen","Zhuo Li","Xiaoyun Joy Wang","Jianling Sun","Chenghao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.17253v1.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.17251v1","updated":"2024-08-30T12:50:15Z","published":"2024-08-30T12:50:15Z","title":"Abstracted Gaussian Prototypes for One-Shot Concept Learning","summary":" We introduce a cluster-based generative image segmentation framework to\nencode higher-level representations of visual concepts based on one-shot\nlearning inspired by the Omniglot Challenge. The inferred parameters of each\ncomponent of a Gaussian Mixture Model (GMM) represent a distinct topological\nsubpart of a visual concept. Sampling new data from these parameters generates\naugmented subparts to build a more robust prototype for each concept, i.e., the\nAbstracted Gaussian Prototype (AGP). This framework addresses one-shot\nclassification tasks using a cognitively-inspired similarity metric and\naddresses one-shot generative tasks through a novel AGP-VAE pipeline employing\nvariational autoencoders (VAEs) to generate new class variants. Results from\nhuman judges reveal that the generative pipeline produces novel examples and\nclasses of visual concepts that are broadly indistinguishable from those made\nby humans. 
The proposed framework leads to impressive but not state-of-the-art\nclassification accuracy; thus, the contribution is two-fold: 1) the system is\nuniquely low in theoretical and computational complexity and operates in a\ncompletely standalone manner compared while existing approaches draw heavily on\npre-training or knowledge engineering; and 2) in contrast with competing neural\nnetwork models, the AGP approach addresses the importance of breadth of task\ncapability emphasized in the Omniglot challenge (i.e., successful performance\non generative tasks). These two points are critical as we advance toward an\nunderstanding of how learning/reasoning systems can produce viable, robust, and\nflexible concepts based on literally nothing more than a single example.\n","authors":["Chelsea Zou","Kenneth J. Kurtz"],"pdf_url":"https://arxiv.org/pdf/2408.17251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02252v2","updated":"2024-08-30T12:44:44Z","published":"2024-07-02T13:17:49Z","title":"GlyphDraw2: Automatic Generation of Complex Glyph Posters with Diffusion\n Models and Large Language Models","summary":" Posters play a crucial role in marketing and advertising by enhancing visual\ncommunication and brand visibility, making significant contributions to\nindustrial design. With the latest advancements in controllable T2I diffusion\nmodels, increasing research has focused on rendering text within synthesized\nimages. Despite improvements in text rendering accuracy, the field of automatic\nposter generation remains underexplored. In this paper, we propose an automatic\nposter generation framework with text rendering capabilities leveraging LLMs,\nutilizing a triple-cross attention mechanism based on alignment learning. This\nframework aims to create precise poster text within a detailed contextual\nbackground. Additionally, the framework supports controllable fonts, adjustable\nimage resolution, and the rendering of posters with descriptions and text in\nboth English and Chinese.Furthermore, we introduce a high-resolution font\ndataset and a poster dataset with resolutions exceeding 1024 pixels. Our\napproach leverages the SDXL architecture. Extensive experiments validate our\nmethod's capability in generating poster images with complex and contextually\nrich backgrounds.Codes is available at\nhttps://github.com/OPPO-Mente-Lab/GlyphDraw2.\n","authors":["Jian Ma","Yonglin Deng","Chen Chen","Haonan Lu","Zhenyu Yang"],"pdf_url":"https://arxiv.org/pdf/2407.02252v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09353v2","updated":"2024-08-30T12:41:06Z","published":"2024-05-15T14:03:38Z","title":"Large coordinate kernel attention network for lightweight image\n super-resolution","summary":" The multi-scale receptive field and large kernel attention (LKA) module have\nbeen shown to significantly improve performance in the lightweight image\nsuper-resolution task. However, existing lightweight super-resolution (SR)\nmethods seldom pay attention to designing efficient building block with\nmulti-scale receptive field for local modeling, and their LKA modules face a\nquadratic increase in computational and memory footprints as the convolutional\nkernel size increases. To address the first issue, we propose the multi-scale\nblueprint separable convolutions (MBSConv) as highly efficient building block\nwith multi-scale receptive field, it can focus on the learning for the\nmulti-scale information which is a vital component of discriminative\nrepresentation. 
As for the second issue, we revisit the key properties of LKA\nin which we find that the adjacent direct interaction of local information and\nlong-distance dependencies is crucial to provide remarkable performance. Thus,\ntaking this into account and in order to mitigate the complexity of LKA, we\npropose a large coordinate kernel attention (LCKA) module which decomposes the\n2D convolutional kernels of the depth-wise convolutional layers in LKA into\nhorizontal and vertical 1-D kernels. LCKA enables the adjacent direct\ninteraction of local information and long-distance dependencies not only in the\nhorizontal direction but also in the vertical. Besides, LCKA allows for the\ndirect use of extremely large kernels in the depth-wise convolutional layers to\ncapture more contextual information, which helps to significantly improve the\nreconstruction performance, and it incurs lower computational complexity and\nmemory footprints. Integrating MBSConv and LCKA, we propose a large coordinate\nkernel attention network (LCAN).\n","authors":["Fangwei Hao","Jiesheng Wu","Haotian Lu","Ji Du","Jing Xu","Xiaoxuan Xu"],"pdf_url":"https://arxiv.org/pdf/2405.09353v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2310.19258v3","updated":"2024-08-30T12:31:40Z","published":"2023-10-30T04:04:02Z","title":"Improving Online Source-free Domain Adaptation for Object Detection by\n Unsupervised Data Acquisition","summary":" Effective object detection in autonomous vehicles is challenged by deployment\nin diverse and unfamiliar environments. Online Source-Free Domain Adaptation\n(O-SFDA) offers model adaptation using a stream of unlabeled data from a target\ndomain in an online manner. However, not all captured frames contain\ninformation beneficial for adaptation, especially in the presence of redundant\ndata and class imbalance issues. This paper introduces a novel approach to\nenhance O-SFDA for adaptive object detection through unsupervised data\nacquisition. Our methodology prioritizes the most informative unlabeled frames\nfor inclusion in the online training process. Empirical evaluation on a\nreal-world dataset reveals that our method outperforms existing\nstate-of-the-art O-SFDA techniques, demonstrating the viability of unsupervised\ndata acquisition for improving the adaptive object detector.\n","authors":["Xiangyu Shi","Yanyuan Qiao","Qi Wu","Lingqiao Liu","Feras Dayoub"],"pdf_url":"https://arxiv.org/pdf/2310.19258v3.pdf","comment":"Accepted by ECCV workshop ROAM 2024; 12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.17237v1","updated":"2024-08-30T12:27:22Z","published":"2024-08-30T12:27:22Z","title":"A nonlinear elasticity model in computer vision","summary":" The purpose of this paper is to analyze a nonlinear elasticity model\npreviously introduced by the authors for comparing two images, regarded as\nbounded open subsets of $\\R^n$ together with associated vector-valued intensity\nmaps. Optimal transformations between the images are sought as minimisers of an\nintegral functional among orientation-preserving homeomorphisms. The existence\nof minimisers is proved under natural coercivity and polyconvexity conditions,\nassuming only that the intensity functions are bounded measurable. 
Variants of\nthe existence theorem are also proved, first under the constraint that finite\nsets of landmark points in the two images are mapped one to the other, and\nsecond when one image is to be compared to an unknown part of another.\n The question is studied as to whether for images related by a linear mapping\nthe unique minimizer is given by that linear mapping. For a natural class of\nfunctional integrands an example is given guaranteeing that this property holds\nfor pairs of images in which the second is a scaling of the first by a constant\nfactor. However for the property to hold for arbitrary pairs of linearly\nrelated images it is shown that the integrand has to depend on the gradient of\nthe transformation as a convex function of its determinant alone. This suggests\na new model in which the integrand depends also on second derivatives of the\ntransformation, and an example is given for which both existence of minimizers\nis assured and the above property holds for all pairs of linearly related\nimages.\n","authors":["John M. Ball","Christopher L. Horner"],"pdf_url":"https://arxiv.org/pdf/2408.17237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17231v1","updated":"2024-08-30T12:17:49Z","published":"2024-08-30T12:17:49Z","title":"CondSeg: Ellipse Estimation of Pupil and Iris via Conditioned\n Segmentation","summary":" Parsing of eye components (i.e. pupil, iris and sclera) is fundamental for\neye tracking and gaze estimation for AR/VR products. Mainstream approaches\ntackle this problem as a multi-class segmentation task, providing only visible\npart of pupil/iris, other methods regress elliptical parameters using\nhuman-annotated full pupil/iris parameters. In this paper, we consider two\npriors: projected full pupil/iris circle can be modelled with ellipses (ellipse\nprior), and the visibility of pupil/iris is controlled by openness of\neye-region (condition prior), and design a novel method CondSeg to estimate\nelliptical parameters of pupil/iris directly from segmentation labels, without\nexplicitly annotating full ellipses, and use eye-region mask to control the\nvisibility of estimated pupil/iris ellipses. Conditioned segmentation loss is\nused to optimize the parameters by transforming parameterized ellipses into\npixel-wise soft masks in a differentiable way. Our method is tested on public\ndatasets (OpenEDS-2019/-2020) and shows competitive results on segmentation\nmetrics, and provides accurate elliptical parameters for further applications\nof eye tracking simultaneously.\n","authors":["Zhuang Jia","Jiangfan Deng","Liying Chi","Xiang Long","Daniel K. Du"],"pdf_url":"https://arxiv.org/pdf/2408.17231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17223v1","updated":"2024-08-30T12:01:59Z","published":"2024-08-30T12:01:59Z","title":"OG-Mapping: Octree-based Structured 3D Gaussians for Online Dense\n Mapping","summary":" 3D Gaussian splatting (3DGS) has recently demonstrated promising advancements\nin RGB-D online dense mapping. Nevertheless, existing methods excessively rely\non per-pixel depth cues to perform map densification, which leads to\nsignificant redundancy and increased sensitivity to depth noise. Additionally,\nexplicitly storing 3D Gaussian parameters of room-scale scene poses a\nsignificant storage challenge. 
In this paper, we introduce OG-Mapping, which\nleverages the robust scene structural representation capability of sparse\noctrees, combined with structured 3D Gaussian representations, to achieve\nefficient and robust online dense mapping. Moreover, OG-Mapping employs an\nanchor-based progressive map refinement strategy to recover the scene\nstructures at multiple levels of detail. Instead of maintaining a small number\nof active keyframes with a fixed keyframe window as previous approaches do, a\ndynamic keyframe window is employed to allow OG-Mapping to better tackle false\nlocal minima and forgetting issues. Experimental results demonstrate that\nOG-Mapping delivers more robust and superior realism mapping results than\nexisting Gaussian-based RGB-D online mapping methods with a compact model, and\nno additional post-processing is required.\n","authors":["Meng Wang","Junyi Wang","Changqun Xia","Chen Wang","Yue Qi"],"pdf_url":"https://arxiv.org/pdf/2408.17223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17222v1","updated":"2024-08-30T12:01:06Z","published":"2024-08-30T12:01:06Z","title":"How Could Generative AI Support Compliance with the EU AI Act? A Review\n for Safe Automated Driving Perception","summary":" Deep Neural Networks (DNNs) have become central for the perception functions\nof autonomous vehicles, substantially enhancing their ability to understand and\ninterpret the environment. However, these systems exhibit inherent limitations\nsuch as brittleness, opacity, and unpredictable behavior in out-of-distribution\nscenarios. The European Union (EU) Artificial Intelligence (AI) Act, as a\npioneering legislative framework, aims to address these challenges by\nestablishing stringent norms and standards for AI systems, including those used\nin autonomous driving (AD), which are categorized as high-risk AI. In this\nwork, we explore how the newly available generative AI models can potentially\nsupport addressing upcoming regulatory requirements in AD perception,\nparticularly with respect to safety. This short review paper summarizes the\nrequirements arising from the EU AI Act regarding DNN-based perception systems\nand systematically categorizes existing generative AI applications in AD. While\ngenerative AI models show promise in addressing some of the EU AI Acts\nrequirements, such as transparency and robustness, this review examines their\npotential benefits and discusses how developers could leverage these methods to\nenhance compliance with the Act. The paper also highlights areas where further\nresearch is needed to ensure reliable and safe integration of these\ntechnologies.\n","authors":["Mert Keser","Youssef Shoeb","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2408.17222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13126v2","updated":"2024-08-30T11:45:09Z","published":"2024-08-23T14:54:49Z","title":"CathAction: A Benchmark for Endovascular Intervention Understanding","summary":" Real-time visual feedback from catheterization analysis is crucial for\nenhancing surgical safety and efficiency during endovascular interventions.\nHowever, existing datasets are often limited to specific tasks, small scale,\nand lack the comprehensive annotations necessary for broader endovascular\nintervention understanding. To tackle these limitations, we introduce\nCathAction, a large-scale dataset for catheterization understanding. 
Our\nCathAction dataset encompasses approximately 500,000 annotated frames for\ncatheterization action understanding and collision detection, and 25,000 ground\ntruth masks for catheter and guidewire segmentation. For each task, we\nbenchmark recent related works in the field. We further discuss the challenges\nof endovascular intentions compared to traditional computer vision tasks and\npoint out open research questions. We hope that CathAction will facilitate the\ndevelopment of endovascular intervention understanding methods that can be\napplied to real-world applications. The dataset is available at\nhttps://airvlab.github.io/cathaction/.\n","authors":["Baoru Huang","Tuan Vo","Chayun Kongtongvattana","Giulio Dagnino","Dennis Kundrat","Wenqiang Chi","Mohamed Abdelaziz","Trevor Kwok","Tudor Jianu","Tuong Do","Hieu Le","Minh Nguyen","Hoan Nguyen","Erman Tjiputra","Quang Tran","Jianyang Xie","Yanda Meng","Binod Bhattarai","Zhaorui Tan","Hongbin Liu","Hong Seng Gan","Wei Wang","Xi Yang","Qiufeng Wang","Jionglong Su","Kaizhu Huang","Angelos Stefanidis","Min Guo","Bo Du","Rong Tao","Minh Vu","Guoyan Zheng","Yalin Zheng","Francisco Vasconcelos","Danail Stoyanov","Daniel Elson","Ferdinando Rodriguez y Baena","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.13126v2.pdf","comment":"10 pages. Webpage: https://airvlab.github.io/cathaction/"},{"id":"http://arxiv.org/abs/2408.17207v1","updated":"2024-08-30T11:22:09Z","published":"2024-08-30T11:22:09Z","title":"NanoMVG: USV-Centric Low-Power Multi-Task Visual Grounding based on\n Prompt-Guided Camera and 4D mmWave Radar","summary":" Recently, visual grounding and multi-sensors setting have been incorporated\ninto perception system for terrestrial autonomous driving systems and Unmanned\nSurface Vehicles (USVs), yet the high complexity of modern learning-based\nvisual grounding model using multi-sensors prevents such model to be deployed\non USVs in the real-life. To this end, we design a low-power multi-task model\nnamed NanoMVG for waterway embodied perception, guiding both camera and 4D\nmillimeter-wave radar to locate specific object(s) through natural language.\nNanoMVG can perform both box-level and mask-level visual grounding tasks\nsimultaneously. Compared to other visual grounding models, NanoMVG achieves\nhighly competitive performance on the WaterVG dataset, particularly in harsh\nenvironments and boasts ultra-low power consumption for long endurance.\n","authors":["Runwei Guan","Jianan Liu","Liye Jia","Haocheng Zhao","Shanliang Yao","Xiaohui Zhu","Ka Lok Man","Eng Gee Lim","Jeremy Smith","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2408.17207v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.18197v2","updated":"2024-08-30T11:19:02Z","published":"2024-06-26T09:29:05Z","title":"Human-Free Automated Prompting for Vision-Language Anomaly Detection:\n Prompt Optimization with Meta-guiding Prompt Scheme","summary":" Pre-trained vision-language models (VLMs) are highly adaptable to various\ndownstream tasks through few-shot learning, making prompt-based anomaly\ndetection a promising approach. Traditional methods depend on human-crafted\nprompts that require prior knowledge of specific anomaly types. Our goal is to\ndevelop a human-free prompt-based anomaly detection framework that optimally\nlearns prompts through data-driven methods, eliminating the need for human\nintervention. The primary challenge in this approach is the lack of anomalous\nsamples during the training phase. 
Additionally, the Vision Transformer\n(ViT)-based image encoder in VLMs is not ideal for pixel-wise anomaly\nsegmentation due to a locality feature mismatch between the original image and\nthe output feature map. To tackle the first challenge, we have developed the\nObject-Attention Anomaly Generation Module (OAGM) to synthesize anomaly samples\nfor training. Furthermore, our Meta-Guiding Prompt-Tuning Scheme (MPTS)\niteratively adjusts the gradient-based optimization direction of learnable\nprompts to avoid overfitting to the synthesized anomalies. For the second\nchallenge, we propose Locality-Aware Attention, which ensures that each local\npatch feature attends only to nearby patch features, preserving the locality\nfeatures corresponding to their original locations. This framework allows for\nthe optimal prompt embeddings by searching in the continuous latent space via\nbackpropagation, free from human semantic constraints. Additionally, the\nmodified locality-aware attention improves the precision of pixel-wise anomaly\nsegmentation.\n","authors":["Pi-Wei Chen","Jerry Chun-Wei Lin","Jia Ji","Feng-Hao Yeh","Chao-Chun Chen"],"pdf_url":"https://arxiv.org/pdf/2406.18197v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17197v1","updated":"2024-08-30T10:49:33Z","published":"2024-08-30T10:49:33Z","title":"Covariance-corrected Whitening Alleviates Network Degeneration on\n Imbalanced Classification","summary":" Class imbalance is a critical issue in image classification that\nsignificantly affects the performance of deep recognition models. In this work,\nwe first identify a network degeneration dilemma that hinders the model\nlearning by introducing a high linear dependence among the features inputted\ninto the classifier. To overcome this challenge, we propose a novel framework\ncalled Whitening-Net to mitigate the degenerate solutions, in which ZCA\nwhitening is integrated before the linear classifier to normalize and\ndecorrelate the batch samples. However, in scenarios with extreme class\nimbalance, the batch covariance statistic exhibits significant fluctuations,\nimpeding the convergence of the whitening operation. Therefore, we propose two\ncovariance-corrected modules, the Group-based Relatively Balanced Batch Sampler\n(GRBS) and the Batch Embedded Training (BET), to get more accurate and stable\nbatch covariance, thereby reinforcing the capability of whitening. Our modules\ncan be trained end-to-end without incurring substantial computational costs.\nComprehensive empirical evaluations conducted on benchmark datasets, including\nCIFAR-LT-10/100, ImageNet-LT, and iNaturalist-LT, validate the effectiveness of\nour proposed approaches.\n","authors":["Zhiwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.17197v1.pdf","comment":"20 pages, 10 figures, 10 tables. arXiv admin note: text overlap with\n arXiv:2112.05958"},{"id":"http://arxiv.org/abs/2408.17182v1","updated":"2024-08-30T10:31:39Z","published":"2024-08-30T10:31:39Z","title":"Hybrid Classification-Regression Adaptive Loss for Dense Object\n Detection","summary":" For object detection detectors, enhancing model performance hinges on the\nability to simultaneously consider inconsistencies across tasks and focus on\ndifficult-to-train samples. Achieving this necessitates incorporating\ninformation from both the classification and regression tasks. 
However, prior\nwork tends to either emphasize difficult-to-train samples within their\nrespective tasks or simply compute classification scores with IoU, often\nleading to suboptimal model performance. In this paper, we propose a Hybrid\nClassification-Regression Adaptive Loss, termed as HCRAL. Specifically, we\nintroduce the Residual of Classification and IoU (RCI) module for cross-task\nsupervision, addressing task inconsistencies, and the Conditioning Factor (CF)\nto focus on difficult-to-train samples within each task. Furthermore, we\nintroduce a new strategy named Expanded Adaptive Training Sample Selection\n(EATSS) to provide additional samples that exhibit classification and\nregression inconsistencies. To validate the effectiveness of the proposed\nmethod, we conduct extensive experiments on COCO test-dev. Experimental\nevaluations demonstrate the superiority of our approachs. Additionally, we\ndesigned experiments by separately combining the classification and regression\nloss with regular loss functions in popular one-stage models, demonstrating\nimproved performance.\n","authors":["Yanquan Huang","Liu Wei Zhen","Yun Hao","Mengyuan Zhang","Qingyao Wu","Zikun Deng","Xueming Liu","Hong Deng"],"pdf_url":"https://arxiv.org/pdf/2408.17182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17168v1","updated":"2024-08-30T10:12:13Z","published":"2024-08-30T10:12:13Z","title":"EMHI: A Multimodal Egocentric Human Motion Dataset with HMD and\n Body-Worn IMUs","summary":" Egocentric human pose estimation (HPE) using wearable sensors is essential\nfor VR/AR applications. Most methods rely solely on either egocentric-view\nimages or sparse Inertial Measurement Unit (IMU) signals, leading to\ninaccuracies due to self-occlusion in images or the sparseness and drift of\ninertial sensors. Most importantly, the lack of real-world datasets containing\nboth modalities is a major obstacle to progress in this field. To overcome the\nbarrier, we propose EMHI, a multimodal \\textbf{E}gocentric human\n\\textbf{M}otion dataset with \\textbf{H}ead-Mounted Display (HMD) and body-worn\n\\textbf{I}MUs, with all data collected under the real VR product suite.\nSpecifically, EMHI provides synchronized stereo images from downward-sloping\ncameras on the headset and IMU data from body-worn sensors, along with pose\nannotations in SMPL format. This dataset consists of 885 sequences captured by\n58 subjects performing 39 actions, totaling about 28.5 hours of recording. We\nevaluate the annotations by comparing them with optical marker-based SMPL\nfitting results. To substantiate the reliability of our dataset, we introduce\nMEPoser, a new baseline method for multimodal egocentric HPE, which employs a\nmultimodal fusion encoder, temporal feature encoder, and MLP-based regression\nheads. The experiments on EMHI show that MEPoser outperforms existing\nsingle-modal methods and demonstrates the value of our dataset in solving the\nproblem of egocentric HPE. 
We believe the release of EMHI and the method could\nadvance the research of egocentric HPE and expedite the practical\nimplementation of this technology in VR/AR products.\n","authors":["Zhen Fan","Peng Dai","Zhuo Su","Xu Gao","Zheng Lv","Jiarui Zhang","Tianyuan Du","Guidong Wang","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.17168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17154v1","updated":"2024-08-30T09:48:47Z","published":"2024-08-30T09:48:47Z","title":"Self-supervised Anomaly Detection Pretraining Enhances Long-tail ECG\n Diagnosis","summary":" Current computer-aided ECG diagnostic systems struggle with the\nunderdetection of rare but critical cardiac anomalies due to the imbalanced\nnature of ECG datasets. This study introduces a novel approach using\nself-supervised anomaly detection pretraining to address this limitation. The\nanomaly detection model is specifically designed to detect and localize subtle\ndeviations from normal cardiac patterns, capturing the nuanced details\nessential for accurate ECG interpretation. Validated on an extensive dataset of\nover one million ECG records from clinical practice, characterized by a\nlong-tail distribution across 116 distinct categories, the anomaly\ndetection-pretrained ECG diagnostic model has demonstrated a significant\nimprovement in overall accuracy. Notably, our approach yielded a 94.7% AUROC,\n92.2% sensitivity, and 92.5\\% specificity for rare ECG types, significantly\noutperforming traditional methods and narrowing the performance gap with common\nECG types. The integration of anomaly detection pretraining into ECG analysis\nrepresents a substantial contribution to the field, addressing the\nlong-standing challenge of long-tail data distributions in clinical\ndiagnostics. Furthermore, prospective validation in real-world clinical\nsettings revealed that our AI-driven approach enhances diagnostic efficiency,\nprecision, and completeness by 32%, 6.7%, and 11.8% respectively, when compared\nto standard practices. This advancement marks a pivotal step forward in the\nintegration of AI within clinical cardiology, with particularly profound\nimplications for emergency care, where rapid and accurate ECG interpretation is\ncrucial. The contributions of this study not only push the boundaries of\ncurrent ECG diagnostic capabilities but also lay the groundwork for more\nreliable and accessible cardiovascular care.\n","authors":["Aofan Jiang","Chaoqin Huang","Qing Cao","Yuchen Xu","Zi Zeng","Kang Chen","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.17154v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.04935"},{"id":"http://arxiv.org/abs/2408.17150v1","updated":"2024-08-30T09:40:10Z","published":"2024-08-30T09:40:10Z","title":"Look, Compare, Decide: Alleviating Hallucination in Large\n Vision-Language Models via Multi-View Multi-Path Reasoning","summary":" Recently, Large Vision-Language Models (LVLMs) have demonstrated impressive\ncapabilities in multi-modal context comprehension. However, they still suffer\nfrom hallucination problems referring to generating inconsistent outputs with\nthe image content. To mitigate hallucinations, previous studies mainly focus on\nretraining LVLMs with custom datasets. Although effective, they inherently come\nwith additional computational costs. 
In this paper, we propose a training-free\nframework, \\textbf{MVP}, that aims to reduce hallucinations by making the most\nof the innate capabilities of the LVLMs via \\textbf{M}ulti-\\textbf{V}iew\nMulti-\\textbf{P}ath Reasoning. Specifically, we first devise a multi-view\ninformation-seeking strategy to thoroughly perceive the comprehensive\ninformation in the image, which enriches the general global information\ncaptured by the original vision encoder in LVLMs. Furthermore, during the\nanswer decoding, we observe that the occurrence of hallucinations has a strong\ncorrelation with the certainty of the answer tokens. Thus, we propose\nmulti-path reasoning for each information view to quantify and aggregate the\ncertainty scores for each potential answer among multiple decoding paths and\nfinally decide the output answer. By fully grasping the information in the\nimage and carefully considering the certainty of the potential answers when\ndecoding, our MVP can effectively reduce hallucinations in LVLMs.The extensive\nexperiments verify that our proposed MVP significantly mitigates the\nhallucination problem across four well-known LVLMs. The source code is\navailable at: \\url{https://github.com/GasolSun36/MVP}.\n","authors":["Xiaoye Qu","Jiashuo Sun","Wei Wei","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.17150v1.pdf","comment":"13 pages, 7 tables, 7 figures"},{"id":"http://arxiv.org/abs/2408.17149v1","updated":"2024-08-30T09:39:59Z","published":"2024-08-30T09:39:59Z","title":"GMM-IKRS: Gaussian Mixture Models for Interpretable Keypoint Refinement\n and Scoring","summary":" The extraction of keypoints in images is at the basis of many computer vision\napplications, from localization to 3D reconstruction. Keypoints come with a\nscore permitting to rank them according to their quality. While learned\nkeypoints often exhibit better properties than handcrafted ones, their scores\nare not easily interpretable, making it virtually impossible to compare the\nquality of individual keypoints across methods. We propose a framework that can\nrefine, and at the same time characterize with an interpretable score, the\nkeypoints extracted by any method. Our approach leverages a modified robust\nGaussian Mixture Model fit designed to both reject non-robust keypoints and\nrefine the remaining ones. Our score comprises two components: one relates to\nthe probability of extracting the same keypoint in an image captured from\nanother viewpoint, the other relates to the localization accuracy of the\nkeypoint. These two interpretable components permit a comparison of individual\nkeypoints extracted across different methods. Through extensive experiments we\ndemonstrate that, when applied to popular keypoint detectors, our framework\nconsistently improves the repeatability of keypoints as well as their\nperformance in homography and two/multiple-view pose recovery tasks.\n","authors":["Emanuele Santellani","Martin Zach","Christian Sormann","Mattia Rossi","Andreas Kuhn","Friedrich Fraundorfer"],"pdf_url":"https://arxiv.org/pdf/2408.17149v1.pdf","comment":"Accepted at ECCV 2024"},{"id":"http://arxiv.org/abs/2408.17143v1","updated":"2024-08-30T09:34:36Z","published":"2024-08-30T09:34:36Z","title":"RenDetNet: Weakly-supervised Shadow Detection with Shadow Caster\n Verification","summary":" Existing shadow detection models struggle to differentiate dark image areas\nfrom shadows. In this paper, we tackle this issue by verifying that all\ndetected shadows are real, i.e. they have paired shadow casters. 
We perform\nthis step in a physically-accurate manner by differentiably re-rendering the\nscene and observing the changes stemming from carving out estimated shadow\ncasters. Thanks to this approach, the RenDetNet proposed in this paper is the\nfirst learning-based shadow detection model whose supervisory signals can be\ncomputed in a self-supervised manner. The developed system compares favourably\nagainst recent models trained on our data. As part of this publication, we\nrelease our code on github.\n","authors":["Nikolina Kubiak","Elliot Wortman","Armin Mustafa","Graeme Phillipson","Stephen Jolly","Simon Hadfield"],"pdf_url":"https://arxiv.org/pdf/2408.17143v1.pdf","comment":"AIM @ ECCV 2024 / code available at\n https://github.com/n-kubiak/RenDetNet"},{"id":"http://arxiv.org/abs/2408.16005v2","updated":"2024-08-30T09:26:10Z","published":"2024-08-13T20:00:36Z","title":"Many-Worlds Inverse Rendering","summary":" Discontinuous visibility changes remain a major bottleneck when optimizing\nsurfaces within a physically-based inverse renderer. Many previous works have\nproposed sophisticated algorithms and data structures to sample visibility\nsilhouettes more efficiently.\n Our work presents another solution: instead of differentiating a tentative\nsurface locally, we differentiate a volumetric perturbation of a surface. We\nrefer this as a many-worlds representation because it models a non-interacting\nsuperposition of conflicting explanations (worlds) of the input dataset. Each\nworld is optically isolated from others, leading to a new transport law that\ndistinguishes our method from prior work based on exponential random media.\n The resulting Monte Carlo algorithm is simpler and more efficient than prior\nmethods. We demonstrate that our method promotes rapid convergence, both in\nterms of the total iteration count and the cost per iteration.\n","authors":["Ziyi Zhang","Nicolas Roussel","Wenzel Jakob"],"pdf_url":"https://arxiv.org/pdf/2408.16005v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17135v1","updated":"2024-08-30T09:22:07Z","published":"2024-08-30T09:22:07Z","title":"Temporal and Interactive Modeling for Efficient Human-Human Motion\n Generation","summary":" Human-human motion generation is essential for understanding humans as social\nbeings. Although several transformer-based methods have been proposed, they\ntypically model each individual separately and overlook the causal\nrelationships in temporal motion sequences. Furthermore, the attention\nmechanism in transformers exhibits quadratic computational complexity,\nsignificantly reducing their efficiency when processing long sequences. In this\npaper, we introduce TIM (Temporal and Interactive Modeling), an efficient and\neffective approach that presents the pioneering human-human motion generation\nmodel utilizing RWKV. Specifically, we first propose Causal Interactive\nInjection to leverage the temporal properties of motion sequences and avoid\nnon-causal and cumbersome modeling. Then we present Role-Evolving Mixing to\nadjust to the ever-evolving roles throughout the interaction. Finally, to\ngenerate smoother and more rational motion, we design Localized Pattern\nAmplification to capture short-term motion patterns. Extensive experiments on\nInterHuman demonstrate that our method achieves superior performance. Notably,\nTIM has achieved state-of-the-art results using only 32% of InterGen's\ntrainable parameters. Code will be available soon. 
Homepage:\nhttps://aigc-explorer.github.io/TIM-page/\n","authors":["Yabiao Wang","Shuo Wang","Jiangning Zhang","Ke Fan","Jiafu Wu","Zhengkai Jiang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.17135v1.pdf","comment":"Homepage: https://aigc-explorer.github.io/TIM-page/"},{"id":"http://arxiv.org/abs/2408.17131v1","updated":"2024-08-30T09:15:54Z","published":"2024-08-30T09:15:54Z","title":"VQ4DiT: Efficient Post-Training Vector Quantization for Diffusion\n Transformers","summary":" The Diffusion Transformers Models (DiTs) have transitioned the network\narchitecture from traditional UNets to transformers, demonstrating exceptional\ncapabilities in image generation. Although DiTs have been widely applied to\nhigh-definition video generation tasks, their large parameter size hinders\ninference on edge devices. Vector quantization (VQ) can decompose model weight\ninto a codebook and assignments, allowing extreme weight quantization and\nsignificantly reducing memory usage. In this paper, we propose VQ4DiT, a fast\npost-training vector quantization method for DiTs. We found that traditional VQ\nmethods calibrate only the codebook without calibrating the assignments. This\nleads to weight sub-vectors being incorrectly assigned to the same assignment,\nproviding inconsistent gradients to the codebook and resulting in a suboptimal\nresult. To address this challenge, VQ4DiT calculates the candidate assignment\nset for each weight sub-vector based on Euclidean distance and reconstructs the\nsub-vector based on the weighted average. Then, using the zero-data and\nblock-wise calibration method, the optimal assignment from the set is\nefficiently selected while calibrating the codebook. VQ4DiT quantizes a DiT\nXL/2 model on a single NVIDIA A100 GPU within 20 minutes to 5 hours depending\non the different quantization settings. Experiments show that VQ4DiT\nestablishes a new state-of-the-art in model size and performance trade-offs,\nquantizing weights to 2-bit precision while retaining acceptable image\ngeneration quality.\n","authors":["Juncan Deng","Shuaiting Li","Zeyu Wang","Hong Gu","Kedong Xu","Kejie Huang"],"pdf_url":"https://arxiv.org/pdf/2408.17131v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.19341v3","updated":"2024-08-30T09:07:26Z","published":"2024-02-29T16:47:54Z","title":"RoadRunner -- Learning Traversability Estimation for Autonomous Off-road\n Driving","summary":" Autonomous navigation at high speeds in off-road environments necessitates\nrobots to comprehensively understand their surroundings using onboard sensing\nonly. The extreme conditions posed by the off-road setting can cause degraded\ncamera image quality due to poor lighting and motion blur, as well as limited\nsparse geometric information available from LiDAR sensing when driving at high\nspeeds. In this work, we present RoadRunner, a novel framework capable of\npredicting terrain traversability and an elevation map directly from camera and\nLiDAR sensor inputs. RoadRunner enables reliable autonomous navigation, by\nfusing sensory information, handling of uncertainty, and generation of\ncontextually informed predictions about the geometry and traversability of the\nterrain while operating at low latency. In contrast to existing methods relying\non classifying handcrafted semantic classes and using heuristics to predict\ntraversability costs, our method is trained end-to-end in a self-supervised\nfashion. 
The RoadRunner network architecture builds upon popular sensor fusion\nnetwork architectures from the autonomous driving domain, which embed LiDAR and\ncamera information into a common Bird's Eye View perspective. Training is\nenabled by utilizing an existing traversability estimation stack to generate\ntraining data in hindsight in a scalable manner from real-world off-road\ndriving datasets. Furthermore, RoadRunner improves the system latency by a\nfactor of roughly 4, from 500 ms to 140 ms, while improving the accuracy for\ntraversability costs and elevation map predictions. We demonstrate the\neffectiveness of RoadRunner in enabling safe and reliable off-road navigation\nat high speeds in multiple real-world driving scenarios through unstructured\ndesert environments.\n","authors":["Jonas Frey","Manthan Patel","Deegan Atha","Julian Nubert","David Fan","Ali Agha","Curtis Padgett","Patrick Spieler","Marco Hutter","Shehryar Khattak"],"pdf_url":"https://arxiv.org/pdf/2402.19341v3.pdf","comment":"accepted for IEEE Transactions on Field Robotics (T-FR)"},{"id":"http://arxiv.org/abs/2408.13123v3","updated":"2024-08-30T09:06:06Z","published":"2024-08-23T14:50:49Z","title":"Evidential Deep Partial Multi-View Classification With Discount Fusion","summary":" Incomplete multi-view data classification poses significant challenges due to\nthe common issue of missing views in real-world scenarios. Despite\nadvancements, existing methods often fail to provide reliable predictions,\nlargely due to the uncertainty of missing views and the inconsistent quality of\nimputed data. To tackle these problems, we propose a novel framework called\nEvidential Deep Partial Multi-View Classification (EDP-MVC). Initially, we use\nK-means imputation to address missing views, creating a complete set of\nmulti-view data. However, the potential conflicts and uncertainties within this\nimputed data can affect the reliability of downstream inferences. To manage\nthis, we introduce a Conflict-Aware Evidential Fusion Network (CAEFN), which\ndynamically adjusts based on the reliability of the evidence, ensuring\ntrustworthy discount fusion and producing reliable inference outcomes.\nComprehensive experiments on various benchmark datasets reveal EDP-MVC not only\nmatches but often surpasses the performance of state-of-the-art methods.\n","authors":["Haojian Huang","Zhe Liu","Sukumar Letchmunan","Muhammet Deveci","Mingwei Lin","Weizhong Wang"],"pdf_url":"https://arxiv.org/pdf/2408.13123v3.pdf","comment":"Ongoing work. 13 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.12966v3","updated":"2024-08-30T09:00:38Z","published":"2024-04-19T15:53:27Z","title":"Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of\n Multi-modal Large Language Models","summary":" Counterfactual reasoning, as a crucial manifestation of human intelligence,\nrefers to making presuppositions based on established facts and extrapolating\npotential outcomes. Existing multimodal large language models (MLLMs) have\nexhibited impressive cognitive and reasoning capabilities, which have been\nexamined across a wide range of Visual Question Answering (VQA) benchmarks.\nNevertheless, how will existing MLLMs perform when faced with counterfactual\nquestions? To answer this question, we first curate a novel\n\\textbf{C}ounter\\textbf{F}actual \\textbf{M}ulti\\textbf{M}odal reasoning\nbenchmark, abbreviated as \\textbf{CFMM}, to systematically assess the\ncounterfactual reasoning capabilities of MLLMs. 
Our CFMM comprises six\nchallenging tasks, each including hundreds of carefully human-labeled and\nGPT-generated counterfactual questions, to evaluate MLLM's counterfactual\nreasoning capabilities across diverse aspects. Through experiments,\ninterestingly, we find that existing MLLMs prefer to believe what they see, but\nignore the counterfactual presuppositions presented in the question, thereby\nleading to inaccurate responses. Furthermore, we evaluate a wide range of\nprevalent MLLMs on our proposed CFMM. The significant gap between their\nperformance on our CFMM and that on several VQA benchmarks indicates that there\nis still considerable room for improvement in existing MLLMs toward approaching\nhuman-level intelligence. On the other hand, through boosting MLLMs\nperformances on our CFMM in the future, potential avenues toward developing\nMLLMs with advanced intelligence can be explored.\n","authors":["Yian Li","Wentao Tian","Yang Jiao","Jingjing Chen"],"pdf_url":"https://arxiv.org/pdf/2404.12966v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17115v1","updated":"2024-08-30T08:57:04Z","published":"2024-08-30T08:57:04Z","title":"Multi-centric AI Model for Unruptured Intracranial Aneurysm Detection\n and Volumetric Segmentation in 3D TOF-MRI","summary":" Purpose: To develop an open-source nnU-Net-based AI model for combined\ndetection and segmentation of unruptured intracranial aneurysms (UICA) in 3D\nTOF-MRI, and compare models trained on datasets with aneurysm-like differential\ndiagnoses. Methods: This retrospective study (2020-2023) included 385\nanonymized 3D TOF-MRI images from 364 patients (mean age 59 years, 60% female)\nat multiple centers plus 113 subjects from the ADAM challenge. Images featured\nuntreated or possible UICAs and differential diagnoses. Four distinct training\ndatasets were created, and the nnU-Net framework was used for model\ndevelopment. Performance was assessed on a separate test set using sensitivity\nand False Positive (FP)/case rate for detection, and DICE score and NSD\n(Normalized Surface Distance) with a 0.5mm threshold for segmentation.\nStatistical analysis included chi-square, Mann-Whitney-U, and Kruskal-Wallis\ntests, with significance set at p < 0.05. Results: Models achieved overall\nsensitivity between 82% and 85% and a FP/case rate of 0.20 to 0.31, with no\nsignificant differences (p = 0.90 and p = 0.16). The primary model showed 85%\nsensitivity and 0.23 FP/case rate, outperforming the ADAM-challenge winner\n(61%) and a nnU-Net trained on ADAM data (51%) in sensitivity (p < 0.05). It\nachieved a mean DICE score of 0.73 and an NSD of 0.84 for correctly detected\nUICA. Conclusions: Our open-source, nnU-Net-based AI model (available at\n10.5281/zenodo.13386859) demonstrates high sensitivity, low false positive\nrates, and consistent segmentation accuracy for UICA detection and segmentation\nin 3D TOF-MRI, suggesting its potential to improve clinical diagnosis and for\nmonitoring of UICA.\n","authors":["Ashraya K. Indrakanti","Jakob Wasserthal","Martin Segeroth","Shan Yang","Victor Schulze-Zachau","Joshy Cyriac","Michael Bach","Marios Psychogios","Matthias A. 
Mutke"],"pdf_url":"https://arxiv.org/pdf/2408.17115v1.pdf","comment":"14 pages, 5 figures, 3 tables, 2 supplementary tables"},{"id":"http://arxiv.org/abs/2408.17108v1","updated":"2024-08-30T08:49:27Z","published":"2024-08-30T08:49:27Z","title":"Sparse Uncertainty-Informed Sampling from Federated Streaming Data","summary":" We present a numerically robust, computationally efficient approach for\nnon-I.I.D. data stream sampling in federated client systems, where resources\nare limited and labeled data for local model adaptation is sparse and\nexpensive. The proposed method identifies relevant stream observations to\noptimize the underlying client model, given a local labeling budget, and\nperforms instantaneous labeling decisions without relying on any memory\nbuffering strategies. Our experiments show enhanced training batch diversity\nand an improved numerical robustness of the proposal compared to existing\nstrategies over large-scale data streams, making our approach an effective and\nconvenient solution in FL environments.\n","authors":["Manuel Röder","Frank-Michael Schleif"],"pdf_url":"https://arxiv.org/pdf/2408.17108v1.pdf","comment":"Preprint, 6 pages, 3 figures, Accepted for ESANN 2024"},{"id":"http://arxiv.org/abs/2407.07605v3","updated":"2024-08-30T08:36:36Z","published":"2024-07-10T12:44:22Z","title":"Early Explorations of Lightweight Models for Wound Segmentation on\n Mobile Devices","summary":" The aging population poses numerous challenges to healthcare, including the\nincrease in chronic wounds in the elderly. The current approach to wound\nassessment by therapists based on photographic documentation is subjective,\nhighlighting the need for computer-aided wound recognition from smartphone\nphotos. This offers objective and convenient therapy monitoring, while being\naccessible to patients from their home at any time. However, despite research\nin mobile image segmentation, there is a lack of focus on mobile wound\nsegmentation. To address this gap, we conduct initial research on three\nlightweight architectures to investigate their suitability for smartphone-based\nwound segmentation. Using public datasets and UNet as a baseline, our results\nare promising, with both ENet and TopFormer, as well as the larger UNeXt\nvariant, showing comparable performance to UNet. Furthermore, we deploy the\nmodels into a smartphone app for visual assessment of live segmentation, where\nresults demonstrate the effectiveness of TopFormer in distinguishing wounds\nfrom wound-coloured objects. While our study highlights the potential of\ntransformer models for mobile wound segmentation, future work should aim to\nfurther improve the mask contours.\n","authors":["Vanessa Borst","Timo Dittus","Konstantin Müller","Samuel Kounev"],"pdf_url":"https://arxiv.org/pdf/2407.07605v3.pdf","comment":"Extended version of our paper that was published in the \"47th German\n Conference on Artificial Intelligence (KI 2024)\""},{"id":"http://arxiv.org/abs/2408.17098v1","updated":"2024-08-30T08:34:51Z","published":"2024-08-30T08:34:51Z","title":"UTrack: Multi-Object Tracking with Uncertain Detections","summary":" The tracking-by-detection paradigm is the mainstream in multi-object\ntracking, associating tracks to the predictions of an object detector. Although\nexhibiting uncertainty through a confidence score, these predictions do not\ncapture the entire variability of the inference process. 
For safety and\nsecurity critical applications like autonomous driving, surveillance, etc.,\nknowing this predictive uncertainty is essential though. Therefore, we\nintroduce, for the first time, a fast way to obtain the empirical predictive\ndistribution during object detection and incorporate that knowledge in\nmulti-object tracking. Our mechanism can easily be integrated into\nstate-of-the-art trackers, enabling them to fully exploit the uncertainty in\nthe detections. Additionally, novel association methods are introduced that\nleverage the proposed mechanism. We demonstrate the effectiveness of our\ncontribution on a variety of benchmarks, such as MOT17, MOT20, DanceTrack, and\nKITTI.\n","authors":["Edgardo Solano-Carrillo","Felix Sattler","Antje Alex","Alexander Klein","Bruno Pereira Costa","Angel Bueno Rodriguez","Jannis Stoppe"],"pdf_url":"https://arxiv.org/pdf/2408.17098v1.pdf","comment":"Accepted for the ECCV 2024 Workshop on Uncertainty Quantification for\n Computer Vision"},{"id":"http://arxiv.org/abs/2408.17095v1","updated":"2024-08-30T08:26:55Z","published":"2024-08-30T08:26:55Z","title":"RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation\n and Retrieval-Guidance","summary":" Diffusion-based models demonstrate impressive generation capabilities.\nHowever, they also have a massive number of parameters, resulting in enormous\nmodel sizes, thus making them unsuitable for deployment on resource-constraint\ndevices. Block-wise generation can be a promising alternative for designing\ncompact-sized (parameter-efficient) deep generative models since the model can\ngenerate one block at a time instead of generating the whole image at once.\nHowever, block-wise generation is also considerably challenging because\nensuring coherence across generated blocks can be non-trivial. To this end, we\ndesign a retrieval-augmented generation (RAG) approach and leverage the\ncorresponding blocks of the images retrieved by the RAG module to condition the\ntraining and generation stages of a block-wise denoising diffusion model. Our\nconditioning schemes ensure coherence across the different blocks during\ntraining and, consequently, during generation. While we showcase our approach\nusing the latent diffusion model (LDM) as the base model, it can be used with\nother variants of denoising diffusion models. We validate the solution of the\ncoherence problem through the proposed approach by reporting substantive\nexperiments to demonstrate our approach's effectiveness in compact model size\nand excellent generation quality.\n","authors":["Avideep Mukherjee","Soumya Banerjee","Vinay P. Namboodiri","Piyush Rai"],"pdf_url":"https://arxiv.org/pdf/2408.17095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17090v1","updated":"2024-08-30T08:22:30Z","published":"2024-08-30T08:22:30Z","title":"FissionVAE: Federated Non-IID Image Generation with Latent Space and\n Decoder Decomposition","summary":" Federated learning is a machine learning paradigm that enables decentralized\nclients to collaboratively learn a shared model while keeping all the training\ndata local. While considerable research has focused on federated image\ngeneration, particularly Generative Adversarial Networks, Variational\nAutoencoders have received less attention. 
In this paper, we address the\nchallenges of non-IID (independently and identically distributed) data\nenvironments featuring multiple groups of images of different types.\nSpecifically, heterogeneous data distributions can lead to difficulties in\nmaintaining a consistent latent space and can also result in local generators\nwith disparate texture features being blended during aggregation. We introduce\na novel approach, FissionVAE, which decomposes the latent space and constructs\ndecoder branches tailored to individual client groups. This method allows for\ncustomized learning that aligns with the unique data distributions of each\ngroup. Additionally, we investigate the incorporation of hierarchical VAE\narchitectures and demonstrate the use of heterogeneous decoder architectures\nwithin our model. We also explore strategies for setting the latent prior\ndistributions to enhance the decomposition process. To evaluate our approach,\nwe assemble two composite datasets: the first combines MNIST and FashionMNIST;\nthe second comprises RGB datasets of cartoon and human faces, wild animals,\nmarine vessels, and remote sensing images of Earth. Our experiments demonstrate\nthat FissionVAE greatly improves generation quality on these datasets compared\nto baseline federated VAE models.\n","authors":["Chen Hu","Jingjing Deng","Xianghua Xie","Xiaoke Ma"],"pdf_url":"https://arxiv.org/pdf/2408.17090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17083v1","updated":"2024-08-30T08:13:06Z","published":"2024-08-30T08:13:06Z","title":"Focus-Consistent Multi-Level Aggregation for Compositional Zero-Shot\n Learning","summary":" To transfer knowledge from seen attribute-object compositions to recognize\nunseen ones, recent compositional zero-shot learning (CZSL) methods mainly\ndiscuss the optimal classification branches to identify the elements, leading\nto the popularity of employing a three-branch architecture. However, these\nmethods mix up the underlying relationship among the branches, in the aspect of\nconsistency and diversity. Specifically, consistently providing the\nhighest-level features for all three branches increases the difficulty in\ndistinguishing classes that are superficially similar. Furthermore, a single\nbranch may focus on suboptimal regions when spatial messages are not shared\nbetween the personalized branches. Recognizing these issues and endeavoring to\naddress them, we propose a novel method called Focus-Consistent Multi-Level\nAggregation (FOMA). Our method incorporates a Multi-Level Feature Aggregation\n(MFA) module to generate personalized features for each branch based on the\nimage content. Additionally, a Focus-Consistent Constraint encourages a\nconsistent focus on the informative regions, thereby implicitly exchanging\nspatial information between all branches. 
Extensive experiments on three\nbenchmark datasets (UT-Zappos, C-GQA, and Clothing16K) demonstrate that our\nFOMA outperforms SOTA.\n","authors":["Fengyuan Dai","Siteng Huang","Min Zhang","Biao Gong","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.17083v1.pdf","comment":"Compositional Zero-Shot Learning"},{"id":"http://arxiv.org/abs/2408.17081v1","updated":"2024-08-30T08:09:19Z","published":"2024-08-30T08:09:19Z","title":"Stochastic Layer-Wise Shuffle: A Good Practice to Improve Vision Mamba\n Training","summary":" Recent Vision Mamba models not only have much lower complexity for processing\nhigher resolution images and longer videos but also the competitive performance\nwith Vision Transformers (ViTs). However, they are stuck into overfitting and\nthus only present up to base size (about 80M). It is still unclear how vanilla\nVision Mamba (Vim) can be efficiently scaled up to larger sizes, which is\nessentially for further exploitation. In this paper, we propose a stochastic\nlayer-wise shuffle regularization, which empowers successfully scaling\nnon-hierarchical Vision Mamba to a large size (about 300M) in a supervised\nsetting. Specifically, our base and large-scale ShuffleMamba models can\noutperform the supervised ViTs of similar size by 0.8\\% and 1.0\\%\nclassification accuracy on ImageNet1k, respectively, without auxiliary data.\nWhen evaluated on the ADE20K semantic segmentation and COCO detection tasks,\nour ShuffleMamba models also show significant improvements. Without bells and\nwhistles, the stochastic layer-wise shuffle has the following highlights: (1)\n\\textit{Plug and play:} it does not change model architectures and will be\nomitted in inference. (2) \\textit{Simple but effective:} it can improve the\noverfitting in Vim training and only introduce random token permutation\noperations. (3) \\textit{Intuitive:} the token sequences in deeper layers are\nmore likely to be shuffled as they are expected to be more semantic and less\nsensitive to patch positions. Code and models will be available at\nhttps://github.com/huangzizheng01/ShuffleMamba.\n","authors":["Zizheng Huang","Haoxing Chen","Jiaqi Li","Jun Lan","Huijia Zhu","Weiqiang Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.17081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17073v1","updated":"2024-08-30T07:57:47Z","published":"2024-08-30T07:57:47Z","title":"Approximately Invertible Neural Network for Learned Image Compression","summary":" Learned image compression have attracted considerable interests in recent\nyears. It typically comprises an analysis transform, a synthesis transform,\nquantization and an entropy coding model. The analysis transform and synthesis\ntransform are used to encode an image to latent feature and decode the\nquantized feature to reconstruct the image, and can be regarded as coupled\ntransforms. However, the analysis transform and synthesis transform are\ndesigned independently in the existing methods, making them unreliable in\nhigh-quality image compression. Inspired by the invertible neural networks in\ngenerative modeling, invertible modules are used to construct the coupled\nanalysis and synthesis transforms. Considering the noise introduced in the\nfeature quantization invalidates the invertible process, this paper proposes an\nApproximately Invertible Neural Network (A-INN) framework for learned image\ncompression. 
It formulates the rate-distortion optimization in lossy image\ncompression when using INN with quantization, which differentiates from using\nINN for generative modelling. Generally speaking, A-INN can be used as the\ntheoretical foundation for any INN based lossy compression method. Based on\nthis formulation, A-INN with a progressive denoising module (PDM) is developed\nto effectively reduce the quantization noise in the decoding. Moreover, a\nCascaded Feature Recovery Module (CFRM) is designed to learn high-dimensional\nfeature recovery from low-dimensional ones to further reduce the noise in\nfeature channel compression. In addition, a Frequency-enhanced Decomposition\nand Synthesis Module (FDSM) is developed by explicitly enhancing the\nhigh-frequency components in an image to address the loss of high-frequency\ninformation inherent in neural network based image compression. Extensive\nexperiments demonstrate that the proposed A-INN outperforms the existing\nlearned image compression methods.\n","authors":["Yanbo Gao","Meng Fu","Shuai Li","Chong Lv","Xun Cai","Hui Yuan","Mao Ye"],"pdf_url":"https://arxiv.org/pdf/2408.17073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17065v1","updated":"2024-08-30T07:49:57Z","published":"2024-08-30T07:49:57Z","title":"Generalizing Deepfake Video Detection with Plug-and-Play: Video-Level\n Blending and Spatiotemporal Adapter Tuning","summary":" Three key challenges hinder the development of current deepfake video\ndetection: (1) Temporal features can be complex and diverse: how can we\nidentify general temporal artifacts to enhance model generalization? (2)\nSpatiotemporal models often lean heavily on one type of artifact and ignore the\nother: how can we ensure balanced learning from both? (3) Videos are naturally\nresource-intensive: how can we tackle efficiency without compromising accuracy?\n This paper attempts to tackle the three challenges jointly. First, inspired\nby the notable generality of using image-level blending data for image forgery\ndetection, we investigate whether and how video-level blending can be effective\nin video. We then perform a thorough analysis and identify a previously\nunderexplored temporal forgery artifact: Facial Feature Drift (FFD), which\ncommonly exists across different forgeries. To reproduce FFD, we then propose a\nnovel Video-level Blending data (VB), where VB is implemented by blending the\noriginal image and its warped version frame-by-frame, serving as a hard\nnegative sample to mine more general artifacts. Second, we carefully design a\nlightweight Spatiotemporal Adapter (StA) to equip a pretrained image model\n(both ViTs and CNNs) with the ability to capture both spatial and temporal\nfeatures jointly and efficiently. StA is designed with two-stream 3D-Conv with\nvarying kernel sizes, allowing it to process spatial and temporal features\nseparately. Extensive experiments validate the effectiveness of the proposed\nmethods; and show our approach can generalize well to previously unseen forgery\nvideos, even the just-released (in 2024) SoTAs. 
We release our code and\npretrained weights at \\url{https://github.com/YZY-stack/StA4Deepfake}.\n","authors":["Zhiyuan Yan","Yandan Zhao","Shen Chen","Xinghe Fu","Taiping Yao","Shouhong Ding","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.17065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17064v1","updated":"2024-08-30T07:49:35Z","published":"2024-08-30T07:49:35Z","title":"Instant Adversarial Purification with Adversarial Consistency\n Distillation","summary":" Neural networks, despite their remarkable performance in widespread\napplications, including image classification, are also known to be vulnerable\nto subtle adversarial noise. Although some diffusion-based purification methods\nhave been proposed, for example, DiffPure, those methods are time-consuming. In\nthis paper, we propose One Step Control Purification (OSCP), a diffusion-based\npurification model that can purify the adversarial image in one Neural Function\nEvaluation (NFE) in diffusion models. We use Latent Consistency Model (LCM) and\nControlNet for our one-step purification. OSCP is computationally friendly and\ntime efficient compared to other diffusion-based purification methods; we\nachieve defense success rate of 74.19\\% on ImageNet, only requiring 0.1s for\neach purification. Moreover, there is a fundamental incongruence between\nconsistency distillation and adversarial perturbation. To address this\nontological dissonance, we propose Gaussian Adversarial Noise Distillation\n(GAND), a novel consistency distillation framework that facilitates a more\nnuanced reconciliation of the latent space dynamics, effectively bridging the\nnatural and adversarial manifolds. Our experiments show that the GAND does not\nneed a Full Fine Tune (FFT); PEFT, e.g., LoRA is sufficient.\n","authors":["Chun Tong Lei","Hon Ming Yam","Zhongliang Guo","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.17064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17062v1","updated":"2024-08-30T07:48:05Z","published":"2024-08-30T07:48:05Z","title":"Vote&Mix: Plug-and-Play Token Reduction for Efficient Vision Transformer","summary":" Despite the remarkable success of Vision Transformers (ViTs) in various\nvisual tasks, they are often hindered by substantial computational cost. In\nthis work, we introduce Vote\\&Mix (\\textbf{VoMix}), a plug-and-play and\nparameter-free token reduction method, which can be readily applied to\noff-the-shelf ViT models \\textit{without any training}. VoMix tackles the\ncomputational redundancy of ViTs by identifying tokens with high homogeneity\nthrough a layer-wise token similarity voting mechanism. Subsequently, the\nselected tokens are mixed into the retained set, thereby preserving visual\ninformation. Experiments demonstrate VoMix significantly improves the\nspeed-accuracy tradeoff of ViTs on both images and videos. 
Without any\ntraining, VoMix achieves a 2$\\times$ increase in throughput of existing ViT-H\non ImageNet-1K and a 2.4$\\times$ increase in throughput of existing ViT-L on\nKinetics-400 video dataset, with a mere 0.3\\% drop in top-1 accuracy.\n","authors":["Shuai Peng","Di Fu","Baole Wei","Yong Cao","Liangcai Gao","Zhi Tang"],"pdf_url":"https://arxiv.org/pdf/2408.17062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10306v3","updated":"2024-08-30T07:39:50Z","published":"2023-01-25T11:00:32Z","title":"Deep Convolutional Framelet Denoising for Panoramic by Mixed Wavelet\n Integration","summary":" Enhancing quality and removing noise during preprocessing is one of the most\ncritical steps in image processing. X-ray images are created by photons\ncolliding with atoms and the variation in scattered noise absorption. This\nnoise leads to a deterioration in the graph's medical quality and, at times,\nresults in repetition, thereby increasing the patient's effective dose. One of\nthe most critical challenges in this area has consistently been lowering the\nimage noise. Techniques like BM3d, low-pass filters, and Autoencoder have taken\nthis step. Owing to their structural design and high rate of repetition, neural\nnetworks employing diverse architectures have, over the past decade, achieved\nnoise reduction with satisfactory outcomes, surpassing the traditional BM3D and\nlow-pass filters. The combination of the Hankel matrix with neural networks\nrepresents one of these configurations. The Hankel matrix aims to identify a\nlocal circle by separating individual values into local and non-local\ncomponents, utilizing a non-local matrix. A non-local matrix can be created\nusing the wave or DCT. This paper suggests integrating the waveform with the\nDaubechies (D4) wavelet due to its higher energy concentration and employs the\nu-Net neural network architecture, which incorporates the waveform exclusively\nat each stage. The outcomes were evaluated using the PSNR and SSIM criteria,\nand the outcomes were verified by using various waves. The effectiveness of a\none-wave network has increased from 0.5% to 1.2%, according to studies done on\nother datasets\n","authors":["Masoud Shahraki Mohammadi","Seyed Javad Seyed Mahdavi Chabok"],"pdf_url":"https://arxiv.org/pdf/2302.10306v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17060v1","updated":"2024-08-30T07:38:46Z","published":"2024-08-30T07:38:46Z","title":"Efficient Image Restoration through Low-Rank Adaptation and Stable\n Diffusion XL","summary":" In this study, we propose an enhanced image restoration model, SUPIR, based\non the integration of two low-rank adaptive (LoRA) modules with the Stable\nDiffusion XL (SDXL) framework. Our method leverages the advantages of LoRA to\nfine-tune SDXL models, thereby significantly improving image restoration\nquality and efficiency. We collect 2600 high-quality real-world images, each\nwith detailed descriptive text, for training the model. The proposed method is\nevaluated on standard benchmarks and achieves excellent performance,\ndemonstrated by higher peak signal-to-noise ratio (PSNR), lower learned\nperceptual image patch similarity (LPIPS), and higher structural similarity\nindex measurement (SSIM) scores. 
These results underscore the effectiveness of\ncombining LoRA with SDXL for advanced image restoration tasks, highlighting the\npotential of our approach in generating high-fidelity restored images.\n","authors":["Haiyang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.17060v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2408.17059v1","updated":"2024-08-30T07:38:28Z","published":"2024-08-30T07:38:28Z","title":"A Survey of the Self Supervised Learning Mechanisms for Vision\n Transformers","summary":" Deep supervised learning models require high volume of labeled data to attain\nsufficiently good results. Although, the practice of gathering and annotating\nsuch big data is costly and laborious. Recently, the application of self\nsupervised learning (SSL) in vision tasks has gained significant attention. The\nintuition behind SSL is to exploit the synchronous relationships within the\ndata as a form of self-supervision, which can be versatile. In the current big\ndata era, most of the data is unlabeled, and the success of SSL thus relies in\nfinding ways to improve this vast amount of unlabeled data available. Thus its\nbetter for deep learning algorithms to reduce reliance on human supervision and\ninstead focus on self-supervision based on the inherent relationships within\nthe data. With the advent of ViTs, which have achieved remarkable results in\ncomputer vision, it is crucial to explore and understand the various SSL\nmechanisms employed for training these models specifically in scenarios where\nthere is less label data available. In this survey we thus develop a\ncomprehensive taxonomy of systematically classifying the SSL techniques based\nupon their representations and pre-training tasks being applied. Additionally,\nwe discuss the motivations behind SSL, review popular pre-training tasks, and\nhighlight the challenges and advancements in this field. Furthermore, we\npresent a comparative analysis of different SSL methods, evaluate their\nstrengths and limitations, and identify potential avenues for future research.\n","authors":["Asifullah Khan","Anabia Sohail","Mustansar Fiaz","Mehdi Hassan","Tariq Habib Afridi","Sibghat Ullah Marwat","Farzeen Munir","Safdar Ali","Hannan Naseem","Muhammad Zaigham Zaheer","Kamran Ali","Tangina Sultana","Ziaurrehman Tanoli","Naeem Akhter"],"pdf_url":"https://arxiv.org/pdf/2408.17059v1.pdf","comment":"34 Pages, 5 Figures, 7 Tables"},{"id":"http://arxiv.org/abs/2308.09990v4","updated":"2024-08-30T07:32:50Z","published":"2023-08-19T11:40:57Z","title":"TSAR-MVS: Textureless-aware Segmentation and Correlative Refinement\n Guided Multi-View Stereo","summary":" The reconstruction of textureless areas has long been a challenging problem\nin MVS due to lack of reliable pixel correspondences between images. In this\npaper, we propose the Textureless-aware Segmentation And Correlative Refinement\nguided Multi-View Stereo (TSAR-MVS), a novel method that effectively tackles\nchallenges posed by textureless areas in 3D reconstruction through filtering,\nrefinement and segmentation. First, we implement the joint hypothesis\nfiltering, a technique that merges a confidence estimator with a disparity\ndiscontinuity detector to eliminate incorrect depth estimations. Second, to\nspread the pixels with confident depth, we introduce an iterative correlation\nrefinement strategy that leverages RANSAC to generate 3D planes based on\nsuperpixels, succeeded by a weighted median filter for broadening the influence\nof accurately determined pixels. 
Finally, we present a textureless-aware\nsegmentation method that leverages edge detection and line detection for\naccurately identify large textureless regions for further depth completion.\nExperiments on ETH3D, Tanks & Temples and Strecha datasets demonstrate the\nsuperior performance and strong generalization capability of our proposed\nmethod.\n","authors":["Zhenlong Yuan","Jiakai Cao","Zhaoqi Wang","Zhaoxin Li"],"pdf_url":"https://arxiv.org/pdf/2308.09990v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17057v1","updated":"2024-08-30T07:32:19Z","published":"2024-08-30T07:32:19Z","title":"LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality\n Assessment Model","summary":" Recent advancements in the field of No-Reference Image Quality Assessment\n(NR-IQA) using deep learning techniques demonstrate high performance across\nmultiple open-source datasets. However, such models are typically very large\nand complex making them not so suitable for real-world deployment, especially\non resource- and battery-constrained mobile devices. To address this\nlimitation, we propose a compact, lightweight NR-IQA model that achieves\nstate-of-the-art (SOTA) performance on ECCV AIM UHD-IQA challenge validation\nand test datasets while being also nearly 5.7 times faster than the fastest\nSOTA model. Our model features a dual-branch architecture, with each branch\nseparately trained on synthetically and authentically distorted images which\nenhances the model's generalizability across different distortion types. To\nimprove robustness under diverse real-world visual conditions, we additionally\nincorporate multiple color spaces during the training process. We also\ndemonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks\n(KANs) for final quality regression as compared to the conventional Multi-Layer\nPerceptrons (MLPs). Our evaluation considering various open-source datasets\nhighlights the practical, high-accuracy, and robust performance of our proposed\nlightweight model. Code: https://github.com/nasimjamshidi/LAR-IQA.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildiyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.17057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19323v2","updated":"2024-08-30T07:30:11Z","published":"2024-07-27T19:00:44Z","title":"MSP-MVS: Multi-granularity Segmentation Prior Guided Multi-View Stereo","summary":" Reconstructing textureless areas in MVS poses challenges due to the absence\nof reliable pixel correspondences within fixed patch. Although certain methods\nemploy patch deformation to expand the receptive field, their patches\nmistakenly skip depth edges to calculate areas with depth discontinuity,\nthereby causing ambiguity. Consequently, we introduce Multi-granularity\nSegmentation Prior Multi-View Stereo (MSP-MVS). Specifically, we first propose\nmulti-granularity segmentation prior by integrating multi-granularity depth\nedges to restrict patch deformation within homogeneous areas. Moreover, we\npresent anchor equidistribution that bring deformed patches with more uniformly\ndistributed anchors to ensure an adequate coverage of their own homogeneous\nareas. Furthermore, we introduce iterative local search optimization to\nrepresent larger patch with sparse representative candidates, significantly\nboosting the expressive capacity for each patch. 
The state-of-the-art results\non ETH3D and Tanks & Temples benchmarks demonstrate the effectiveness and\nrobust generalization ability of our proposed method.\n","authors":["Zhenlong Yuan","Cong Liu","Fei Shen","Zhaoxin Li","Tianlu Mao","Zhaoqi Wang"],"pdf_url":"https://arxiv.org/pdf/2407.19323v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2308.09990"},{"id":"http://arxiv.org/abs/2408.17054v1","updated":"2024-08-30T07:25:53Z","published":"2024-08-30T07:25:53Z","title":"BTMuda: A Bi-level Multi-source unsupervised domain adaptation framework\n for breast cancer diagnosis","summary":" Deep learning has revolutionized the early detection of breast cancer,\nresulting in a significant decrease in mortality rates. However, difficulties\nin obtaining annotations and huge variations in distribution between training\nsets and real scenes have limited their clinical applications. To address these\nlimitations, unsupervised domain adaptation (UDA) methods have been used to\ntransfer knowledge from one labeled source domain to the unlabeled target\ndomain, yet these approaches suffer from severe domain shift issues and often\nignore the potential benefits of leveraging multiple relevant sources in\npractical applications. To address these limitations, in this work, we\nconstruct a Three-Branch Mixed extractor and propose a Bi-level Multi-source\nunsupervised domain adaptation method called BTMuda for breast cancer\ndiagnosis. Our method addresses the problems of domain shift by dividing domain\nshift issues into two levels: intra-domain and inter-domain. To reduce the\nintra-domain shift, we jointly train a CNN and a Transformer as two paths of a\ndomain mixed feature extractor to obtain robust representations rich in both\nlow-level local and high-level global information. As for the inter-domain\nshift, we redesign the Transformer delicately to a three-branch architecture\nwith cross-attention and distillation, which learns domain-invariant\nrepresentations from multiple domains. Besides, we introduce two alignment\nmodules - one for feature alignment and one for classifier alignment - to\nimprove the alignment process. Extensive experiments conducted on three public\nmammographic datasets demonstrate that our BTMuda outperforms state-of-the-art\nmethods.\n","authors":["Yuxiang Yang","Xinyi Zeng","Pinxian Zeng","Binyu Yan","Xi Wu","Jiliu Zhou","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.17054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17052v1","updated":"2024-08-30T07:22:11Z","published":"2024-08-30T07:22:11Z","title":"Can We Leave Deepfake Data Behind in Training Deepfake Detector?","summary":" The generalization ability of deepfake detectors is vital for their\napplications in real-world scenarios. One effective solution to enhance this\nability is to train the models with manually-blended data, which we termed\n\"blendfake\", encouraging models to learn generic forgery artifacts like\nblending boundary. Interestingly, current SoTA methods utilize blendfake\nwithout incorporating any deepfake data in their training process. This is\nlikely because previous empirical observations suggest that vanilla hybrid\ntraining (VHT), which combines deepfake and blendfake data, results in inferior\nperformance to methods using only blendfake data (so-called \"1+1<2\").\nTherefore, a critical question arises: Can we leave deepfake behind and rely\nsolely on blendfake data to train an effective deepfake detector? 
Intuitively,\nas deepfakes also contain additional informative forgery clues (e.g., deep\ngenerative artifacts), excluding all deepfake data in training deepfake\ndetectors seems counter-intuitive. In this paper, we rethink the role of\nblendfake in detecting deepfakes and formulate the process from \"real to\nblendfake to deepfake\" to be a progressive transition. Specifically, blendfake\nand deepfake can be explicitly delineated as the oriented pivot anchors between\n\"real-to-fake\" transitions. The accumulation of forgery information should be\noriented and progressively increasing during this transition process. To this\nend, we propose an Oriented Progressive Regularizor (OPR) to establish the\nconstraints that compel the distribution of anchors to be discretely arranged.\nFurthermore, we introduce feature bridging to facilitate the smooth transition\nbetween adjacent anchors. Extensive experiments confirm that our design allows\nleveraging forgery information from both blendfake and deepfake effectively and\ncomprehensively.\n","authors":["Jikang Cheng","Zhiyuan Yan","Ying Zhang","Yuhao Luo","Zhongyuan Wang","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2408.17052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17046v1","updated":"2024-08-30T07:08:01Z","published":"2024-08-30T07:08:01Z","title":"Text-to-Image Generation Via Energy-Based CLIP","summary":" Joint Energy Models (JEMs), while drawing significant research attention,\nhave not been successfully scaled to real-world, high-resolution datasets. We\npresent EB-CLIP, a novel approach extending JEMs to the multimodal\nvision-language domain using CLIP, integrating both generative and\ndiscriminative objectives. For the generative objective, we introduce an\nimage-text joint-energy function based on Cosine similarity in the CLIP space,\ntraining CLIP to assign low energy to real image-caption pairs and high energy\notherwise. For the discriminative objective, we employ contrastive adversarial\nloss, extending the adversarial training objective to the multimodal domain.\nEB-CLIP not only generates realistic images from text but also achieves\ncompetitive results on the compositionality benchmark, outperforming leading\nmethods with fewer parameters. Additionally, we demonstrate the superior\nguidance capability of EB-CLIP by enhancing CLIP-based generative frameworks\nand converting unconditional diffusion models to text-based ones. Lastly, we\nshow that EB-CLIP can serve as a more robust evaluation metric for\ntext-to-image generative tasks than CLIP.\n","authors":["Roy Ganz","Michael Elad"],"pdf_url":"https://arxiv.org/pdf/2408.17046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17036v1","updated":"2024-08-30T06:13:49Z","published":"2024-08-30T06:13:49Z","title":"CP-VoteNet: Contrastive Prototypical VoteNet for Few-Shot Point Cloud\n Object Detection","summary":" Few-shot point cloud 3D object detection (FS3D) aims to identify and localise\nobjects of novel classes from point clouds, using knowledge learnt from\nannotated base classes and novel classes with very few annotations. Thus far,\nthis challenging task has been approached using prototype learning, but the\nperformance remains far from satisfactory. We find that in existing methods,\nthe prototypes are only loosely constrained and lack of fine-grained awareness\nof the semantic and geometrical correlation embedded within the point cloud\nspace. 
To mitigate these issues, we propose to leverage the inherent\ncontrastive relationship within the semantic and geometrical subspaces to learn\nmore refined and generalisable prototypical representations. To this end, we\nfirst introduce contrastive semantics mining, which enables the network to\nextract discriminative categorical features by constructing positive and\nnegative pairs within training batches. Meanwhile, since point features\nrepresenting local patterns can be clustered into geometric components, we\nfurther propose to impose contrastive relationship at the primitive level.\nThrough refined primitive geometric structures, the transferability of feature\nencoding from base to novel classes is significantly enhanced. The above\ndesigns and insights lead to our novel Contrastive Prototypical VoteNet\n(CP-VoteNet). Extensive experiments on two FS3D benchmarks FS-ScanNet and\nFS-SUNRGBD demonstrate that CP-VoteNet surpasses current state-of-the-art\nmethods by considerable margins across different FS3D settings. Further\nablation studies conducted corroborate the rationale and effectiveness of our\ndesigns.\n","authors":["Xuejing Li","Weijia Zhang","Chao Ma"],"pdf_url":"https://arxiv.org/pdf/2408.17036v1.pdf","comment":"Accepted by PRCV 2024"},{"id":"http://arxiv.org/abs/2408.17027v1","updated":"2024-08-30T05:57:01Z","published":"2024-08-30T05:57:01Z","title":"ConDense: Consistent 2D/3D Pre-training for Dense and Sparse Features\n from Multi-View Images","summary":" To advance the state of the art in the creation of 3D foundation models, this\npaper introduces the ConDense framework for 3D pre-training utilizing existing\npre-trained 2D networks and large-scale multi-view datasets. We propose a novel\n2D-3D joint training scheme to extract co-embedded 2D and 3D features in an\nend-to-end pipeline, where 2D-3D feature consistency is enforced through a\nvolume rendering NeRF-like ray marching process. Using dense per pixel features\nwe are able to 1) directly distill the learned priors from 2D models to 3D\nmodels and create useful 3D backbones, 2) extract more consistent and less\nnoisy 2D features, 3) formulate a consistent embedding space where 2D, 3D, and\nother modalities of data (e.g., natural language prompts) can be jointly\nqueried. Furthermore, besides dense features, ConDense can be trained to\nextract sparse features (e.g., key points), also with 2D-3D consistency --\ncondensing 3D NeRF representations into compact sets of decorated key points.\nWe demonstrate that our pre-trained model provides good initialization for\nvarious 3D tasks including 3D classification and segmentation, outperforming\nother 3D pre-training methods by a significant margin. 
It also enables, by\nexploiting our sparse features, additional useful downstream tasks, such as\nmatching 2D images to 3D scenes, detecting duplicate 3D scenes, and querying a\nrepository of 3D scenes through natural language -- all quite efficiently and\nwithout any per-scene fine-tuning.\n","authors":["Xiaoshuai Zhang","Zhicheng Wang","Howard Zhou","Soham Ghosh","Danushen Gnanapragasam","Varun Jampani","Hao Su","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2408.17027v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2310.13019v4","updated":"2024-08-30T05:50:56Z","published":"2023-10-18T18:50:39Z","title":"Tailoring Adversarial Attacks on Deep Neural Networks for Targeted Class\n Manipulation Using DeepFool Algorithm","summary":" The susceptibility of deep neural networks (DNNs) to adversarial attacks\nundermines their reliability across numerous applications, underscoring the\nnecessity for an in-depth exploration of these vulnerabilities and the\nformulation of robust defense strategies. The DeepFool algorithm by\nMoosavi-Dezfooli et al. (2016) represents a pivotal step in identifying minimal\nperturbations required to induce misclassification of input images.\nNonetheless, its generic methodology falls short in scenarios necessitating\ntargeted interventions. Additionally, previous research studies have\npredominantly concentrated on the success rate of attacks without adequately\naddressing the consequential distortion of images, the maintenance of image\nquality, or the confidence threshold required for misclassification. To bridge\nthese gaps, we introduce the Enhanced Targeted DeepFool (ET DeepFool)\nalgorithm, an evolution of DeepFool that not only facilitates the specification\nof desired misclassification targets but also incorporates a configurable\nminimum confidence score. Our empirical investigations demonstrate the\nsuperiority of this refined approach in maintaining the integrity of images and\nminimizing perturbations across a variety of DNN architectures. Unlike previous\niterations, such as the Targeted DeepFool by Gajjar et al. (2022), our method\ngrants unparalleled control over the perturbation process, enabling precise\nmanipulation of model responses. Preliminary outcomes reveal that certain\nmodels, including AlexNet and the advanced Vision Transformer, display\ncommendable robustness to such manipulations. This discovery of varying levels\nof model robustness, as unveiled through our confidence level adjustments,\ncould have far-reaching implications for the field of image recognition. Our\ncode will be made public upon acceptance of the paper.\n","authors":["S. M. Fazle Rabby Labib","Joyanta Jyoti Mondal","Meem Arafat Manab","Sarfaraz Newaz","Xi Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.13019v4.pdf","comment":"18 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.14628v2","updated":"2024-08-30T05:34:25Z","published":"2024-03-21T17:59:59Z","title":"Zero-Shot Multi-Object Scene Completion","summary":" We present a 3D scene completion method that recovers the complete geometry\nof multiple unseen objects in complex scenes from a single RGB-D image. Despite\nnotable advancements in single-object 3D shape completion, high-quality\nreconstructions in highly cluttered real-world multi-object scenes remains a\nchallenge. 
To address this issue, we propose OctMAE, an architecture that\nleverages an Octree U-Net and a latent 3D MAE to achieve high-quality and near\nreal-time multi-object scene completion through both local and global geometric\nreasoning. Because a naive 3D MAE can be computationally intractable and memory\nintensive even in the latent space, we introduce a novel occlusion masking\nstrategy and adopt 3D rotary embeddings, which significantly improves the\nruntime and scene completion quality. To generalize to a wide range of objects\nin diverse scenes, we create a large-scale photorealistic dataset, featuring a\ndiverse set of 12K 3D object models from the Objaverse dataset which are\nrendered in multi-object scenes with physics-based positioning. Our method\noutperforms the current state-of-the-art on both synthetic and real-world\ndatasets and demonstrates a strong zero-shot capability.\n","authors":["Shun Iwase","Katherine Liu","Vitor Guizilini","Adrien Gaidon","Kris Kitani","Rares Ambrus","Sergey Zakharov"],"pdf_url":"https://arxiv.org/pdf/2403.14628v2.pdf","comment":"Published at ECCV 2024, Webpage: https://sh8.io/#/oct_mae"},{"id":"http://arxiv.org/abs/2212.10537v3","updated":"2024-08-30T04:51:28Z","published":"2022-12-20T18:46:28Z","title":"Does CLIP Bind Concepts? Probing Compositionality in Large Image Models","summary":" Large-scale neural network models combining text and images have made\nincredible progress in recent years. However, it remains an open question to\nwhat extent such models encode compositional representations of the concepts\nover which they operate, such as correctly identifying \"red cube\" by reasoning\nover the constituents \"red\" and \"cube\". In this work, we focus on the ability\nof a large pretrained vision and language model (CLIP) to encode compositional\nconcepts and to bind variables in a structure-sensitive way (e.g.,\ndifferentiating \"cube behind sphere\" from \"sphere behind cube\"). To inspect the\nperformance of CLIP, we compare several architectures from research on\ncompositional distributional semantics models (CDSMs), a line of research that\nattempts to implement traditional compositional linguistic structures within\nembedding spaces. We benchmark them on three synthetic datasets -\nsingle-object, two-object, and relational - designed to test concept binding.\nWe find that CLIP can compose concepts in a single-object setting, but in\nsituations where concept binding is needed, performance drops dramatically. At\nthe same time, CDSMs also perform poorly, with best performance at chance\nlevel.\n","authors":["Martha Lewis","Nihal V. Nayak","Peilin Yu","Qinan Yu","Jack Merullo","Stephen H. Bach","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2212.10537v3.pdf","comment":"Lewis and Nayak contributed equally"},{"id":"http://arxiv.org/abs/2408.17011v1","updated":"2024-08-30T04:51:19Z","published":"2024-08-30T04:51:19Z","title":"Disease Classification and Impact of Pretrained Deep Convolution Neural\n Networks on Diverse Medical Imaging Datasets across Imaging Modalities","summary":" Imaging techniques such as Chest X-rays, whole slide images, and optical\ncoherence tomography serve as the initial screening and detection for a wide\nvariety of medical pulmonary and ophthalmic conditions respectively. This paper\ninvestigates the intricacies of using pretrained deep convolutional neural\nnetworks with transfer learning across diverse medical imaging datasets with\nvarying modalities for binary and multiclass classification. 
We conducted a\ncomprehensive performance analysis with ten network architectures and model\nfamilies each with pretraining and random initialization. Our finding showed\nthat the use of pretrained models as fixed feature extractors yields poor\nperformance irrespective of the datasets. Contrary, histopathology microscopy\nwhole slide images have better performance. It is also found that deeper and\nmore complex architectures did not necessarily result in the best performance.\nThis observation implies that the improvements in ImageNet are not parallel to\nthe medical imaging tasks. Within a medical domain, the performance of the\nnetwork architectures varies within model families with shifts in datasets.\nThis indicates that the performance of models within a specific modality may\nnot be conclusive for another modality within the same domain. This study\nprovides a deeper understanding of the applications of deep learning techniques\nin medical imaging and highlights the impact of pretrained networks across\ndifferent medical imaging datasets under five different experimental settings.\n","authors":["Jutika Borah","Kumaresh Sarmah","Hidam Kumarjit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.17011v1.pdf","comment":"15 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.17006v1","updated":"2024-08-30T04:39:43Z","published":"2024-08-30T04:39:43Z","title":"Retrieval-Augmented Natural Language Reasoning for Explainable Visual\n Question Answering","summary":" Visual Question Answering with Natural Language Explanation (VQA-NLE) task is\nchallenging due to its high demand for reasoning-based inference. Recent\nVQA-NLE studies focus on enhancing model networks to amplify the model's\nreasoning capability but this approach is resource-consuming and unstable. In\nthis work, we introduce a new VQA-NLE model, ReRe (Retrieval-augmented natural\nlanguage Reasoning), using leverage retrieval information from the memory to\naid in generating accurate answers and persuasive explanations without relying\non complex networks and extra datasets. ReRe is an encoder-decoder architecture\nmodel using a pre-trained clip vision encoder and a pre-trained GPT-2 language\nmodel as a decoder. Cross-attention layers are added in the GPT-2 for\nprocessing retrieval features. ReRe outperforms previous methods in VQA\naccuracy and explanation score and shows improvement in NLE with more\npersuasive, reliability.\n","authors":["Su Hyeon Lim","Minkuk Kim","Hyeon Bae Kim","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2408.17006v1.pdf","comment":"ICIP Workshop 2024"},{"id":"http://arxiv.org/abs/2408.17005v1","updated":"2024-08-30T04:37:52Z","published":"2024-08-30T04:37:52Z","title":"Efficient Camera Exposure Control for Visual Odometry via Deep\n Reinforcement Learning","summary":" The stability of visual odometry (VO) systems is undermined by degraded image\nquality, especially in environments with significant illumination changes. This\nstudy employs a deep reinforcement learning (DRL) framework to train agents for\nexposure control, aiming to enhance imaging performance in challenging\nconditions. A lightweight image simulator is developed to facilitate the\ntraining process, enabling the diversification of image exposure and sequence\ntrajectory. 
This setup enables completely offline training, eliminating the\nneed for direct interaction with camera hardware and the real environments.\nDifferent levels of reward functions are crafted to enhance the VO systems,\nequipping the DRL agents with varying intelligence. Extensive experiments have\nshown that our exposure control agents achieve superior efficiency-with an\naverage inference duration of 1.58 ms per frame on a CPU-and respond more\nquickly than traditional feedback control schemes. By choosing an appropriate\nreward function, agents acquire an intelligent understanding of motion trends\nand anticipate future illumination changes. This predictive capability allows\nVO systems to deliver more stable and precise odometry results. The codes and\ndatasets are available at https://github.com/ShuyangUni/drl_exposure_ctrl.\n","authors":["Shuyang Zhang","Jinhao He","Yilong Zhu","Jin Wu","Jie Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.17005v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.09826v2","updated":"2024-08-30T03:58:50Z","published":"2024-07-13T09:39:11Z","title":"3D Weakly Supervised Semantic Segmentation with 2D Vision-Language\n Guidance","summary":" In this paper, we propose 3DSS-VLG, a weakly supervised approach for 3D\nSemantic Segmentation with 2D Vision-Language Guidance, an alternative approach\nthat a 3D model predicts dense-embedding for each point which is co-embedded\nwith both the aligned image and text spaces from the 2D vision-language model.\nSpecifically, our method exploits the superior generalization ability of the 2D\nvision-language models and proposes the Embeddings Soft-Guidance Stage to\nutilize it to implicitly align 3D embeddings and text embeddings. Moreover, we\nintroduce the Embeddings Specialization Stage to purify the feature\nrepresentation with the help of a given scene-level label, specifying a better\nfeature supervised by the corresponding text embedding. Thus, the 3D model is\nable to gain informative supervisions both from the image embedding and text\nembedding, leading to competitive segmentation performances. To the best of our\nknowledge, this is the first work to investigate 3D weakly supervised semantic\nsegmentation by using the textual semantic information of text category labels.\nMoreover, with extensive quantitative and qualitative experiments, we present\nthat our 3DSS-VLG is able not only to achieve the state-of-the-art performance\non both S3DIS and ScanNet datasets, but also to maintain strong generalization\ncapability.\n","authors":["Xiaoxu Xu","Yitian Yuan","Jinlong Li","Qiudan Zhang","Zequn Jie","Lin Ma","Hao Tang","Nicu Sebe","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2407.09826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10571v2","updated":"2024-08-30T03:48:40Z","published":"2024-08-20T06:17:56Z","title":"Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models","summary":" Diffusion models have revolutionized customized text-to-image generation,\nallowing for efficient synthesis of photos from personal data with textual\ndescriptions. However, these advancements bring forth risks including privacy\nbreaches and unauthorized replication of artworks. Previous researches\nprimarily center around using prompt-specific methods to generate adversarial\nexamples to protect personal images, yet the effectiveness of existing methods\nis hindered by constrained adaptability to different prompts. 
In this paper, we\nintroduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for\ncustomized diffusion models. PAP first models the prompt distribution using a\nLaplace Approximation, and then produces prompt-agnostic perturbations by\nmaximizing a disturbance expectation based on the modeled distribution. This\napproach effectively tackles the prompt-agnostic attacks, leading to improved\ndefense stability. Extensive experiments in face privacy and artistic style\nprotection, demonstrate the superior generalization of our method in comparison\nto existing techniques.\n","authors":["Cong Wan","Yuhang He","Xiang Song","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10571v2.pdf","comment":"The experiments are insufficient and need to be completed"},{"id":"http://arxiv.org/abs/2406.06978v4","updated":"2024-08-30T03:37:36Z","published":"2024-06-11T06:18:26Z","title":"Hydra-MDP: End-to-end Multimodal Planning with Multi-target\n Hydra-Distillation","summary":" We propose Hydra-MDP, a novel paradigm employing multiple teachers in a\nteacher-student model. This approach uses knowledge distillation from both\nhuman and rule-based teachers to train the student model, which features a\nmulti-head decoder to learn diverse trajectory candidates tailored to various\nevaluation metrics. With the knowledge of rule-based teachers, Hydra-MDP learns\nhow the environment influences the planning in an end-to-end manner instead of\nresorting to non-differentiable post-processing. This method achieves the\n$1^{st}$ place in the Navsim challenge, demonstrating significant improvements\nin generalization across diverse driving environments and conditions. More\ndetails by visiting \\url{https://github.com/NVlabs/Hydra-MDP}.\n","authors":["Zhenxin Li","Kailin Li","Shihao Wang","Shiyi Lan","Zhiding Yu","Yishen Ji","Zhiqi Li","Ziyue Zhu","Jan Kautz","Zuxuan Wu","Yu-Gang Jiang","Jose M. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2406.06978v4.pdf","comment":"The 1st place solution of End-to-end Driving at Scale at the CVPR\n 2024 Autonomous Grand Challenge"},{"id":"http://arxiv.org/abs/2408.16986v1","updated":"2024-08-30T03:16:49Z","published":"2024-08-30T03:16:49Z","title":"AdaptVision: Dynamic Input Scaling in MLLMs for Versatile Scene\n Understanding","summary":" Over the past few years, the advancement of Multimodal Large Language Models\n(MLLMs) has captured the wide interest of researchers, leading to numerous\ninnovations to enhance MLLMs' comprehension. In this paper, we present\nAdaptVision, a multimodal large language model specifically designed to\ndynamically process input images at varying resolutions. We hypothesize that\nthe requisite number of visual tokens for the model is contingent upon both the\nresolution and content of the input image. Generally, natural images with a\nlower information density can be effectively interpreted by the model using\nfewer visual tokens at reduced resolutions. In contrast, images containing\ntextual content, such as documents with rich text, necessitate a higher number\nof visual tokens for accurate text interpretation due to their higher\ninformation density. Building on this insight, we devise a dynamic image\npartitioning module that adjusts the number of visual tokens according to the\nsize and aspect ratio of images. This method mitigates distortion effects that\narise from resizing images to a uniform resolution and dynamically optimizing\nthe visual tokens input to the LLMs. 
Our model is capable of processing images\nwith resolutions up to $1008\\times 1008$. Extensive experiments across various\ndatasets demonstrate that our method achieves impressive performance in\nhandling vision-language tasks in both natural and text-related scenes. The\nsource code and dataset are now publicly available at\n\\url{https://github.com/harrytea/AdaptVision}.\n","authors":["Yonghui Wang","Wengang Zhou","Hao Feng","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2408.16986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15026v3","updated":"2024-08-30T03:10:59Z","published":"2024-03-22T08:16:59Z","title":"VRSO: Visual-Centric Reconstruction for Static Object Annotation","summary":" As a part of the perception results of intelligent driving systems, static\nobject detection (SOD) in 3D space provides crucial cues for driving\nenvironment understanding. With the rapid deployment of deep neural networks\nfor SOD tasks, the demand for high-quality training samples soars. The\ntraditional, also reliable, way is manual labelling over the dense LiDAR point\nclouds and reference images. Though most public driving datasets adopt this\nstrategy to provide SOD ground truth (GT), it is still expensive and\ntime-consuming in practice. This paper introduces VRSO, a visual-centric\napproach for static object annotation. Experiments on the Waymo Open Dataset\nshow that the mean reprojection error from VRSO annotation is only 2.6 pixels,\naround four times lower than the Waymo Open Dataset labels (10.6 pixels). VRSO\nis distinguished in low cost, high efficiency, and high quality: (1) It\nrecovers static objects in 3D space with only camera images as input, and (2)\nmanual annotation is barely involved since GT for SOD tasks is generated based\non an automatic reconstruction and annotation pipeline.\n","authors":["Chenyao Yu","Yingfeng Cai","Jiaxin Zhang","Hui Kong","Wei Sui","Cong Yang"],"pdf_url":"https://arxiv.org/pdf/2403.15026v3.pdf","comment":"Accepted at 2024 IEEE International Conference on Intelligent Robots\n and Systems (IROS)"},{"id":"http://arxiv.org/abs/2309.01159v2","updated":"2024-08-30T03:08:42Z","published":"2023-09-03T12:37:59Z","title":"An Asynchronous Linear Filter Architecture for Hybrid Event-Frame\n Cameras","summary":" Event cameras are ideally suited to capture High Dynamic Range (HDR) visual\ninformation without blur but provide poor imaging capability for static or\nslowly varying scenes. Conversely, conventional image sensors measure absolute\nintensity of slowly changing scenes effectively but do poorly on HDR or quickly\nchanging scenes. In this paper, we present an asynchronous linear filter\narchitecture, fusing event and frame camera data, for HDR video reconstruction\nand spatial convolution that exploits the advantages of both sensor modalities.\nThe key idea is the introduction of a state that directly encodes the\nintegrated or convolved image information and that is updated asynchronously as\neach event or each frame arrives from the camera. The state can be read-off\nas-often-as and whenever required to feed into subsequent vision modules for\nreal-time robotic systems. Our experimental results are evaluated on both\npublicly available datasets with challenging lighting conditions and fast\nmotions, along with a new dataset with HDR reference that we provide. The\nproposed AKF pipeline outperforms other state-of-the-art methods in both\nabsolute intensity error (69.4% reduction) and image similarity indexes\n(average 35.5% improvement). 
We also demonstrate the integration of image\nconvolution with linear spatial kernels Gaussian, Sobel, and Laplacian as an\napplication of our architecture.\n","authors":["Ziwei Wang","Yonhon Ng","Cedric Scheerlinck","Robert Mahony"],"pdf_url":"https://arxiv.org/pdf/2309.01159v2.pdf","comment":"17 pages, 10 figures. Date of Publication: 04 September 2023"},{"id":"http://arxiv.org/abs/2408.16982v1","updated":"2024-08-30T03:04:11Z","published":"2024-08-30T03:04:11Z","title":"2DGH: 2D Gaussian-Hermite Splatting for High-quality Rendering and\n Better Geometry Reconstruction","summary":" 2D Gaussian Splatting has recently emerged as a significant method in 3D\nreconstruction, enabling novel view synthesis and geometry reconstruction\nsimultaneously. While the well-known Gaussian kernel is broadly used, its lack\nof anisotropy and deformation ability leads to dim and vague edges at object\nsilhouettes, limiting the reconstruction quality of current Gaussian splatting\nmethods. To enhance the representation power, we draw inspiration from quantum\nphysics and propose to use the Gaussian-Hermite kernel as the new primitive in\nGaussian splatting. The new kernel takes a unified mathematical form and\nextends the Gaussian function, which serves as the zero-rank term in the\nupdated formulation. Our experiments demonstrate the extraordinary performance\nof Gaussian-Hermite kernel in both geometry reconstruction and novel-view\nsynthesis tasks. The proposed kernel outperforms traditional Gaussian Splatting\nkernels, showcasing its potential for high-quality 3D reconstruction and\nrendering.\n","authors":["Ruihan Yu","Tianyu Huang","Jingwang Ling","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2408.16982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16224v2","updated":"2024-08-30T02:49:40Z","published":"2024-08-29T02:43:20Z","title":"LLaVA-SG: Leveraging Scene Graphs as Visual Semantic Expression in\n Vision-Language Models","summary":" Recent advances in large vision-language models (VLMs) typically employ\nvision encoders based on the Vision Transformer (ViT) architecture. The\ndivision of the images into patches by ViT results in a fragmented perception,\nthereby hindering the visual understanding capabilities of VLMs. In this paper,\nwe propose an innovative enhancement to address this limitation by introducing\na Scene Graph Expression (SGE) module in VLMs. This module extracts and\nstructurally expresses the complex semantic information within images, thereby\nimproving the foundational perception and understanding abilities of VLMs.\nExtensive experiments demonstrate that integrating our SGE module significantly\nenhances the VLM's performance in vision-language tasks, indicating its\neffectiveness in preserving intricate semantic details and facilitating better\nvisual understanding.\n","authors":["Jingyi Wang","Jianzhong Ju","Jian Luan","Zhidong Deng"],"pdf_url":"https://arxiv.org/pdf/2408.16224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16979v1","updated":"2024-08-30T02:45:56Z","published":"2024-08-30T02:45:56Z","title":"Cross Fusion RGB-T Tracking with Bi-directional Adapter","summary":" Many state-of-the-art RGB-T trackers have achieved remarkable results through\nmodality fusion. However, these trackers often either overlook temporal\ninformation or fail to fully utilize it, resulting in an ineffective balance\nbetween multi-modal and temporal information. 
To address this issue, we propose\na novel Cross Fusion RGB-T Tracking architecture (CFBT) that ensures the full\nparticipation of multiple modalities in tracking while dynamically fusing\ntemporal information. The effectiveness of CFBT relies on three newly designed\ncross spatio-temporal information fusion modules: Cross Spatio-Temporal\nAugmentation Fusion (CSTAF), Cross Spatio-Temporal Complementarity Fusion\n(CSTCF), and Dual-Stream Spatio-Temporal Adapter (DSTA). CSTAF employs a\ncross-attention mechanism to enhance the feature representation of the template\ncomprehensively. CSTCF utilizes complementary information between different\nbranches to enhance target features and suppress background features. DSTA\nadopts the adapter concept to adaptively fuse complementary information from\nmultiple branches within the transformer layer, using the RGB modality as a\nmedium. These ingenious fusions of multiple perspectives introduce only less\nthan 0.3\\% of the total modal parameters, but they indeed enable an efficient\nbalance between multi-modal and temporal information. Extensive experiments on\nthree popular RGB-T tracking benchmarks demonstrate that our method achieves\nnew state-of-the-art performance.\n","authors":["Zhirong Zeng","Xiaotao Liu","Meng Sun","Hongyu Wang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16757v2","updated":"2024-08-30T02:26:01Z","published":"2024-08-29T17:55:07Z","title":"Dissecting Out-of-Distribution Detection and Open-Set Recognition: A\n Critical Analysis of Methods and Benchmarks","summary":" Detecting test-time distribution shift has emerged as a key capability for\nsafely deployed machine learning models, with the question being tackled under\nvarious guises in recent years. In this paper, we aim to provide a consolidated\nview of the two largest sub-fields within the community: out-of-distribution\n(OOD) detection and open-set recognition (OSR). In particular, we aim to\nprovide rigorous empirical analysis of different methods across settings and\nprovide actionable takeaways for practitioners and researchers. Concretely, we\nmake the following contributions: (i) We perform rigorous cross-evaluation\nbetween state-of-the-art methods in the OOD detection and OSR settings and\nidentify a strong correlation between the performances of methods for them;\n(ii) We propose a new, large-scale benchmark setting which we suggest better\ndisentangles the problem tackled by OOD detection and OSR, re-evaluating\nstate-of-the-art OOD detection and OSR methods in this setting; (iii) We\nsurprisingly find that the best performing method on standard benchmarks\n(Outlier Exposure) struggles when tested at scale, while scoring rules which\nare sensitive to the deep feature magnitude consistently show promise; and (iv)\nWe conduct empirical analysis to explain these phenomena and highlight\ndirections for future research. 
Code:\nhttps://github.com/Visual-AI/Dissect-OOD-OSR\n","authors":["Hongjun Wang","Sagar Vaze","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2408.16757v2.pdf","comment":"Accepted to IJCV, preprint version; v2: add supplementary"},{"id":"http://arxiv.org/abs/2408.16971v1","updated":"2024-08-30T02:14:33Z","published":"2024-08-30T02:14:33Z","title":"Synthetic Lunar Terrain: A Multimodal Open Dataset for Training and\n Evaluating Neuromorphic Vision Algorithms","summary":" Synthetic Lunar Terrain (SLT) is an open dataset collected from an analogue\ntest site for lunar missions, featuring synthetic craters in a high-contrast\nlighting setup. It includes several side-by-side captures from event-based and\nconventional RGB cameras, supplemented with a high-resolution 3D laser scan for\ndepth estimation. The event-stream recorded from the neuromorphic vision sensor\nof the event-based camera is of particular interest as this emerging technology\nprovides several unique advantages, such as high data rates, low energy\nconsumption and resilience towards scenes of high dynamic range. SLT provides a\nsolid foundation to analyse the limits of RGB-cameras and potential advantages\nor synergies in utilizing neuromorphic visions with the goal of enabling and\nimproving lunar specific applications like rover navigation, landing in\ncratered environments or similar.\n","authors":["Marcus Märtens","Kevin Farries","John Culton","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2408.16971v1.pdf","comment":"7 pages, 5 figures, to be published at \"International Symposium on\n Artificial Intelligence, Robotics and Automation in Space, i-SAIRAS, 2024"},{"id":"http://arxiv.org/abs/2406.00971v2","updated":"2024-08-30T01:50:37Z","published":"2024-06-03T03:59:29Z","title":"MiniGPT-Reverse-Designing: Predicting Image Adjustments Utilizing\n MiniGPT-4","summary":" Vision-Language Models (VLMs) have recently seen significant advancements\nthrough integrating with Large Language Models (LLMs). The VLMs, which process\nimage and text modalities simultaneously, have demonstrated the ability to\nlearn and understand the interaction between images and texts across various\nmulti-modal tasks. Reverse designing, which could be defined as a complex\nvision-language task, aims to predict the edits and their parameters, given a\nsource image, an edited version, and an optional high-level textual edit\ndescription. This task requires VLMs to comprehend the interplay between the\nsource image, the edited version, and the optional textual context\nsimultaneously, going beyond traditional vision-language tasks. In this paper,\nwe extend and fine-tune MiniGPT-4 for the reverse designing task. Our\nexperiments demonstrate the extensibility of off-the-shelf VLMs, specifically\nMiniGPT-4, for more complex tasks such as reverse designing. Code is available\nat this \\href{https://github.com/VahidAz/MiniGPT-Reverse-Designing}\n","authors":["Vahid Azizi","Fatemeh Koochaki"],"pdf_url":"https://arxiv.org/pdf/2406.00971v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.16965v1","updated":"2024-08-30T01:47:43Z","published":"2024-08-30T01:47:43Z","title":"Contrastive Learning with Synthetic Positives","summary":" Contrastive learning with the nearest neighbor has proved to be one of the\nmost efficient self-supervised learning (SSL) techniques by utilizing the\nsimilarity of multiple instances within the same class. 
However, its efficacy\nis constrained as the nearest neighbor algorithm primarily identifies ``easy''\npositive pairs, where the representations are already closely located in the\nembedding space. In this paper, we introduce a novel approach called\nContrastive Learning with Synthetic Positives (CLSP) that utilizes synthetic\nimages, generated by an unconditional diffusion model, as the additional\npositives to help the model learn from diverse positives. Through feature\ninterpolation in the diffusion model sampling process, we generate images with\ndistinct backgrounds yet similar semantic content to the anchor image. These\nimages are considered ``hard'' positives for the anchor image, and when\nincluded as supplementary positives in the contrastive loss, they contribute to\na performance improvement of over 2\\% and 1\\% in linear evaluation compared to\nthe previous NNCLR and All4One methods across multiple benchmark datasets such\nas CIFAR10, achieving state-of-the-art methods. On transfer learning\nbenchmarks, CLSP outperforms existing SSL frameworks on 6 out of 8 downstream\ndatasets. We believe CLSP establishes a valuable baseline for future SSL\nstudies incorporating synthetic data in the training process.\n","authors":["Dewen Zeng","Yawen Wu","Xinrong Hu","Xiaowei Xu","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2408.16965v1.pdf","comment":"8 pages, conference"},{"id":"http://arxiv.org/abs/2408.16964v1","updated":"2024-08-30T01:45:22Z","published":"2024-08-30T01:45:22Z","title":"Causal Representation-Based Domain Generalization on Gaze Estimation","summary":" The availability of extensive datasets containing gaze information for each\nsubject has significantly enhanced gaze estimation accuracy. However, the\ndiscrepancy between domains severely affects a model's performance explicitly\ntrained for a particular domain. In this paper, we propose the Causal\nRepresentation-Based Domain Generalization on Gaze Estimation (CauGE) framework\ndesigned based on the general principle of causal mechanisms, which is\nconsistent with the domain difference. We employ an adversarial training manner\nand an additional penalizing term to extract domain-invariant features. After\nextracting features, we position the attention layer to make features\nsufficient for inferring the actual gaze. By leveraging these modules, CauGE\nensures that the neural networks learn from representations that meet the\ncausal mechanisms' general principles. By this, CauGE generalizes across\ndomains by extracting domain-invariant features, and spurious correlations\ncannot influence the model. Our method achieves state-of-the-art performance in\nthe domain generalization on gaze estimation benchmark.\n","authors":["Younghan Kim","Kangryun Moon","Yongjun Park","Yonggyu Kim"],"pdf_url":"https://arxiv.org/pdf/2408.16964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16959v1","updated":"2024-08-30T01:16:29Z","published":"2024-08-30T01:16:29Z","title":"HiTSR: A Hierarchical Transformer for Reference-based Super-Resolution","summary":" In this paper, we propose HiTSR, a hierarchical transformer model for\nreference-based image super-resolution, which enhances low-resolution input\nimages by learning matching correspondences from high-resolution reference\nimages. Diverging from existing multi-network, multi-stage approaches, we\nstreamline the architecture and training pipeline by incorporating the double\nattention block from GAN literature. 
Processing two visual streams\nindependently, we fuse self-attention and cross-attention blocks through a\ngating attention strategy. The model integrates a squeeze-and-excitation module\nto capture global context from the input images, facilitating long-range\nspatial interactions within window-based attention blocks. Long skip\nconnections between shallow and deep layers further enhance information flow.\nOur model demonstrates superior performance across three datasets including\nSUN80, Urban100, and Manga109. Specifically, on the SUN80 dataset, our model\nachieves PSNR/SSIM values of 30.24/0.821. These results underscore the\neffectiveness of attention mechanisms in reference-based image\nsuper-resolution. The transformer-based model attains state-of-the-art results\nwithout the need for purpose-built subnetworks, knowledge distillation, or\nmulti-stage training, emphasizing the potency of attention in meeting\nreference-based image super-resolution requirements.\n","authors":["Masoomeh Aslahishahri","Jordan Ubbens","Ian Stavness"],"pdf_url":"https://arxiv.org/pdf/2408.16959v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2307.08837"},{"id":"http://arxiv.org/abs/2312.09625v3","updated":"2024-08-30T01:07:08Z","published":"2023-12-15T09:08:14Z","title":"Weakly-Supervised 3D Visual Grounding based on Visual Linguistic\n Alignment","summary":" Learning to ground natural language queries to target objects or regions in\n3D point clouds is quite essential for 3D scene understanding. Nevertheless,\nexisting 3D visual grounding approaches require a substantial number of\nbounding box annotations for text queries, which is time-consuming and\nlabor-intensive to obtain. In this paper, we propose 3D-VLA, a weakly\nsupervised approach for 3D visual grounding based on Visual Linguistic\nAlignment. Our 3D-VLA exploits the superior ability of current large-scale\nvision-language models (VLMs) on aligning the semantics between texts and 2D\nimages, as well as the naturally existing correspondences between 2D images and\n3D point clouds, and thus implicitly constructs correspondences between texts\nand 3D point clouds with no need for fine-grained box annotations in the\ntraining procedure. During the inference stage, the learned text-3D\ncorrespondence will help us ground the text queries to the 3D target objects\neven without 2D images. To the best of our knowledge, this is the first work to\ninvestigate 3D visual grounding in a weakly supervised manner by involving\nlarge scale vision-language models, and extensive experiments on ReferIt3D and\nScanRefer datasets demonstrate that our 3D-VLA achieves comparable and even\nsuperior results over the fully supervised methods.\n","authors":["Xiaoxu Xu","Yitian Yuan","Qiudan Zhang","Wenhui Wu","Zequn Jie","Lin Ma","Xu Wang"],"pdf_url":"https://arxiv.org/pdf/2312.09625v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16952v1","updated":"2024-08-30T00:27:46Z","published":"2024-08-30T00:27:46Z","title":"Transient Fault Tolerant Semantic Segmentation for Autonomous Driving","summary":" Deep learning models are crucial for autonomous vehicle perception, but their\nreliability is challenged by algorithmic limitations and hardware faults. We\naddress the latter by examining fault-tolerance in semantic segmentation\nmodels. 
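A toy, single-head numpy sketch of the gating idea summarized for HiTSR above: a self-attention pass over the low-resolution stream and a cross-attention pass against the reference stream are blended by an elementwise sigmoid gate. The shapes, the single-head formulation, and the gate parameterization are assumptions for illustration only.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def attention(q, k, v):
    # plain single-head scaled dot-product attention
    return softmax(q @ k.T / np.sqrt(q.shape[-1])) @ v

def gated_fusion(x_lr, x_ref, w_gate):
    """Blend self-attention on the low-resolution stream with cross-attention
    to the reference stream through an elementwise sigmoid gate."""
    self_out = attention(x_lr, x_lr, x_lr)
    cross_out = attention(x_lr, x_ref, x_ref)
    gate = 1.0 / (1.0 + np.exp(-np.concatenate([self_out, cross_out], axis=-1) @ w_gate))
    return gate * self_out + (1.0 - gate) * cross_out

rng = np.random.default_rng(0)
x_lr, x_ref = rng.normal(size=(32, 64)), rng.normal(size=(32, 64))
w_gate = 0.01 * rng.normal(size=(128, 64))
print(gated_fusion(x_lr, x_ref, w_gate).shape)   # (32, 64)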
Using established hardware fault models, we evaluate existing hardening\ntechniques both in terms of accuracy and uncertainty and introduce ReLUMax, a\nnovel simple activation function designed to enhance resilience against\ntransient faults. ReLUMax integrates seamlessly into existing architectures\nwithout time overhead. Our experiments demonstrate that ReLUMax effectively\nimproves robustness, preserving performance and boosting prediction confidence,\nthus contributing to the development of reliable autonomous driving systems.\n","authors":["Leonardo Iurada","Niccolò Cavagnero","Fernando Fernandes Dos Santos","Giuseppe Averta","Paolo Rech","Tatiana Tommasi"],"pdf_url":"https://arxiv.org/pdf/2408.16952v1.pdf","comment":"Accepted ECCV 2024 UnCV Workshop -\n https://github.com/iurada/neutron-segmentation"},{"id":"http://arxiv.org/abs/2408.05366v2","updated":"2024-08-30T22:26:55Z","published":"2024-08-09T22:29:43Z","title":"DeepSpeak Dataset v1.0","summary":" We describe a large-scale dataset--DeepSpeak--of real and deepfake footage of\npeople talking and gesturing in front of their webcams. The real videos in this\nfirst version of the dataset consist of 17 hours of footage from 220 diverse\nindividuals. Constituting more than 26 hours of footage, the fake videos\nconsist of a range of different state-of-the-art face-swap and lip-sync\ndeepfakes with natural and AI-generated voices. We expect to release future\nversions of this dataset with different and updated deepfake technologies. This\ndataset is made freely available for research and non-commercial uses; requests\nfor commercial use will be considered.\n","authors":["Sarah Barrington","Matyas Bohacek","Hany Farid"],"pdf_url":"https://arxiv.org/pdf/2408.05366v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08872v2","updated":"2024-08-30T22:25:53Z","published":"2024-07-11T21:15:21Z","title":"Visual Multi-Object Tracking with Re-Identification and Occlusion\n Handling using Labeled Random Finite Sets","summary":" This paper proposes an online visual multi-object tracking (MOT) algorithm\nthat resolves object appearance-reappearance and occlusion. Our solution is\nbased on the labeled random finite set (LRFS) filtering approach, which in\nprinciple, addresses disappearance, appearance, reappearance, and occlusion via\na single Bayesian recursion. However, in practice, existing numerical\napproximations cause reappearing objects to be initialized as new tracks,\nespecially after long periods of being undetected. In occlusion handling, the\nfilter's efficacy is dictated by trade-offs between the sophistication of the\nocclusion model and computational demand. Our contribution is a novel modeling\nmethod that exploits object features to address reappearing objects whilst\nmaintaining a linear complexity in the number of detections. Moreover, to\nimprove the filter's occlusion handling, we propose a fuzzy detection model\nthat takes into consideration the overlapping areas between tracks and their\nsizes. We also develop a fast version of the filter to further reduce the\ncomputational time. 
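One plausible reading of the ReLUMax idea in the fault-tolerant segmentation abstract above, sketched in numpy: a ReLU whose output is additionally clamped at the largest activation observed during fault-free calibration, so a transient bit flip cannot inject an arbitrarily large value. How the bound is actually tracked and applied is not specified here, so treat this as an assumption and see the authors' repository for the real definition.

import numpy as np

class ReLUMaxSketch:
    """ReLU followed by a clamp at the largest activation seen on clean data.
    Illustrative reading only; the actual ReLUMax definition may differ."""
    def __init__(self):
        self.max_seen = 0.0

    def calibrate(self, x):
        # record the largest fault-free activation for this layer
        self.max_seen = max(self.max_seen, float(np.max(np.maximum(x, 0.0))))

    def __call__(self, x):
        return np.clip(np.maximum(x, 0.0), 0.0, self.max_seen)

act = ReLUMaxSketch()
act.calibrate(np.array([0.2, 1.5, -0.3]))   # clean activations
faulty = np.array([0.1, 1e8, 0.4])          # a transient bit flip blows up one value
print(act(faulty))                          # [0.1 1.5 0.4] -> the fault is capped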
The source code is publicly available at\nhttps://github.com/linh-gist/mv-glmb-ab.\n","authors":["Linh Van Ma","Tran Thien Dat Nguyen","Changbeom Shim","Du Yong Kim","Namkoo Ha","Moongu Jeon"],"pdf_url":"https://arxiv.org/pdf/2407.08872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10204v2","updated":"2024-08-30T20:19:10Z","published":"2024-08-19T17:58:03Z","title":"Criticality Leveraged Adversarial Training (CLAT) for Boosted\n Performance via Parameter Efficiency","summary":" Adversarial training enhances neural network robustness but suffers from a\ntendency to overfit and increased generalization errors on clean data. This\nwork introduces CLAT, an innovative approach that mitigates adversarial\noverfitting by introducing parameter efficiency into the adversarial training\nprocess, improving both clean accuracy and adversarial robustness. Instead of\ntuning the entire model, CLAT identifies and fine-tunes robustness-critical\nlayers - those predominantly learning non-robust features - while freezing the\nremaining model to enhance robustness. It employs dynamic critical layer\nselection to adapt to changes in layer criticality throughout the fine-tuning\nprocess. Empirically, CLAT can be applied on top of existing adversarial\ntraining methods, significantly reduces the number of trainable parameters by\napproximately 95%, and achieves more than a 2% improvement in adversarial\nrobustness compared to baseline methods.\n","authors":["Bhavna Gopal","Huanrui Yang","Jingyang Zhang","Mark Horton","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10204v2.pdf","comment":"9 pages + appendix/ additional experiments"},{"id":"http://arxiv.org/abs/2210.06466v3","updated":"2024-08-30T20:13:03Z","published":"2022-10-12T17:59:58Z","title":"Prompt Generation Networks for Input-Space Adaptation of Frozen Vision\n Transformers","summary":" With the introduction of the transformer architecture in computer vision,\nincreasing model scale has been demonstrated as a clear path to achieving\nperformance and robustness gains. However, with model parameter counts reaching\nthe billions, classical finetuning approaches are becoming increasingly\nlimiting and even unfeasible when models become hosted as inference APIs, as in\nNLP. Visual input-prompt learning, an adaptation technique in which additional\ninputs in visual (RGB) space are learned, has emerged as a potential solution\nfor adapting frozen and cloud-hosted models, requiring neither access to the\nforward pass, nor post-processing. Yet so far, these constraints have\ndeteriorated adaptation performances significantly. To this end, we propose the\nPrompt Generation Network (PGN) that generates a different prompt for every\ndata point, which is then used to adapt a frozen pretrained vision model to a\ntarget task. We show that the PGN effectively adapts pretrained models to\nvarious new datasets: It surpasses previous methods by a large margin on 12/12\ndatasets and even outperforms full-finetuning on 5/12, while requiring 100x\nfewer parameters. Lastly, we introduce the \"prompt inversion\" trick, with which\nPGNs can be efficiently trained in a latent space but deployed in RGB input\nspace for inference.\n","authors":["Jochem Loedeman","Maarten C. Stol","Tengda Han","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2210.06466v3.pdf","comment":"Accepted by BMVC2024. 
Codebase: https://github.com/jochemloedeman/PGN"},{"id":"http://arxiv.org/abs/2404.02885v2","updated":"2024-08-30T20:11:38Z","published":"2024-04-03T17:38:15Z","title":"PoCo: Point Context Cluster for RGBD Indoor Place Recognition","summary":" We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place\nrecognition task, aimed at identifying the most likely match for a given query\nframe within a reference database. The task presents inherent challenges\nattributed to the constrained field of view and limited range of perception\nsensors. We propose a new network architecture, which generalizes the recent\nContext of Clusters (CoCs) to extract global descriptors directly from the\nnoisy point clouds through end-to-end learning. Moreover, we develop the\narchitecture by integrating both color and geometric modalities into the point\nfeatures to enhance the global descriptor representation. We conducted\nevaluations on public datasets ScanNet-PR and ARKit with 807 and 5047\nscenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we\nachieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis\n(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the\nbest-published result CGis (39.82%). In addition, PoCo shows higher efficiency\nthan CGis in inference time (1.75X-faster), and we demonstrate the\neffectiveness of PoCo in recognizing places within a real-world laboratory\nenvironment.\n","authors":["Jing Liang","Zhuo Deng","Zheming Zhou","Omid Ghasemalizadeh","Dinesh Manocha","Min Sun","Cheng-Hao Kuo","Arnie Sen"],"pdf_url":"https://arxiv.org/pdf/2404.02885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11799v2","updated":"2024-08-30T20:10:51Z","published":"2024-06-17T17:47:44Z","title":"Mix-Domain Contrastive Learning for Unpaired H&E-to-IHC Stain\n Translation","summary":" H&E-to-IHC stain translation techniques offer a promising solution for\nprecise cancer diagnosis, especially in low-resource regions where there is a\nshortage of health professionals and limited access to expensive equipment.\nConsidering the pixel-level misalignment of H&E-IHC image pairs, current\nresearch explores the pathological consistency between patches from the same\npositions of the image pair. However, most of them overemphasize the\ncorrespondence between domains or patches, overlooking the side information\nprovided by the non-corresponding objects. In this paper, we propose a\nMix-Domain Contrastive Learning (MDCL) method to leverage the supervision\ninformation in unpaired H&E-to-IHC stain translation. Specifically, the\nproposed MDCL method aggregates the inter-domain and intra-domain pathology\ninformation by estimating the correlation between the anchor patch and all the\npatches from the matching images, encouraging the network to learn additional\ncontrastive knowledge from mixed domains. With the mix-domain pathology\ninformation aggregation, MDCL enhances the pathological consistency between the\ncorresponding patches and the component discrepancy of the patches from the\ndifferent positions of the generated IHC image. 
Extensive experiments on two\nH&E-to-IHC stain translation datasets, namely MIST and BCI, demonstrate that\nthe proposed method achieves state-of-the-art performance across multiple\nmetrics.\n","authors":["Song Wang","Zhong Zhang","Huan Yan","Ming Xu","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2406.11799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04634v4","updated":"2024-08-30T19:58:02Z","published":"2024-05-07T19:37:22Z","title":"FRACTAL: An Ultra-Large-Scale Aerial Lidar Dataset for 3D Semantic\n Segmentation of Diverse Landscapes","summary":" Mapping agencies are increasingly adopting Aerial Lidar Scanning (ALS) as a\nnew tool to map buildings and other above-ground structures. Processing ALS\ndata at scale requires efficient point classification methods that perform well\nover highly diverse territories. Large annotated Lidar datasets are needed to\nevaluate these classification methods, however, current Lidar benchmarks have\nrestricted scope and often cover a single urban area. To bridge this data gap,\nwe introduce the FRench ALS Clouds from TArgeted Landscapes (FRACTAL) dataset:\nan ultra-large-scale aerial Lidar dataset made of 100,000 dense point clouds\nwith high quality labels for 7 semantic classes and spanning 250 km$^2$.\nFRACTAL achieves high spatial and semantic diversity by explicitly sampling\nrare classes and challenging landscapes from five different regions of France.\nWe describe the data collection, annotation, and curation process of the\ndataset. We provide baseline semantic segmentation results using a state of the\nart 3D point cloud classification model. FRACTAL aims to support the\ndevelopment of 3D deep learning approaches for large-scale land monitoring.\n","authors":["Charles Gaydon","Michel Daab","Floryne Roche"],"pdf_url":"https://arxiv.org/pdf/2405.04634v4.pdf","comment":"9 (body) + 2 (bibliography) + 8 (appendices) pages | Dataset is\n available at https://huggingface.co/datasets/IGNF/FRACTAL | Trained model is\n available at https://huggingface.co/IGNF/FRACTAL-LidarHD_7cl_randlanet | Deep\n learning code repository is on Gihtub at https://github.com/IGNF/myria3d |\n Data engineering code repository is on Github at\n https://github.com/IGNF/pacasam"},{"id":"http://arxiv.org/abs/2310.15111v2","updated":"2024-08-30T19:21:36Z","published":"2023-10-23T17:20:01Z","title":"Matryoshka Diffusion Models","summary":" Diffusion models are the de facto approach for generating high-quality images\nand videos, but learning high-dimensional models remains a formidable task due\nto computational and optimization challenges. Existing methods often resort to\ntraining cascaded models in pixel space or using a downsampled latent space of\na separately trained auto-encoder. In this paper, we introduce Matryoshka\nDiffusion Models(MDM), an end-to-end framework for high-resolution image and\nvideo synthesis. We propose a diffusion process that denoises inputs at\nmultiple resolutions jointly and uses a NestedUNet architecture where features\nand parameters for small-scale inputs are nested within those of large scales.\nIn addition, MDM enables a progressive training schedule from lower to higher\nresolutions, which leads to significant improvements in optimization for\nhigh-resolution generation. We demonstrate the effectiveness of our approach on\nvarious benchmarks, including class-conditioned image generation,\nhigh-resolution text-to-image, and text-to-video applications. 
Remarkably, we\ncan train a single pixel-space model at resolutions of up to 1024x1024 pixels,\ndemonstrating strong zero-shot generalization using the CC12M dataset, which\ncontains only 12 million images. Our code is released at\nhttps://github.com/apple/ml-mdm\n","authors":["Jiatao Gu","Shuangfei Zhai","Yizhe Zhang","Josh Susskind","Navdeep Jaitly"],"pdf_url":"https://arxiv.org/pdf/2310.15111v2.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2405.11286v2","updated":"2024-08-30T19:17:41Z","published":"2024-05-18T13:21:14Z","title":"Motion Avatar: Generate Human and Animal Avatars with Arbitrary Motion","summary":" In recent years, there has been significant interest in creating 3D avatars\nand motions, driven by their diverse applications in areas like film-making,\nvideo games, AR/VR, and human-robot interaction. However, current efforts\nprimarily concentrate on either generating the 3D avatar mesh alone or\nproducing motion sequences, with integrating these two aspects proving to be a\npersistent challenge. Additionally, while avatar and motion generation\npredominantly target humans, extending these techniques to animals remains a\nsignificant challenge due to inadequate training data and methods. To bridge\nthese gaps, our paper presents three key contributions. Firstly, we proposed a\nnovel agent-based approach named Motion Avatar, which allows for the automatic\ngeneration of high-quality customizable human and animal avatars with motions\nthrough text queries. The method significantly advanced the progress in dynamic\n3D character generation. Secondly, we introduced a LLM planner that coordinates\nboth motion and avatar generation, which transforms a discriminative planning\ninto a customizable Q&A fashion. Lastly, we presented an animal motion dataset\nnamed Zoo-300K, comprising approximately 300,000 text-motion pairs across 65\nanimal categories and its building pipeline ZooGen, which serves as a valuable\nresource for the community. See project website\nhttps://steve-zeyu-zhang.github.io/MotionAvatar/\n","authors":["Zeyu Zhang","Yiran Wang","Biao Wu","Shuo Chen","Zhiyuan Zhang","Shiya Huang","Wenbo Zhang","Meng Fang","Ling Chen","Yang Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.11286v2.pdf","comment":"Accepted to BMVC 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2404.00458v2","updated":"2024-08-30T15:59:46Z","published":"2024-03-30T19:45:04Z","title":"Beyond One-Size-Fits-All: Multi-Domain, Multi-Task Framework for\n Embedding Model Selection","summary":" This position paper proposes a systematic approach towards developing a\nframework to help select the most effective embedding models for natural\nlanguage processing (NLP) tasks, addressing the challenge posed by the\nproliferation of both proprietary and open-source encoder models.\n","authors":["Vivek Khetan"],"pdf_url":"https://arxiv.org/pdf/2404.00458v2.pdf","comment":"It was an initial idea - we plan to work on a detailed version"},{"id":"http://arxiv.org/abs/2408.17344v1","updated":"2024-08-30T15:16:52Z","published":"2024-08-30T15:16:52Z","title":"rerankers: A Lightweight Python Library to Unify Ranking Methods","summary":" This paper presents rerankers, a Python library which provides an easy-to-use\ninterface to the most commonly used re-ranking approaches. 
Re-ranking is an\nintegral component of many retrieval pipelines; however, there exist numerous\napproaches to it, relying on different implementation methods.\n\\texttt{rerankers} unifies these methods into a single user-friendly interface,\nallowing practitioners and researchers alike to explore different methods while\nonly changing a single line of Python code. Moreover, rerankers ensures that\nits implementations are done with the fewest dependencies possible, and re-uses\nthe original implementation whenever possible, guaranteeing that our simplified\ninterface results in no performance degradation compared to more complex ones.\nThe full source code and list of supported models are updated regularly and\navailable at https://github.com/answerdotai/rerankers.\n","authors":["Benjamin Clavié"],"pdf_url":"https://arxiv.org/pdf/2408.17344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17332v1","updated":"2024-08-30T14:48:52Z","published":"2024-08-30T14:48:52Z","title":"Not All Videos Become Outdated: Short-Video Recommendation by Learning\n to Deconfound Release Interval Bias","summary":" Short-video recommender systems often exhibit a biased preference for recently\nreleased videos. However, not all videos become outdated; certain classic\nvideos can still attract users' attention. Such bias along the temporal dimension\ncan be further aggravated by the matching model between users and videos,\nbecause the model learns from preexisting interactions. From real data, we\nobserve that different videos have varying sensitivities to recency in\nattracting users' attention. Our analysis, based on a causal graph modeling\nshort-video recommendation, suggests that the release interval serves as a\nconfounder, establishing a backdoor path between users and videos. To address\nthis confounding effect, we propose a model-agnostic causal architecture called\nLearning to Deconfound the Release Interval Bias (LDRI). LDRI enables joint\nlearning of the matching model and the video recency sensitivity perceptron. In\nthe inference stage, we apply a backdoor adjustment, effectively blocking the\nbackdoor path by intervening on each video. Extensive experiments on two\nbenchmarks demonstrate that LDRI consistently outperforms backbone models and\nexhibits superior performance against state-of-the-art models. Additional\ncomprehensive analyses confirm the deconfounding capability of LDRI.\n","authors":["Lulu Dong","Guoxiu He","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2408.17332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17309v1","updated":"2024-08-30T14:12:31Z","published":"2024-08-30T14:12:31Z","title":"Metadata practices for simulation workflows","summary":" Computer simulations are an essential pillar of knowledge generation in\nscience. Understanding, reproducing, and exploring the results of simulations\nrelies on tracking and organizing metadata describing numerical experiments.\nHowever, the models used to understand real-world systems, and the\ncomputational machinery required to simulate them, are typically complex, and\nproduce large amounts of heterogeneous metadata. Here, we present general\npractices for acquiring and handling metadata that are agnostic to software and\nhardware, and highly flexible for the user. 
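The rerankers library summarized above hides many re-ranking back-ends behind one interface so that switching methods changes a single line. The snippet below is a hypothetical, dependency-free illustration of that design idea; it does not reproduce the library's actual API, for which the linked repository is the reference.

from dataclasses import dataclass
from typing import Callable, List

@dataclass
class RankedDoc:
    text: str
    score: float

class UnifiedReranker:
    """Wrap any scoring function (query, doc) -> float so that swapping
    re-ranking methods only changes the constructor argument."""
    def __init__(self, score_fn: Callable[[str, str], float]):
        self.score_fn = score_fn

    def rank(self, query: str, docs: List[str]) -> List[RankedDoc]:
        scored = [RankedDoc(d, self.score_fn(query, d)) for d in docs]
        return sorted(scored, key=lambda r: r.score, reverse=True)

# toy back-end: term overlap; a cross-encoder or LLM scorer would plug in the same way
overlap = lambda q, d: float(len(set(q.lower().split()) & set(d.lower().split())))
ranker = UnifiedReranker(overlap)
for r in ranker.rank("python ranking library", ["a ranking library for go", "ranking in python"]):
    print(r.score, r.text)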
These consist of two steps: 1)\nrecording and storing raw metadata, and 2) selecting and structuring metadata.\nAs a proof of concept, we develop the Archivist, a Python tool to help with the\nsecond step, and use it to apply our practices to distinct high-performance\ncomputing use cases from neuroscience and hydrology. Our practices and the\nArchivist can readily be applied to existing workflows without the need for\nsubstantial restructuring. They support sustainable numerical workflows,\nfacilitating reproducibility and data reuse in generic simulation-based\nresearch.\n","authors":["Jose Villamar","Matthias Kelbling","Heather L. More","Michael Denker","Tom Tetzlaff","Johanna Senk","Stephan Thober"],"pdf_url":"https://arxiv.org/pdf/2408.17309v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.16312v2","updated":"2024-08-30T11:48:40Z","published":"2024-08-29T07:20:56Z","title":"SynDL: A Large-Scale Synthetic Test Collection for Passage Retrieval","summary":" Large-scale test collections play a crucial role in Information Retrieval\n(IR) research. However, according to the Cranfield paradigm and the research\ninto publicly available datasets, the existing information retrieval research\nstudies are commonly developed on small-scale datasets that rely on human\nassessors for relevance judgments - a time-intensive and expensive process.\nRecent studies have shown the strong capability of Large Language Models (LLMs)\nin producing reliable relevance judgments with human accuracy but at a greatly\nreduced cost. In this paper, to address the missing large-scale ad-hoc document\nretrieval dataset, we extend the TREC Deep Learning Track (DL) test collection\nvia additional language model synthetic labels to enable researchers to test\nand evaluate their search systems at a large scale. Specifically, such a test\ncollection includes more than 1,900 test queries from the previous years of\ntracks. We compare system evaluation with past human labels from past years and\nfind that our synthetically created large-scale test collection can lead to\nhighly correlated system rankings.\n","authors":["Hossein A. Rahmani","Xi Wang","Emine Yilmaz","Nick Craswell","Bhaskar Mitra","Paul Thomas"],"pdf_url":"https://arxiv.org/pdf/2408.16312v2.pdf","comment":"9 pages, resource paper"},{"id":"http://arxiv.org/abs/2408.17214v1","updated":"2024-08-30T11:38:51Z","published":"2024-08-30T11:38:51Z","title":"Efficient Multi-task Prompt Tuning for Recommendation","summary":" With the expansion of business scenarios, real recommender systems are facing\nchallenges in dealing with the constantly emerging new tasks in multi-task\nlearning frameworks. In this paper, we attempt to improve the generalization\nability of multi-task recommendations when dealing with new tasks. We find that\njoint training will enhance the performance of the new task but always\nnegatively impact existing tasks in most multi-task learning methods. Besides,\nsuch a re-training mechanism with new tasks increases the training costs,\nlimiting the generalization ability of multi-task recommendation models. Based\non this consideration, we aim to design a suitable sharing mechanism among\ndifferent tasks while maintaining joint optimization efficiency in new task\nlearning. A novel two-stage prompt-tuning MTL framework (MPT-Rec) is proposed\nto address task irrelevance and training efficiency problems in multi-task\nrecommender systems. 
Specifically, we disentangle the task-specific and\ntask-sharing information in the multi-task pre-training stage, then use\ntask-aware prompts to transfer knowledge from other tasks to the new task\neffectively. By freezing parameters in the pre-training tasks, MPT-Rec solves\nthe negative impacts that may be brought by the new task and greatly reduces\nthe training costs. Extensive experiments on three real-world datasets show the\neffectiveness of our proposed multi-task learning framework. MPT-Rec achieves\nthe best performance compared to the SOTA multi-task learning method. Besides,\nit maintains comparable model performance but vastly improves the training\nefficiency (i.e., with up to 10% parameters in the full training way) in the\nnew task learning.\n","authors":["Ting Bai","Le Huang","Yue Yu","Cheng Yang","Cheng Hou","Zhe Zhao","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2408.17214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17180v1","updated":"2024-08-30T10:28:36Z","published":"2024-08-30T10:28:36Z","title":"Identifying and Clustering Counter Relationships of Team Compositions in\n PvP Games for Efficient Balance Analysis","summary":" How can balance be quantified in game settings? This question is crucial for\ngame designers, especially in player-versus-player (PvP) games, where analyzing\nthe strength relations among predefined team compositions-such as hero\ncombinations in multiplayer online battle arena (MOBA) games or decks in card\ngames-is essential for enhancing gameplay and achieving balance. We have\ndeveloped two advanced measures that extend beyond the simplistic win rate to\nquantify balance in zero-sum competitive scenarios. These measures are derived\nfrom win value estimations, which employ strength rating approximations via the\nBradley-Terry model and counter relationship approximations via vector\nquantization, significantly reducing the computational complexity associated\nwith traditional win value estimations. Throughout the learning process of\nthese models, we identify useful categories of compositions and pinpoint their\ncounter relationships, aligning with the experiences of human players without\nrequiring specific game knowledge. Our methodology hinges on a simple technique\nto enhance codebook utilization in discrete representation with a deterministic\nvector quantization process for an extremely small state space. Our framework\nhas been validated in popular online games, including Age of Empires II,\nHearthstone, Brawl Stars, and League of Legends. The accuracy of the observed\nstrength relations in these games is comparable to traditional pairwise win\nvalue predictions, while also offering a more manageable complexity for\nanalysis. Ultimately, our findings contribute to a deeper understanding of PvP\ngame dynamics and present a methodology that significantly improves game\nbalance evaluation and design.\n","authors":["Chiu-Chou Lin","Yu-Wei Shih","Kuei-Ting Kuo","Yu-Cheng Chen","Chien-Hua Chen","Wei-Chen Chiu","I-Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.17180v1.pdf","comment":"TMLR 09/2024 https://openreview.net/forum?id=2D36otXvBE"},{"id":"http://arxiv.org/abs/2408.17103v1","updated":"2024-08-30T08:40:59Z","published":"2024-08-30T08:40:59Z","title":"Understanding the User: An Intent-Based Ranking Dataset","summary":" As information retrieval systems continue to evolve, accurate evaluation and\nbenchmarking of these systems become pivotal. 
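The balance measures in the PvP abstract above start from Bradley-Terry strength ratings estimated from pairwise outcomes. Below is a minimal maximum-likelihood fit by gradient ascent on a toy win matrix; the compositions and counts are invented, and the vector-quantized counter-relationship step is not shown.

import numpy as np

def fit_bradley_terry(wins, n_iter=2000, lr=0.1):
    """wins[i, j] = number of games in which composition i beat composition j.
    Returns log-strength ratings (higher = stronger), fit by gradient ascent
    on the Bradley-Terry log-likelihood."""
    n = wins.shape[0]
    games = wins + wins.T
    s = np.zeros(n)
    for _ in range(n_iter):
        p = 1.0 / (1.0 + np.exp(-(s[:, None] - s[None, :])))  # P(i beats j)
        grad = (wins - games * p).sum(axis=1)                  # d log-likelihood / d s_i
        s += lr * grad / max(games.sum(), 1.0)
        s -= s.mean()                                          # fix the additive gauge
    return s

# toy example with three compositions: A usually beats B, B usually beats C
wins = np.array([[0.0, 8.0, 6.0],
                 [2.0, 0.0, 7.0],
                 [4.0, 3.0, 0.0]])
print(fit_bradley_terry(wins).round(2))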
Web search datasets, such as MS\nMARCO, primarily provide short keyword queries without accompanying intent or\ndescriptions, posing a challenge in comprehending the underlying information\nneed. This paper proposes an approach to augmenting such datasets to annotate\ninformative query descriptions, with a focus on two prominent benchmark\ndatasets: TREC-DL-21 and TREC-DL-22. Our methodology involves utilizing\nstate-of-the-art LLMs to analyze and comprehend the implicit intent within\nindividual queries from benchmark datasets. By extracting key semantic\nelements, we construct detailed and contextually rich descriptions for these\nqueries. To validate the generated query descriptions, we employ crowdsourcing\nas a reliable means of obtaining diverse human perspectives on the accuracy and\ninformativeness of the descriptions. This information can be used as an\nevaluation set for tasks such as ranking, query rewriting, or others.\n","authors":["Abhijit Anand","Jurek Leonhardt","V Venktesh","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2408.17103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12492v2","updated":"2024-08-30T07:58:46Z","published":"2024-08-22T15:33:46Z","title":"The Importance of Cognitive Biases in the Recommendation Ecosystem","summary":" Cognitive biases have been studied in psychology, sociology, and behavioral\neconomics for decades. Traditionally, they have been considered a negative\nhuman trait that leads to inferior decision-making, reinforcement of\nstereotypes, or can be exploited to manipulate consumers, respectively. We\nargue that cognitive biases also manifest in different parts of the\nrecommendation ecosystem and at different stages of the recommendation process.\nMore importantly, we contest this traditional detrimental perspective on\ncognitive biases and claim that certain cognitive biases can be beneficial when\naccounted for by recommender systems. Concretely, we provide empirical evidence\nthat biases such as feature-positive effect, Ikea effect, and cultural\nhomophily can be observed in various components of the recommendation pipeline,\nincluding input data (such as ratings or side information), recommendation\nalgorithm or model (and consequently recommended items), and user interactions\nwith the system. In three small experiments covering recruitment and\nentertainment domains, we study the pervasiveness of the aforementioned biases.\nWe ultimately advocate for a prejudice-free consideration of cognitive biases\nto improve user and item models as well as recommendation algorithms.\n","authors":["Markus Schedl","Oleg Lesota","Stefan Brandl","Mohammad Lotfi","Gustavo Junior Escobedo Ticona","Shahed Masoudian"],"pdf_url":"https://arxiv.org/pdf/2408.12492v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17008v1","updated":"2024-08-30T04:40:35Z","published":"2024-08-30T04:40:35Z","title":"Evaluation of Table Representations to Answer Questions from Tables in\n Documents : A Case Study using 3GPP Specifications","summary":" With the ubiquitous use of document corpora for question answering, one\nimportant aspect which is especially relevant for technical documents is the\nability to extract information from tables which are interspersed with text.\nThe major challenge in this is that unlike free-flow text or isolated set of\ntables, the representation of a table in terms of what is a relevant chunk is\nnot obvious. 
We conduct a series of experiments examining various\nrepresentations of tabular data interspersed with text to understand the\nrelative benefits of different representations. We choose a corpus of $3^{rd}$\nGeneration Partnership Project (3GPP) documents since they are heavily\ninterspersed with tables. We create expert curated dataset of question answers\nto evaluate our approach. We conclude that row level representations with\ncorresponding table header information being included in every cell improves\nthe performance of the retrieval, thus leveraging the structural information\npresent in the tabular data.\n","authors":["Sujoy Roychowdhury","Sumit Soman","HG Ranjani","Avantika Sharma","Neeraj Gunda","Sai Krishna Bala"],"pdf_url":"https://arxiv.org/pdf/2408.17008v1.pdf","comment":"10 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.06051v2","updated":"2024-08-30T03:19:26Z","published":"2024-08-12T10:55:42Z","title":"Perceptual Similarity for Measuring Decision-Making Style and Policy\n Diversity in Games","summary":" Defining and measuring decision-making styles, also known as playstyles, is\ncrucial in gaming, where these styles reflect a broad spectrum of individuality\nand diversity. However, finding a universally applicable measure for these\nstyles poses a challenge. Building on Playstyle Distance, the first\nunsupervised metric to measure playstyle similarity based on game screens and\nraw actions, we introduce three enhancements to increase accuracy: multiscale\nanalysis with varied state granularity, a perceptual kernel rooted in\npsychology, and the utilization of the intersection-over-union method for\nefficient evaluation. These innovations not only advance measurement precision\nbut also offer insights into human cognition of similarity. Across two racing\ngames and seven Atari games, our techniques significantly improve the precision\nof zero-shot playstyle classification, achieving an accuracy exceeding 90\npercent with fewer than 512 observation-action pairs, which is less than half\nan episode of these games. Furthermore, our experiments with 2048 and Go\ndemonstrate the potential of discrete playstyle measures in puzzle and board\ngames. We also develop an algorithm for assessing decision-making diversity\nusing these measures. Our findings improve the measurement of end-to-end game\nanalysis and the evolution of artificial intelligence for diverse playstyles.\n","authors":["Chiu-Chou Lin","Wei-Chen Chiu","I-Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.06051v2.pdf","comment":"TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49"},{"id":"http://arxiv.org/abs/2408.16672v2","updated":"2024-08-30T18:14:24Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. In\nthis paper, we introduce a novel architecture and a training framework to\nsupport long context window and multilingual retrieval. 
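The conclusion of the 3GPP table-QA study above (row-level chunks with the table header repeated for every cell) can be illustrated with a small serialization helper. The exact formatting used in the paper is not given here, so the "header: value" layout and the toy rows are assumptions.

def table_to_row_chunks(headers, rows):
    """One retrievable text chunk per table row, with the column header
    repeated next to every cell so each chunk is self-describing."""
    chunks = []
    for row in rows:
        chunks.append(" | ".join(f"{h}: {v}" for h, v in zip(headers, row)))
    return chunks

headers = ["Parameter", "Value", "Unit"]              # toy spec-style table, not from 3GPP
rows = [["maxHARQ-Tx", "4", "transmissions"],
        ["periodicBSR-Timer", "10", "subframes"]]
for chunk in table_to_row_chunks(headers, rows):
    print(chunk)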
Leveraging Matryoshka\nRepresentation Loss, we further demonstrate that reducing the embedding\ndimensionality from 128 to 64 has an insignificant impact on the model's retrieval\nperformance and cuts storage requirements by up to 50%. Our new model,\nJina-ColBERT-v2, demonstrates strong performance across a range of English and\nmultilingual retrieval tasks.\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Georgios Mastrapas","Saba Sturua","Isabelle Mohr","Andreas Koukounas","Mohammad Kalim Akram","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00164v1","updated":"2024-08-30T16:54:06Z","published":"2024-08-30T16:54:06Z","title":"Facilitating phenotyping from clinical texts: the medkit library","summary":" Phenotyping consists in applying algorithms to identify individuals\nassociated with a specific, potentially complex, trait or condition, typically\nout of a collection of Electronic Health Records (EHRs). Because much of the\nclinical information in EHRs lies in text, phenotyping from text plays an\nimportant role in studies that rely on the secondary use of EHRs. However, the\nheterogeneity and highly specialized aspect of both the content and form of\nclinical texts make this task particularly tedious and are a source of time\nand cost constraints in observational studies. To facilitate the development,\nevaluation and reproducibility of phenotyping pipelines, we developed an\nopen-source Python library named medkit. It enables composing data processing\npipelines made of easy-to-reuse software bricks, named medkit operations. In\naddition to the core of the library, we share the operations and pipelines we\nalready developed and invite the phenotyping community to reuse and\nenrich them. medkit is available at https://github.com/medkit-lib/medkit\n","authors":["Antoine Neuraz","Ghislain Vaillant","Camila Arias","Olivier Birot","Kim-Tam Huynh","Thibaut Fabacher","Alice Rogier","Nicolas Garcelon","Ivan Lerner","Bastien Rance","Adrien Coulet"],"pdf_url":"https://arxiv.org/pdf/2409.00164v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2402.17295v2","updated":"2024-08-30T17:36:49Z","published":"2024-02-27T08:16:17Z","title":"Quantum Distance Approximation for Persistence Diagrams","summary":" Topological Data Analysis methods can be useful for classification and\nclustering tasks in many different fields as they can provide two-dimensional\npersistence diagrams that summarize important information about the shape of\npotentially complex and high dimensional data sets. The space of persistence\ndiagrams can be endowed with various metrics such as the Wasserstein distance\nwhich admit a statistical structure and allow these summaries to be used in\nmachine learning algorithms. However, computing the distance between two\npersistence diagrams involves finding an optimal way to match the points of the\ntwo diagrams and may not always be an easy task for classical computers. In\nthis work we explore the potential of quantum computers to estimate the\ndistance between persistence diagrams, in particular we propose variational\nquantum algorithms for the Wasserstein distance as well as the $d^{c}_{p}$\ndistance. 
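At inference time, the Matryoshka-style dimension reduction mentioned for Jina-ColBERT-v2 above amounts to slicing the leading components of each token embedding and re-normalizing, which is where the roughly 50% storage saving comes from. A small numpy illustration with made-up vectors:

import numpy as np

def truncate_embeddings(emb, dim):
    """Keep the first `dim` components of each vector and re-normalize,
    as is done with Matryoshka-trained representations at inference time."""
    cut = emb[:, :dim]
    return cut / np.linalg.norm(cut, axis=1, keepdims=True)

rng = np.random.default_rng(0)
token_vecs = rng.normal(size=(100, 128))             # e.g. 100 token-level vectors
small = truncate_embeddings(token_vecs, 64)
print(small.shape, small.nbytes / token_vecs.nbytes) # (100, 64) 0.5 -> half the storage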
Our implementation is a weighted version of the Quantum Approximate\nOptimization Algorithm that relies on control clauses to encode the constraints\nof the optimization problem.\n","authors":["Bernardo Ameneyro","Rebekah Herrman","George Siopsis","Vasileios Maroulas"],"pdf_url":"https://arxiv.org/pdf/2402.17295v2.pdf","comment":"39 pages, 12 figures, 2 tables, submitted to Journal of Physics:\n Complexity"},{"id":"http://arxiv.org/abs/2408.17432v1","updated":"2024-08-30T17:34:46Z","published":"2024-08-30T17:34:46Z","title":"SelectTTS: Synthesizing Anyone's Voice via Discrete Unit-Based Frame\n Selection","summary":" Synthesizing the voices of unseen speakers is a persisting challenge in\nmulti-speaker text-to-speech (TTS). Most multi-speaker TTS models rely on\nmodeling speaker characteristics through speaker conditioning during training.\nModeling unseen speaker attributes through this approach has necessitated an\nincrease in model complexity, which makes it challenging to reproduce results\nand improve upon them. We design a simple alternative to this. We propose\nSelectTTS, a novel method to select the appropriate frames from the target\nspeaker and decode using frame-level self-supervised learning (SSL) features.\nWe show that this approach can effectively capture speaker characteristics for\nunseen speakers, and achieves comparable results to other multi-speaker TTS\nframeworks in both objective and subjective metrics. With SelectTTS, we show\nthat frame selection from the target speaker's speech is a direct way to\nachieve generalization in unseen speakers with low model complexity. We achieve\nbetter speaker similarity performance than SOTA baselines XTTS-v2 and VALL-E\nwith over an 8x reduction in model parameters and a 270x reduction in training\ndata\n","authors":["Ismail Rasim Ulgen","Shreeram Suresh Chandra","Junchen Lu","Berrak Sisman"],"pdf_url":"https://arxiv.org/pdf/2408.17432v1.pdf","comment":"Submitted to IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2310.19704v3","updated":"2024-08-30T17:22:01Z","published":"2023-10-30T16:29:47Z","title":"A Survey on Knowledge Editing of Neural Networks","summary":" Deep neural networks are becoming increasingly pervasive in academia and\nindustry, matching and surpassing human performance on a wide variety of fields\nand related tasks. However, just as humans, even the largest artificial neural\nnetworks make mistakes, and once-correct predictions can become invalid as the\nworld progresses in time. Augmenting datasets with samples that account for\nmistakes or up-to-date information has become a common workaround in practical\napplications. However, the well-known phenomenon of catastrophic forgetting\nposes a challenge in achieving precise changes in the implicitly memorized\nknowledge of neural network parameters, often requiring a full model\nre-training to achieve desired behaviors. That is expensive, unreliable, and\nincompatible with the current trend of large self-supervised pre-training,\nmaking it necessary to find more efficient and effective methods for adapting\nneural network models to changing data. To address this need, knowledge editing\nis emerging as a novel area of research that aims to enable reliable,\ndata-efficient, and fast changes to a pre-trained target model, without\naffecting model behaviors on previously learned tasks. In this survey, we\nprovide a brief review of this recent artificial intelligence field of\nresearch. 
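Frame selection as summarized for SelectTTS above can be approximated by a nearest-neighbour lookup in frame-level SSL feature space: for each source frame, take the most similar frame from the target speaker's recordings and hand the selected frames to the decoder. The cosine-similarity criterion and the shapes below are assumptions for illustration.

import numpy as np

def select_frames(source_feats, target_feats):
    """For every source frame (row), return the index of the most similar
    target-speaker frame under cosine similarity."""
    s = source_feats / np.linalg.norm(source_feats, axis=1, keepdims=True)
    t = target_feats / np.linalg.norm(target_feats, axis=1, keepdims=True)
    return np.argmax(s @ t.T, axis=1)

rng = np.random.default_rng(0)
source = rng.normal(size=(50, 768))    # frame-level SSL features for the utterance to synthesize
target = rng.normal(size=(400, 768))   # frames available from the unseen target speaker
selected = target[select_frames(source, target)]
print(selected.shape)                  # (50, 768) -> frames passed on to the decoder/vocoder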
We first introduce the problem of editing neural networks, formalize\nit in a common framework and differentiate it from more notorious branches of\nresearch such as continuous learning. Next, we provide a review of the most\nrelevant knowledge editing approaches and datasets proposed so far, grouping\nworks under four different families: regularization techniques, meta-learning,\ndirect model editing, and architectural strategies. Finally, we outline some\nintersections with other fields of research and potential directions for future\nworks.\n","authors":["Vittorio Mazzia","Alessandro Pedrani","Andrea Caciolai","Kay Rottmann","Davide Bernardi"],"pdf_url":"https://arxiv.org/pdf/2310.19704v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12212v2","updated":"2024-08-30T17:02:11Z","published":"2024-03-18T19:53:56Z","title":"Evaluating Named Entity Recognition: A comparative analysis of mono- and\n multilingual transformer models on a novel Brazilian corporate earnings call\n transcripts dataset","summary":" Since 2018, when the Transformer architecture was introduced, Natural\nLanguage Processing has gained significant momentum with pre-trained\nTransformer-based models that can be fine-tuned for various tasks. Most models\nare pre-trained on large English corpora, making them less applicable to other\nlanguages, such as Brazilian Portuguese. In our research, we identified two\nmodels pre-trained in Brazilian Portuguese (BERTimbau and PTT5) and two\nmultilingual models (mBERT and mT5). BERTimbau and mBERT use only the Encoder\nmodule, while PTT5 and mT5 use both the Encoder and Decoder. Our study aimed to\nevaluate their performance on a financial Named Entity Recognition (NER) task\nand determine the computational requirements for fine-tuning and inference. To\nthis end, we developed the Brazilian Financial NER (BraFiNER) dataset,\ncomprising sentences from Brazilian banks' earnings calls transcripts annotated\nusing a weakly supervised approach. Additionally, we introduced a novel\napproach that reframes the token classification task as a text generation\nproblem. After fine-tuning the models, we evaluated them using performance and\nerror metrics. Our findings reveal that BERT-based models consistently\noutperform T5-based models. While the multilingual models exhibit comparable\nmacro F1-scores, BERTimbau demonstrates superior performance over PTT5. In\nterms of error metrics, BERTimbau outperforms the other models. We also\nobserved that PTT5 and mT5 generated sentences with changes in monetary and\npercentage values, highlighting the importance of accuracy and consistency in\nthe financial domain. Our findings provide insights into the differing\nperformance of BERT- and T5-based models for the NER task.\n","authors":["Ramon Abilio","Guilherme Palermo Coelho","Ana Estela Antunes da Silva"],"pdf_url":"https://arxiv.org/pdf/2403.12212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03496v9","updated":"2024-08-30T16:45:58Z","published":"2024-02-05T20:15:19Z","title":"Can We Remove the Square-Root in Adaptive Gradient Methods? A\n Second-Order Perspective","summary":" Adaptive gradient optimizers like Adam(W) are the default training algorithms\nfor many deep learning architectures, such as transformers. Their diagonal\npreconditioner is based on the gradient outer product which is incorporated\ninto the parameter update via a square root. While these methods are often\nmotivated as approximate second-order methods, the square root represents a\nfundamental difference. 
In this work, we investigate how the behavior of\nadaptive methods changes when we remove the root, i.e., strengthen their\nsecond-order motivation. Surprisingly, we find that such square-root-free\nadaptive methods close the generalization gap to SGD on convolutional\narchitectures, while maintaining their root-based counterpart's performance on\ntransformers. The second-order perspective also has practical benefits for\ndeveloping non-diagonal methods that can incorporate arbitrary curvature\napproximations through the concept of preconditioner invariance. In contrast to\nroot-based methods like Shampoo, root-free counterparts work well and fast with\nhalf-precision since they do not require numerically unstable matrix root\ndecompositions and inversions. Overall, our findings provide new insights into\nthe development of adaptive methods and raise important questions regarding the\noverlooked role of adaptivity in their success. (experiment code:\nhttps://github.com/yorkerlin/remove-the-square-root optimizer code:\nhttps://github.com/f-dangel/sirfshampoo)\n","authors":["Wu Lin","Felix Dangel","Runa Eschenhagen","Juhan Bae","Richard E. Turner","Alireza Makhzani"],"pdf_url":"https://arxiv.org/pdf/2402.03496v9.pdf","comment":"A long version of the ICML 2024 paper. Added root-free update schemes\n for n-dim tensor cases"},{"id":"http://arxiv.org/abs/2405.02175v3","updated":"2024-08-30T16:40:15Z","published":"2024-05-03T15:25:48Z","title":"Hoaxpedia: A Unified Wikipedia Hoax Articles Dataset","summary":" Hoaxes are a recognised form of disinformation created deliberately, with\npotential serious implications in the credibility of reference knowledge\nresources such as Wikipedia. What makes detecting Wikipedia hoaxes hard is that\nthey often are written according to the official style guidelines. In this\nwork, we first provide a systematic analysis of similarities and discrepancies\nbetween legitimate and hoax Wikipedia articles, and introduce Hoaxpedia, a\ncollection of 311 hoax articles (from existing literature and official\nWikipedia lists), together with semantically similar legitimate articles, which\ntogether form a binary text classification dataset aimed at fostering research\nin automated hoax detection. In this paper, We report results after analyzing\nseveral language models, hoax-to-legit ratios, and the amount of text\nclassifiers are exposed to (full article vs the article's definition alone).\nOur results suggest that detecting deceitful content in Wikipedia based on\ncontent alone is hard but feasible, and complement our analysis with a study on\nthe differences in distributions in edit histories, and find that looking at\nthis feature yields better classification results than context.\n","authors":["Hsuvas Borkakoty","Luis Espinosa-Anke"],"pdf_url":"https://arxiv.org/pdf/2405.02175v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17396v1","updated":"2024-08-30T16:30:00Z","published":"2024-08-30T16:30:00Z","title":"Fairness-Aware Estimation of Graphical Models","summary":" This paper examines the issue of fairness in the estimation of graphical\nmodels (GMs), particularly Gaussian, Covariance, and Ising models. These models\nplay a vital role in understanding complex relationships in high-dimensional\ndata. However, standard GMs can result in biased outcomes, especially when the\nunderlying data involves sensitive characteristics or protected groups. 
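The square-root question studied in the adaptive-methods abstract above is easiest to see side by side: an Adam-style step divides the gradient by the square root of the second-moment estimate, while the root-free variant divides by the estimate itself. Momentum, bias correction, and the learning-rate rescaling the two variants need in practice are omitted in this toy sketch.

import numpy as np

def adaptive_step(grad, v, lr=1e-3, beta2=0.999, eps=1e-8, root=True):
    """One diagonally preconditioned step on a parameter vector.
    root=True  : familiar Adam-style update, divide by sqrt of the second moment
    root=False : square-root-free variant studied in the paper"""
    v = beta2 * v + (1.0 - beta2) * grad ** 2        # second-moment estimate
    precond = np.sqrt(v) + eps if root else v + eps
    return -lr * grad / precond, v

g = np.array([0.1, -0.5, 2.0])
v = np.zeros_like(g)
print(adaptive_step(g, v, root=True)[0])    # root-based step
print(adaptive_step(g, v, root=False)[0])   # root-free step (needs a different lr scale in practice)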
To\naddress this, we introduce a comprehensive framework designed to reduce bias in\nthe estimation of GMs related to protected attributes. Our approach involves\nthe integration of the pairwise graph disparity error and a tailored loss\nfunction into a nonsmooth multi-objective optimization problem, striving to\nachieve fairness across different sensitive groups while maintaining the\neffectiveness of the GMs. Experimental evaluations on synthetic and real-world\ndatasets demonstrate that our framework effectively mitigates bias without\nundermining GMs' performance.\n","authors":["Zhuoping Zhou","Davoud Ataee Tarzanagh","Bojian Hou","Qi Long","Li Shen"],"pdf_url":"https://arxiv.org/pdf/2408.17396v1.pdf","comment":"32 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2408.17394v1","updated":"2024-08-30T16:29:09Z","published":"2024-08-30T16:29:09Z","title":"Continual learning with the neural tangent ensemble","summary":" A natural strategy for continual learning is to weigh a Bayesian ensemble of\nfixed functions. This suggests that if a (single) neural network could be\ninterpreted as an ensemble, one could design effective algorithms that learn\nwithout forgetting. To realize this possibility, we observe that a neural\nnetwork classifier with N parameters can be interpreted as a weighted ensemble\nof N classifiers, and that in the lazy regime limit these classifiers are fixed\nthroughout learning. We term these classifiers the neural tangent experts and\nshow they output valid probability distributions over the labels. We then\nderive the likelihood and posterior probability of each expert given past data.\nSurprisingly, we learn that the posterior updates for these experts are\nequivalent to a scaled and projected form of stochastic gradient descent (SGD)\nover the network weights. Away from the lazy regime, networks can be seen as\nensembles of adaptive experts which improve over time. These results offer a\nnew interpretation of neural networks as Bayesian ensembles of experts,\nproviding a principled framework for understanding and mitigating catastrophic\nforgetting in continual learning settings.\n","authors":["Ari S. Benjamin","Christian Pehle","Kyle Daruwalla"],"pdf_url":"https://arxiv.org/pdf/2408.17394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17387v1","updated":"2024-08-30T16:26:31Z","published":"2024-08-30T16:26:31Z","title":"Bayesian Optimization for Non-Convex Two-Stage Stochastic Optimization\n Problems","summary":" Bayesian optimization is a sample-efficient method for solving expensive,\nblack-box optimization problems. Stochastic programming concerns optimization\nunder uncertainty where, typically, average performance is the quantity of\ninterest. In the first stage of a two-stage problem, here-and-now decisions\nmust be made in the face of this uncertainty, while in the second stage,\nwait-and-see decisions are made after the uncertainty has been resolved. Many\nmethods in stochastic programming assume that the objective is cheap to\nevaluate and linear or convex. In this work, we apply Bayesian optimization to\nsolve non-convex, two-stage stochastic programs which are expensive to\nevaluate. We formulate a knowledge-gradient-based acquisition function to\njointly optimize the first- and second-stage variables, establish a guarantee\nof asymptotic consistency and provide a computationally efficient\napproximation. 
We demonstrate comparable empirical results to an alternative we\nformulate which alternates its focus between the two variable types, and\nsuperior empirical results over the standard, naive, two-step benchmark. We\nshow that differences in the dimension and length scales between the variable\ntypes can lead to inefficiencies of the two-step algorithm, while the joint and\nalternating acquisition functions perform well in all problems tested.\nExperiments are conducted on both synthetic and real-world examples.\n","authors":["Jack M. Buckingham","Ivo Couckuyt","Juergen Branke"],"pdf_url":"https://arxiv.org/pdf/2408.17387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17384v1","updated":"2024-08-30T16:26:04Z","published":"2024-08-30T16:26:04Z","title":"LASSO-MOGAT: A Multi-Omics Graph Attention Framework for Cancer\n Classification","summary":" The application of machine learning methods to analyze changes in gene\nexpression patterns has recently emerged as a powerful approach in cancer\nresearch, enhancing our understanding of the molecular mechanisms underpinning\ncancer development and progression. Combining gene expression data with other\ntypes of omics data has been reported by numerous works to improve cancer\nclassification outcomes. Despite these advances, effectively integrating\nhigh-dimensional multi-omics data and capturing the complex relationships\nacross different biological layers remains challenging. This paper introduces\nLASSO-MOGAT (LASSO-Multi-Omics Gated ATtention), a novel graph-based deep\nlearning framework that integrates messenger RNA, microRNA, and DNA methylation\ndata to classify 31 cancer types. Utilizing differential expression analysis\nwith LIMMA and LASSO regression for feature selection, and leveraging Graph\nAttention Networks (GATs) to incorporate protein-protein interaction (PPI)\nnetworks, LASSO-MOGAT effectively captures intricate relationships within\nmulti-omics data. Experimental validation using five-fold cross-validation\ndemonstrates the method's precision, reliability, and capacity for providing\ncomprehensive insights into cancer molecular mechanisms. The computation of\nattention coefficients for the edges in the graph by the proposed\ngraph-attention architecture based on protein-protein interactions proved\nbeneficial for identifying synergies in multi-omics data for cancer\nclassification.\n","authors":["Fadi Alharbi","Aleksandar Vakanski","Murtada K. Elbashir","Mohanad Mohammed"],"pdf_url":"https://arxiv.org/pdf/2408.17384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17383v1","updated":"2024-08-30T16:24:27Z","published":"2024-08-30T16:24:27Z","title":"MoRe Fine-Tuning with 10x Fewer Parameters","summary":" Parameter-efficient fine-tuning (PEFT) techniques have unlocked the potential\nto cheaply and easily specialize large pretrained models. However, the most\nprominent approaches, like low-rank adapters (LoRA), depend on heuristics or\nrules-of-thumb for their architectural choices -- potentially limiting their\nperformance for new models and architectures. This limitation suggests that\ntechniques from neural architecture search could be used to obtain optimal\nadapter architectures, but these are often expensive and difficult to\nimplement. We address this challenge with Monarch Rectangular Fine-tuning\n(MoRe), a simple framework to search over adapter architectures that relies on\nthe Monarch matrix class. Theoretically, we show that MoRe is more expressive\nthan LoRA. 
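The LASSO screening stage of the LASSO-MOGAT pipeline described above can be sketched with scikit-learn; a regression toy problem stands in for the multi-class cancer setting, and the synthetic matrix below is not the paper's omics data.

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 500))                     # 200 samples x 500 gene-level features (synthetic)
true_idx = rng.choice(500, size=10, replace=False)  # features that actually drive the outcome
y = X[:, true_idx] @ rng.normal(size=10) + 0.1 * rng.normal(size=200)

# the L1 penalty drives most coefficients to exactly zero; the survivors are the
# features that would be passed on to the graph-attention stage
lasso = Lasso(alpha=0.1).fit(X, y)
selected = np.flatnonzero(lasso.coef_)
print(len(selected), "features kept; overlap with ground truth:",
      len(set(selected) & set(true_idx)))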
Empirically, our approach is more parameter-efficient and performant\nthan state-of-the-art PEFTs on a range of tasks and models, with as few as 5\\%\nof LoRA's parameters.\n","authors":["Wenxuan Tan","Nicholas Roberts","Tzu-Heng Huang","Jitian Zhao","John Cooper","Samuel Guo","Chengyu Duan","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2408.17383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17380v1","updated":"2024-08-30T16:16:57Z","published":"2024-08-30T16:16:57Z","title":"Traffic expertise meets residual RL: Knowledge-informed model-based\n residual reinforcement learning for CAV trajectory control","summary":" Model-based reinforcement learning (RL) is anticipated to exhibit higher\nsample efficiency compared to model-free RL by utilizing a virtual environment\nmodel. However, it is challenging to obtain sufficiently accurate\nrepresentations of the environmental dynamics due to uncertainties in complex\nsystems and environments. An inaccurate environment model may degrade the\nsample efficiency and performance of model-based RL. Furthermore, while\nmodel-based RL can improve sample efficiency, it often still requires\nsubstantial training time to learn from scratch, potentially limiting its\nadvantages over model-free approaches. To address these challenges, this paper\nintroduces a knowledge-informed model-based residual reinforcement learning\nframework aimed at enhancing learning efficiency by infusing established expert\nknowledge into the learning process and avoiding the issue of beginning from\nzero. Our approach integrates traffic expert knowledge into a virtual\nenvironment model, employing the Intelligent Driver Model (IDM) for basic\ndynamics and neural networks for residual dynamics, thus ensuring adaptability\nto complex scenarios. We propose a novel strategy that combines traditional\ncontrol methods with residual RL, facilitating efficient learning and policy\noptimization without the need to learn from scratch. The proposed approach is\napplied to CAV trajectory control tasks for the dissipation of stop-and-go\nwaves in mixed traffic flow. Experimental results demonstrate that our proposed\napproach enables the CAV agent to achieve superior performance in trajectory\ncontrol compared to the baseline agents in terms of sample efficiency, traffic\nflow smoothness and traffic mobility. The source code and supplementary\nmaterials are available at https://github.com/zihaosheng/traffic-expertise-RL/.\n","authors":["Zihao Sheng","Zilin Huang","Sikai Chen"],"pdf_url":"https://arxiv.org/pdf/2408.17380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17376v1","updated":"2024-08-30T16:12:57Z","published":"2024-08-30T16:12:57Z","title":"Exploring the Impact of Environmental Pollutants on Multiple Sclerosis\n Progression","summary":" Multiple Sclerosis (MS) is a chronic autoimmune and inflammatory neurological\ndisorder characterised by episodes of symptom exacerbation, known as relapses.\nIn this study, we investigate the role of environmental factors in relapse\noccurrence among MS patients, using data from the H2020 BRAINTEASER project. We\nemployed predictive models, including Random Forest (RF) and Logistic\nRegression (LR), with varying sets of input features to predict the occurrence\nof relapses based on clinical and pollutant data collected over a week. The RF\nyielded the best result, with an AUC-ROC score of 0.713. 
Environmental\nvariables, such as precipitation, NO2, PM2.5, humidity, and temperature, were\nfound to be relevant to the prediction.\n","authors":["Elena Marinello","Erica Tavazzi","Enrico Longato","Pietro Bosoni","Arianna Dagliati","Mahin Vazifehdan","Riccardo Bellazzi","Isotta Trescato","Alessandro Guazzo","Martina Vettoretti","Eleonora Tavazzi","Lara Ahmad","Roberto Bergamaschi","Paola Cavalla","Umberto Manera","Adriano Chio","Barbara Di Camillo"],"pdf_url":"https://arxiv.org/pdf/2408.17376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05955v2","updated":"2024-08-30T16:04:04Z","published":"2023-02-12T16:55:58Z","title":"Recursive Estimation of Conditional Kernel Mean Embeddings","summary":" Kernel mean embeddings, a widely used technique in machine learning, map\nprobability distributions to elements of a reproducing kernel Hilbert space\n(RKHS). For supervised learning problems, where input-output pairs are\nobserved, the conditional distribution of outputs given the inputs is a key\nobject. The input dependent conditional distribution of an output can be\nencoded with an RKHS valued function, the conditional kernel mean map. In this\npaper we present a new recursive algorithm to estimate the conditional kernel\nmean map in a Hilbert space valued $L_2$ space, that is in a Bochner space. We\nprove the weak and strong $L_2$ consistency of our recursive estimator under\nmild conditions. The idea is to generalize Stone's theorem for Hilbert space\nvalued regression in a locally compact Polish space. We present new insights\nabout conditional kernel mean embeddings and give strong asymptotic bounds\nregarding the convergence of the proposed recursive method. Finally, the\nresults are demonstrated on three application domains: for inputs coming from\nEuclidean spaces, Riemannian manifolds and locally compact subsets of function\nspaces.\n","authors":["Ambrus Tamás","Balázs Csanád Csáji"],"pdf_url":"https://arxiv.org/pdf/2302.05955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09102v3","updated":"2024-08-30T16:03:53Z","published":"2022-07-19T06:49:24Z","title":"Complexity of High-Dimensional Identity Testing with Coordinate\n Conditional Sampling","summary":" We study the identity testing problem for high-dimensional distributions.\nGiven as input an explicit distribution $\\mu$, an $\\varepsilon>0$, and access\nto sampling oracle(s) for a hidden distribution $\\pi$, the goal in identity\ntesting is to distinguish whether the two distributions $\\mu$ and $\\pi$ are\nidentical or are at least $\\varepsilon$-far apart. When there is only access to\nfull samples from the hidden distribution $\\pi$, it is known that exponentially\nmany samples (in the dimension) may be needed for identity testing, and hence\nprevious works have studied identity testing with additional access to various\n\"conditional\" sampling oracles. 
We consider a significantly weaker conditional\nsampling oracle, which we call the $\\mathsf{Coordinate\\ Oracle}$, and provide a\ncomputational and statistical characterization of the identity testing problem\nin this new model.\n We prove that if an analytic property known as approximate tensorization of\nentropy holds for an $n$-dimensional visible distribution $\\mu$, then there is\nan efficient identity testing algorithm for any hidden distribution $\\pi$ using\n$\\tilde{O}(n/\\varepsilon)$ queries to the $\\mathsf{Coordinate\\ Oracle}$.\nApproximate tensorization of entropy is a pertinent condition as recent works\nhave established it for a large class of high-dimensional distributions. We\nalso prove a computational phase transition: for a well-studied class of\n$n$-dimensional distributions, specifically sparse antiferromagnetic Ising\nmodels over $\\{+1,-1\\}^n$, we show that in the regime where approximate\ntensorization of entropy fails, there is no efficient identity testing\nalgorithm unless $\\mathsf{RP}=\\mathsf{NP}$. We complement our results with a\nmatching $\\Omega(n/\\varepsilon)$ statistical lower bound for the sample\ncomplexity of identity testing in the $\\mathsf{Coordinate\\ Oracle}$ model.\n","authors":["Antonio Blanca","Zongchen Chen","Daniel Štefankovič","Eric Vigoda"],"pdf_url":"https://arxiv.org/pdf/2207.09102v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17366v1","updated":"2024-08-30T15:54:50Z","published":"2024-08-30T15:54:50Z","title":"Leveraging Graph Neural Networks to Forecast Electricity Consumption","summary":" Accurate electricity demand forecasting is essential for several reasons,\nespecially as the integration of renewable energy sources and the transition to\na decentralized network paradigm introduce greater complexity and uncertainty.\nThe proposed methodology leverages graph-based representations to effectively\ncapture the spatial distribution and relational intricacies inherent in this\ndecentralized network structure. This research work offers a novel approach\nthat extends beyond the conventional Generalized Additive Model framework by\nconsidering models like Graph Convolutional Networks or Graph SAGE. These\ngraph-based models enable the incorporation of various levels of\ninterconnectedness and information sharing among nodes, where each node\ncorresponds to the combined load (i.e. consumption) of a subset of consumers\n(e.g. the regions of a country). More specifically, we introduce a range of\nmethods for inferring graphs tailored to consumption forecasting, along with a\nframework for evaluating the developed models in terms of both performance and\nexplainability. We conduct experiments on electricity forecasting, in both a\nsynthetic and a real framework considering the French mainland regions, and the\nperformance and merits of our approach are discussed.\n","authors":["Eloi Campagne","Yvenn Amara-Ouali","Yannig Goude","Argyris Kalogeratos"],"pdf_url":"https://arxiv.org/pdf/2408.17366v1.pdf","comment":"17 pages, ECML PKDD 2024 Workshop paper"},{"id":"http://arxiv.org/abs/2408.17358v1","updated":"2024-08-30T15:49:31Z","published":"2024-08-30T15:49:31Z","title":"Hold Me Tight: Stable Encoder-Decoder Design for Speech Enhancement","summary":" Convolutional layers with 1-D filters are often used as frontend to encode\naudio signals. Unlike fixed time-frequency representations, they can adapt to\nthe local characteristics of input data. However, 1-D filters on raw audio are\nhard to train and often suffer from instabilities. 
In this paper, we address\nthese problems with hybrid solutions, i.e., combining theory-driven and\ndata-driven approaches. First, we preprocess the audio signals via a auditory\nfilterbank, guaranteeing good frequency localization for the learned encoder.\nSecond, we use results from frame theory to define an unsupervised learning\nobjective that encourages energy conservation and perfect reconstruction.\nThird, we adapt mixed compressed spectral norms as learning objectives to the\nencoder coefficients. Using these solutions in a low-complexity\nencoder-mask-decoder model significantly improves the perceptual evaluation of\nspeech quality (PESQ) in speech enhancement.\n","authors":["Daniel Haider","Felix Perfler","Vincent Lostanlen","Martin Ehler","Peter Balazs"],"pdf_url":"https://arxiv.org/pdf/2408.17358v1.pdf","comment":"Accepted at INTERSPEECH 2024"},{"id":"http://arxiv.org/abs/2406.17585v2","updated":"2024-08-30T15:45:11Z","published":"2024-06-25T14:28:17Z","title":"Learning Dynamic Bayesian Networks from Data: Foundations, First\n Principles and Numerical Comparisons","summary":" In this paper, we present a guide to the foundations of learning Dynamic\nBayesian Networks (DBNs) from data in the form of multiple samples of\ntrajectories for some length of time. We present the formalism for a generic as\nwell as a set of common types of DBNs for particular variable distributions. We\npresent the analytical form of the models, with a comprehensive discussion on\nthe interdependence between structure and weights in a DBN model and their\nimplications for learning. Next, we give a broad overview of learning methods\nand describe and categorize them based on the most important statistical\nfeatures, and how they treat the interplay between learning structure and\nweights. We give the analytical form of the likelihood and Bayesian score\nfunctions, emphasizing the distinction from the static case. We discuss\nfunctions used in optimization to enforce structural requirements. We briefly\ndiscuss more complex extensions and representations. Finally we present a set\nof comparisons in different settings for various distinct but representative\nalgorithms across the variants.\n","authors":["Vyacheslav Kungurtsev","Fadwa Idlahcen","Petr Rysavy","Pavel Rytir","Ales Wodecki"],"pdf_url":"https://arxiv.org/pdf/2406.17585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17356v1","updated":"2024-08-30T15:39:37Z","published":"2024-08-30T15:39:37Z","title":"C-RADAR: A Centralized Deep Learning System for Intrusion Detection in\n Software Defined Networks","summary":" The popularity of Software Defined Networks (SDNs) has grown in recent years,\nmainly because of their ability to simplify network management and improve\nnetwork flexibility. However, this also makes them vulnerable to various types\nof cyber attacks. SDNs work on a centralized control plane which makes them\nmore prone to network attacks. Research has demonstrated that deep learning\n(DL) methods can be successful in identifying intrusions in conventional\nnetworks, but their application in SDNs is still an open research area. In this\nresearch, we propose the use of DL techniques for intrusion detection in SDNs.\nWe measure the effectiveness of our method by experimentation on a dataset of\nnetwork traffic and comparing it to existing techniques. Our results show that\nthe DL-based approach outperforms traditional methods in terms of detection\naccuracy and computational efficiency. 
The deep learning architecture that has\nbeen used in this research is a Long Short Term Memory Network and\nSelf-Attention based architecture i.e. LSTM-Attn which achieves an Fl-score of\n0.9721. Furthermore, this technique can be trained to detect new attack\npatterns and improve the overall security of SDNs.\n","authors":["Osama Mustafa","Khizer Ali","Talha Naqash"],"pdf_url":"https://arxiv.org/pdf/2408.17356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17355v1","updated":"2024-08-30T15:39:34Z","published":"2024-08-30T15:39:34Z","title":"Bidirectional Decoding: Improving Action Chunking via Closed-Loop\n Resampling","summary":" Predicting and executing a sequence of actions without intermediate\nreplanning, known as action chunking, is increasingly used in robot learning\nfrom human demonstrations. However, its effects on learned policies remain\npuzzling: some studies highlight its importance for achieving strong\nperformance, while others observe detrimental effects. In this paper, we first\ndissect the role of action chunking by analyzing the divergence between the\nlearner and the demonstrator. We find that longer action chunks enable a policy\nto better capture temporal dependencies by taking into account more past states\nand actions within the chunk. However, this advantage comes at the cost of\nexacerbating errors in stochastic environments due to fewer observations of\nrecent states. To address this, we propose Bidirectional Decoding (BID), a\ntest-time inference algorithm that bridges action chunking with closed-loop\noperations. BID samples multiple predictions at each time step and searches for\nthe optimal one based on two criteria: (i) backward coherence, which favors\nsamples aligned with previous decisions, (ii) forward contrast, which favors\nsamples close to outputs of a stronger policy and distant from those of a\nweaker policy. By coupling decisions within and across action chunks, BID\nenhances temporal consistency over extended sequences while enabling adaptive\nreplanning in stochastic environments. Experimental results show that BID\nsubstantially outperforms conventional closed-loop operations of two\nstate-of-the-art generative policies across seven simulation benchmarks and two\nreal-world tasks.\n","authors":["Yuejiang Liu","Jubayer Ibn Hamid","Annie Xie","Yoonho Lee","Maximilian Du","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2408.17355v1.pdf","comment":"Project website: https://bid-robot.github.io/"},{"id":"http://arxiv.org/abs/2408.17354v1","updated":"2024-08-30T15:35:09Z","published":"2024-08-30T15:35:09Z","title":"Forget to Flourish: Leveraging Machine-Unlearning on Pretrained Language\n Models for Privacy Leakage","summary":" Fine-tuning large language models on private data for downstream applications\nposes significant privacy risks in potentially exposing sensitive information.\nSeveral popular community platforms now offer convenient distribution of a\nlarge variety of pre-trained models, allowing anyone to publish without\nrigorous verification. This scenario creates a privacy threat, as pre-trained\nmodels can be intentionally crafted to compromise the privacy of fine-tuning\ndatasets. In this study, we introduce a novel poisoning technique that uses\nmodel-unlearning as an attack tool. This approach manipulates a pre-trained\nlanguage model to increase the leakage of private data during the fine-tuning\nprocess. Our method enhances both membership inference and data extraction\nattacks while preserving model utility. 
Experimental results across different\nmodels, datasets, and fine-tuning setups demonstrate that our attacks\nsignificantly surpass baseline performance. This work serves as a cautionary\nnote for users who download pre-trained models from unverified sources,\nhighlighting the potential risks involved.\n","authors":["Md Rafi Ur Rashid","Jing Liu","Toshiaki Koike-Akino","Shagufta Mehnaz","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2408.17354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05241v6","updated":"2024-08-30T15:17:11Z","published":"2024-04-08T07:11:33Z","title":"LightFF: Lightweight Inference for Forward-Forward Algorithm","summary":" The human brain performs tasks with an outstanding energy efficiency, i.e.,\nwith approximately 20 Watts. The state-of-the-art Artificial/Deep Neural\nNetworks (ANN/DNN), on the other hand, have recently been shown to consume\nmassive amounts of energy. The training of these ANNs/DNNs is done almost\nexclusively based on the back-propagation algorithm, which is known to be\nbiologically implausible. This has led to a new generation of forward-only\ntechniques, including the Forward-Forward algorithm. In this paper, we propose\na lightweight inference scheme specifically designed for DNNs trained using the\nForward-Forward algorithm. We have evaluated our proposed lightweight inference\nscheme in the case of the MNIST and CIFAR datasets, as well as two real-world\napplications, namely, epileptic seizure detection and cardiac arrhythmia\nclassification using wearable technologies, where complexity overheads/energy\nconsumption is a major constraint, and demonstrate its relevance. Our code is\navailable at https://github.com/AminAminifar/LightFF.\n","authors":["Amin Aminifar","Baichuan Huang","Azra Abtahi","Amir Aminifar"],"pdf_url":"https://arxiv.org/pdf/2404.05241v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17337v1","updated":"2024-08-30T15:02:22Z","published":"2024-08-30T15:02:22Z","title":"Evaluating Reliability in Medical DNNs: A Critical Analysis of Feature\n and Confidence-Based OOD Detection","summary":" Reliable use of deep neural networks (DNNs) for medical image analysis\nrequires methods to identify inputs that differ significantly from the training\ndata, called out-of-distribution (OOD), to prevent erroneous predictions. OOD\ndetection methods can be categorised as either confidence-based (using the\nmodel's output layer for OOD detection) or feature-based (not using the output\nlayer). We created two new OOD benchmarks by dividing the D7P (dermatology) and\nBreastMNIST (ultrasound) datasets into subsets which either contain or don't\ncontain an artefact (rulers or annotations respectively). Models were trained\nwith artefact-free images, and images with the artefacts were used as OOD test\nsets. For each OOD image, we created a counterfactual by manually removing the\nartefact via image processing, to assess the artefact's impact on the model's\npredictions. We show that OOD artefacts can boost a model's softmax confidence\nin its predictions, due to correlations in training data among other factors.\nThis contradicts the common assumption that OOD artefacts should lead to more\nuncertain outputs, an assumption on which most confidence-based methods rely.\nWe use this to explain why feature-based methods (e.g. Mahalanobis score)\ntypically have greater OOD detection performance than confidence-based methods\n(e.g. MCP). 
However, we also show that feature-based methods typically perform\nworse at distinguishing between inputs that lead to correct and incorrect\npredictions (for both OOD and ID data). Following from these insights, we argue\nthat a combination of feature-based and confidence-based methods should be used\nwithin DNN pipelines to mitigate their respective weaknesses. These project's\ncode and OOD benchmarks are available at:\nhttps://github.com/HarryAnthony/Evaluating_OOD_detection.\n","authors":["Harry Anthony","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2408.17337v1.pdf","comment":"Accepted for the Uncertainty for Safe Utilization of Machine Learning\n in Medical Imaging (UNSURE 2024) workshop at the MICCAI 2023"},{"id":"http://arxiv.org/abs/2403.06087v2","updated":"2024-08-30T14:46:41Z","published":"2024-03-10T04:17:42Z","title":"Learning the irreversible progression trajectory of Alzheimer's disease","summary":" Alzheimer's disease (AD) is a progressive and irreversible brain disorder\nthat unfolds over the course of 30 years. Therefore, it is critical to capture\nthe disease progression in an early stage such that intervention can be applied\nbefore the onset of symptoms. Machine learning (ML) models have been shown\neffective in predicting the onset of AD. Yet for subjects with follow-up\nvisits, existing techniques for AD classification only aim for accurate group\nassignment, where the monotonically increasing risk across follow-up visits is\nusually ignored. Resulted fluctuating risk scores across visits violate the\nirreversibility of AD, hampering the trustworthiness of models and also\nproviding little value to understanding the disease progression. To address\nthis issue, we propose a novel regularization approach to predict AD\nlongitudinally. Our technique aims to maintain the expected monotonicity of\nincreasing disease risk during progression while preserving expressiveness.\nSpecifically, we introduce a monotonicity constraint that encourages the model\nto predict disease risk in a consistent and ordered manner across follow-up\nvisits. We evaluate our method using the longitudinal structural MRI and\namyloid-PET imaging data from the Alzheimer's Disease Neuroimaging Initiative\n(ADNI). Our model outperforms existing techniques in capturing the\nprogressiveness of disease risk, and at the same time preserves prediction\naccuracy.\n","authors":["Yipei Wang","Bing He","Shannon Risacher","Andrew Saykin","Jingwen Yan","Xiaoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06087v2.pdf","comment":"accepted by ISBI 2024"},{"id":"http://arxiv.org/abs/2408.17329v1","updated":"2024-08-30T14:42:03Z","published":"2024-08-30T14:42:03Z","title":"Estimation of Cardiac and Non-cardiac Diagnosis from Electrocardiogram\n Features","summary":" Introduction: Ensuring timely and accurate diagnosis of medical conditions is\nparamount for effective patient care. Electrocardiogram (ECG) signals are\nfundamental for evaluating a patient's cardiac health and are readily\navailable. Despite this, little attention has been given to the remarkable\npotential of ECG data in detecting non-cardiac conditions.\n Methods: In our study, we used publicly available datasets (MIMIC-IV-ECG-ICD\nand ECG-VIEW II) to investigate the feasibility of inferring general diagnostic\nconditions from ECG features. 
To this end, we trained a tree-based model\n(XGBoost) based on ECG features and basic demographic features to estimate a\nwide range of diagnoses, encompassing both cardiac and non-cardiac conditions.\n Results: Our results demonstrate the reliability of estimating 23 cardiac as\nwell as 21 non-cardiac conditions above 0.7 AUROC in a statistically\nsignificant manner across a wide range of physiological categories. Our\nfindings underscore the predictive potential of ECG data in identifying\nwell-known cardiac conditions. However, even more striking, this research\nrepresents a pioneering effort in systematically expanding the scope of\nECG-based diagnosis to conditions not traditionally associated with the cardiac\nsystem.\n","authors":["Juan Miguel Lopez Alcaraz","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2408.17329v1.pdf","comment":"4 pages, source code under https://github.com/AI4HealthUOL/CardioDiag"},{"id":"http://arxiv.org/abs/2408.05990v2","updated":"2024-08-30T14:39:24Z","published":"2024-08-12T08:33:09Z","title":"Parameters Inference for Nonlinear Wave Equations with Markovian\n Switching","summary":" Traditional partial differential equations with constant coefficients often\nstruggle to capture abrupt changes in real-world phenomena, leading to the\ndevelopment of variable coefficient PDEs and Markovian switching models.\nRecently, research has introduced the concept of PDEs with Markov switching\nmodels, established their well-posedness and presented numerical methods.\nHowever, there has been limited discussion on parameter estimation for the jump\ncoefficients in these models. This paper addresses this gap by focusing on\nparameter inference for the wave equation with Markovian switching. We propose\na Bayesian statistical framework using discrete sparse Bayesian learning to\nestablish its convergence and a uniform error bound. Our method requires fewer\nassumptions and enables independent parameter inference for each segment by\nallowing different underlying structures for the parameter estimation problem\nwithin each segmented time interval. The effectiveness of our approach is\ndemonstrated through three numerical cases, which involve noisy spatiotemporal\ndata from different wave equations with Markovian switching. The results show\nstrong performance in parameter estimation for variable coefficient PDEs.\n","authors":["Yi Zhang","Zhikun Zhang","Xiangjun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.05990v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00910v2","updated":"2024-08-30T14:37:26Z","published":"2023-12-01T20:19:12Z","title":"Effectiveness of probabilistic contact tracing in epidemic containment:\n the role of super-spreaders and transmission path reconstruction","summary":" The recent COVID-19 pandemic underscores the significance of early-stage\nnon-pharmacological intervention strategies. The widespread use of masks and\nthe systematic implementation of contact tracing strategies provide a\npotentially equally effective and socially less impactful alternative to more\nconventional approaches, such as large-scale mobility restrictions. However,\nmanual contact tracing faces strong limitations in accessing the network of\ncontacts, and the scalability of currently implemented protocols for\nsmartphone-based digital contact tracing becomes impractical during the rapid\nexpansion phases of the outbreaks, due to the surge in exposure notifications\nand associated tests. 
A substantial improvement in digital contact tracing can\nbe obtained through the integration of probabilistic techniques for risk\nassessment that can more effectively guide the allocation of new diagnostic\ntests. In this study, we first quantitatively analyze the diagnostic and social\ncosts associated with these containment measures based on contact tracing,\nemploying three state-of-the-art models of SARS-CoV-2 spreading. Our results\nsuggest that probabilistic techniques allow for more effective mitigation at a\nlower cost. Secondly, our findings reveal a remarkable efficacy of\nprobabilistic contact-tracing techniques in performing backward and multi-step\ntracing and capturing super-spreading events.\n","authors":["A. P. Muntoni","F. Mazza","A. Braunstein","G. Catania","L. Dall'Asta"],"pdf_url":"https://arxiv.org/pdf/2312.00910v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18249v2","updated":"2024-08-30T14:36:08Z","published":"2024-06-26T10:51:44Z","title":"Foundational Models for Pathology and Endoscopy Images: Application for\n Gastric Inflammation","summary":" The integration of artificial intelligence (AI) in medical diagnostics\nrepresents a significant advancement in managing upper gastrointestinal (GI)\ncancer, a major cause of global cancer mortality. Specifically for gastric\ncancer (GC), chronic inflammation causes changes in the mucosa such as atrophy,\nintestinal metaplasia (IM), dysplasia and ultimately cancer. Early detection\nthrough endoscopic regular surveillance is essential for better outcomes.\nFoundation models (FM), which are machine or deep learning models trained on\ndiverse data and applicable to broad use cases, offer a promising solution to\nenhance the accuracy of endoscopy and its subsequent pathology image analysis.\nThis review explores the recent advancements, applications, and challenges\nassociated with FM in endoscopy and pathology imaging. We started by\nelucidating the core principles and architectures underlying these models,\nincluding their training methodologies and the pivotal role of large-scale data\nin developing their predictive capabilities. Moreover, this work discusses\nemerging trends and future research directions, emphasizing the integration of\nmultimodal data, the development of more robust and equitable models, and the\npotential for real-time diagnostic support. This review aims to provide a\nroadmap for researchers and practitioners in navigating the complexities of\nincorporating FM into clinical practice for prevention/management of GC cases,\nthereby improving patient outcomes.\n","authors":["Hamideh Kerdegari","Kyle Higgins","Dennis Veselkov","Ivan Laponogov","Inese Polaka","Miguel Coimbra","Junior Andrea Pescino","Marcis Leja","Mario Dinis-Ribeiro","Tania Fleitas Kanonnikoff","Kirill Veselkov"],"pdf_url":"https://arxiv.org/pdf/2406.18249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17324v1","updated":"2024-08-30T14:35:01Z","published":"2024-08-30T14:35:01Z","title":"Modularity in Transformers: Investigating Neuron Separability &\n Specialization","summary":" Transformer models are increasingly prevalent in various applications, yet\nour understanding of their internal workings remains limited. This paper\ninvestigates the modularity and task specialization of neurons within\ntransformer architectures, focusing on both vision (ViT) and language (Mistral\n7B) models. 
Using a combination of selective pruning and MoEfication clustering\ntechniques, we analyze the overlap and specialization of neurons across\ndifferent tasks and data subsets. Our findings reveal evidence of task-specific\nneuron clusters, with varying degrees of overlap between related tasks. We\nobserve that neuron importance patterns persist to some extent even in randomly\ninitialized models, suggesting an inherent structure that training refines.\nAdditionally, we find that neuron clusters identified through MoEfication\ncorrespond more strongly to task-specific neurons in earlier and later layers\nof the models. This work contributes to a more nuanced understanding of\ntransformer internals and offers insights into potential avenues for improving\nmodel interpretability and efficiency.\n","authors":["Nicholas Pochinkov","Thomas Jones","Mohammed Rashidur Rahman"],"pdf_url":"https://arxiv.org/pdf/2408.17324v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.17322v1","updated":"2024-08-30T14:32:25Z","published":"2024-08-30T14:32:25Z","title":"Investigating Neuron Ablation in Attention Heads: The Case for Peak\n Activation Centering","summary":" The use of transformer-based models is growing rapidly throughout society.\nWith this growth, it is important to understand how they work, and in\nparticular, how the attention mechanisms represent concepts. Though there are\nmany interpretability methods, many look at models through their neuronal\nactivations, which are poorly understood. We describe different lenses through\nwhich to view neuron activations, and investigate the effectiveness in language\nmodels and vision transformers through various methods of neural ablation: zero\nablation, mean ablation, activation resampling, and a novel approach we term\n'peak ablation'. Through experimental analysis, we find that in different\nregimes and models, each method can offer the lowest degradation of model\nperformance compared to other methods, with resampling usually causing the most\nsignificant performance deterioration. We make our code available at\nhttps://github.com/nickypro/investigating-ablation.\n","authors":["Nicholas Pochinkov","Ben Pasero","Skylar Shibayama"],"pdf_url":"https://arxiv.org/pdf/2408.17322v1.pdf","comment":"9 pages, 2 figures, XAI World Conference 2024 Late-Breaking Work"},{"id":"http://arxiv.org/abs/2301.04204v2","updated":"2024-08-30T14:30:13Z","published":"2023-01-10T20:43:29Z","title":"A Newton-CG based barrier-augmented Lagrangian method for general\n nonconvex conic optimization","summary":" In this paper we consider finding an approximate second-order stationary\npoint (SOSP) of general nonconvex conic optimization that minimizes a twice\ndifferentiable function subject to nonlinear equality constraints and also a\nconvex conic constraint. In particular, we propose a Newton-conjugate gradient\n(Newton-CG) based barrier-augmented Lagrangian method for finding an\napproximate SOSP of this problem. Under some mild assumptions, we show that our\nmethod enjoys a total inner iteration complexity of $\\widetilde{\\cal\nO}(\\epsilon^{-11/2})$ and an operation complexity of $\\widetilde{\\cal\nO}(\\epsilon^{-11/2}\\min\\{n,\\epsilon^{-5/4}\\})$ for finding an\n$(\\epsilon,\\sqrt{\\epsilon})$-SOSP of general nonconvex conic optimization with\nhigh probability. 
Moreover, under a constraint qualification, these complexity\nbounds are improved to $\\widetilde{\\cal O}(\\epsilon^{-7/2})$ and\n$\\widetilde{\\cal O}(\\epsilon^{-7/2}\\min\\{n,\\epsilon^{-3/4}\\})$, respectively.\nTo the best of our knowledge, this is the first study on the complexity of\nfinding an approximate SOSP of general nonconvex conic optimization.\nPreliminary numerical results are presented to demonstrate superiority of the\nproposed method over first-order methods in terms of solution quality.\n","authors":["Chuan He","Heng Huang","Zhaosong Lu"],"pdf_url":"https://arxiv.org/pdf/2301.04204v2.pdf","comment":"To appear in Computational Optimization and Applications. arXiv admin\n note: text overlap with arXiv:2301.03139"},{"id":"http://arxiv.org/abs/2401.05218v2","updated":"2024-08-30T14:27:33Z","published":"2024-01-10T15:34:42Z","title":"Invariant Causal Prediction with Local Models","summary":" We consider the task of identifying the causal parents of a target variable\namong a set of candidates from observational data. Our main assumption is that\nthe candidate variables are observed in different environments which may, under\ncertain assumptions, be regarded as interventions on the observed system. We\nassume a linear relationship between target and candidates, which can be\ndifferent in each environment with the only restriction that the causal\nstructure is invariant across environments. Within our proposed setting we\nprovide sufficient conditions for identifiability of the causal parents and\nintroduce a practical method called L-ICP ($\\textbf{L}$ocalized\n$\\textbf{I}$nvariant $\\textbf{Ca}$usal $\\textbf{P}$rediction), which is based\non a hypothesis test for parent identification using a ratio of minimum and\nmaximum statistics. We then show in a simplified setting that the statistical\npower of L-ICP converges exponentially fast in the sample size, and finally we\nanalyze the behavior of L-ICP experimentally in more general settings.\n","authors":["Alexander Mey","Rui Manuel Castro"],"pdf_url":"https://arxiv.org/pdf/2401.05218v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17313v1","updated":"2024-08-30T14:18:34Z","published":"2024-08-30T14:18:34Z","title":"Fair Best Arm Identification with Fixed Confidence","summary":" In this work, we present a novel framework for Best Arm Identification (BAI)\nunder fairness constraints, a setting that we refer to as \\textit{F-BAI} (fair\nBAI). Unlike traditional BAI, which solely focuses on identifying the optimal\narm with minimal sample complexity, F-BAI also includes a set of fairness\nconstraints. These constraints impose a lower limit on the selection rate of\neach arm and can be either model-agnostic or model-dependent. For this setting,\nwe establish an instance-specific sample complexity lower bound and analyze the\n\\textit{price of fairness}, quantifying how fairness impacts sample complexity.\nBased on the sample complexity lower bound, we propose F-TaS, an algorithm\nprovably matching the sample complexity lower bound, while ensuring that the\nfairness constraints are satisfied. 
Numerical results, conducted using both a\nsynthetic model and a practical wireless scheduling application, show the\nefficiency of F-TaS in minimizing the sample complexity while achieving low\nfairness violations.\n","authors":["Alessio Russo","Filippo Vannella"],"pdf_url":"https://arxiv.org/pdf/2408.17313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17311v1","updated":"2024-08-30T14:15:48Z","published":"2024-08-30T14:15:48Z","title":"Structuring a Training Strategy to Robustify Perception Models with\n Realistic Image Augmentations","summary":" Advancing Machine Learning (ML)-based perception models for autonomous\nsystems necessitates addressing weak spots within the models, particularly in\nchallenging Operational Design Domains (ODDs). These are environmental\noperating conditions of an autonomous vehicle which can contain difficult\nconditions, e.g., lens flare at night or objects reflected in a wet street.\nThis report introduces a novel methodology for training with augmentations to\nenhance model robustness and performance in such conditions. The proposed\napproach leverages customized physics-based augmentation functions, to generate\nrealistic training data that simulates diverse ODD scenarios.\n We present a comprehensive framework that includes identifying weak spots in\nML models, selecting suitable augmentations, and devising effective training\nstrategies. The methodology integrates hyperparameter optimization and latent\nspace optimization to fine-tune augmentation parameters, ensuring they\nmaximally improve the ML models' performance. Experimental results demonstrate\nimprovements in model performance, as measured by commonly used metrics such as\nmean Average Precision (mAP) and mean Intersection over Union (mIoU) on\nopen-source object detection and semantic segmentation models and datasets.\n Our findings emphasize that optimal training strategies are model- and\ndata-specific and highlight the benefits of integrating augmentations into the\ntraining pipeline. By incorporating augmentations, we observe enhanced\nrobustness of ML-based perception models, making them more resilient to edge\ncases encountered in real-world ODDs. This work underlines the importance of\ncustomized augmentations and offers an effective solution for improving the\nsafety and reliability of autonomous driving functions.\n","authors":["Ahmed Hammam","Bharathwaj Krishnaswami Sreedhar","Nura Kawa","Tim Patzelt","Oliver De Candido"],"pdf_url":"https://arxiv.org/pdf/2408.17311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.07799v3","updated":"2024-08-30T14:12:30Z","published":"2020-09-16T16:48:28Z","title":"On the Curse of Memory in Recurrent Neural Networks: Approximation and\n Optimization Analysis","summary":" We study the approximation properties and optimization dynamics of recurrent\nneural networks (RNNs) when applied to learn input-output relationships in\ntemporal data. We consider the simple but representative setting of using\ncontinuous-time linear RNNs to learn from data generated by linear\nrelationships. Mathematically, the latter can be understood as a sequence of\nlinear functionals. We prove a universal approximation theorem of such linear\nfunctionals, and characterize the approximation rate and its relation with\nmemory. Moreover, we perform a fine-grained dynamical analysis of training\nlinear RNNs, which further reveal the intricate interactions between memory and\nlearning. 
A unifying theme uncovered is the non-trivial effect of memory, a\nnotion that can be made precise in our framework, on approximation and\noptimization: when there is long term memory in the target, it takes a large\nnumber of neurons to approximate it. Moreover, the training process will suffer\nfrom slow downs. In particular, both of these effects become exponentially more\npronounced with memory - a phenomenon we call the \"curse of memory\". These\nanalyses represent a basic step towards a concrete mathematical understanding\nof new phenomenon that may arise in learning temporal relationships using\nrecurrent architectures.\n","authors":["Zhong Li","Jiequn Han","Weinan E","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2009.07799v3.pdf","comment":"Updated to include the condition $\\sup_n \\| \\boldsymbol{x}(n)\n \\|_{\\mathcal{X}} \\leq 1$ in the definition of regularity, which excludes the\n trivial case where only the zero functional is regular. Fixed various typos\n and improved clarity"},{"id":"http://arxiv.org/abs/2408.17307v1","updated":"2024-08-30T14:11:12Z","published":"2024-08-30T14:11:12Z","title":"Hybridizing Base-Line 2D-CNN Model with Cat Swarm Optimization for\n Enhanced Advanced Persistent Threat Detection","summary":" In the realm of cyber-security, detecting Advanced Persistent Threats (APTs)\nremains a formidable challenge due to their stealthy and sophisticated nature.\nThis research paper presents an innovative approach that leverages\nConvolutional Neural Networks (CNNs) with a 2D baseline model, enhanced by the\ncutting-edge Cat Swarm Optimization (CSO) algorithm, to significantly improve\nAPT detection accuracy. By seamlessly integrating the 2D-CNN baseline model\nwith CSO, we unlock the potential for unprecedented accuracy and efficiency in\nAPT detection. The results unveil an impressive accuracy score of $98.4\\%$,\nmarking a significant enhancement in APT detection across various attack\nstages, illuminating a path forward in combating these relentless and\nsophisticated threats.\n","authors":["Ali M. Bakhiet","Salah A. Aly"],"pdf_url":"https://arxiv.org/pdf/2408.17307v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2207.05442v3","updated":"2024-08-30T14:11:08Z","published":"2022-07-12T10:18:36Z","title":"Wasserstein multivariate auto-regressive models for modeling\n distributional time series","summary":" This paper is focused on the statistical analysis of data consisting of a\ncollection of multiple series of probability measures that are indexed by\ndistinct time instants and supported over a bounded interval of the real line.\nBy modeling these time-dependent probability measures as random objects in the\nWasserstein space, we propose a new auto-regressive model for the statistical\nanalysis of multivariate distributional time series. Using the theory of\niterated random function systems, results on the existence, uniqueness and\nstationarity of the solution of such a model are provided. We also propose a\nconsistent estimator for the auto-regressive coefficients of this model. Due to\nthe simplex constraints that we impose on the model coefficients, the proposed\nestimator that is learned under these constraints, naturally has a sparse\nstructure. The sparsity allows the application of the proposed model in\nlearning a graph of temporal dependency from multivariate distributional time\nseries. We explore the numerical performances of our estimation procedure using\nsimulated data. 
To shed some light on the benefits of our approach for real\ndata analysis, we also apply this methodology to a data set made of\nobservations from age distribution in different countries.\n","authors":["Yiye Jiang","Jérémie Bigot"],"pdf_url":"https://arxiv.org/pdf/2207.05442v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.11498v3","updated":"2024-08-30T14:07:35Z","published":"2022-12-22T06:18:41Z","title":"Scalable Multi-Agent Reinforcement Learning for Warehouse Logistics with\n Robotic and Human Co-Workers","summary":" We consider a warehouse in which dozens of mobile robots and human pickers\nwork together to collect and deliver items within the warehouse. The\nfundamental problem we tackle, called the order-picking problem, is how these\nworker agents must coordinate their movement and actions in the warehouse to\nmaximise performance in this task. Established industry methods using heuristic\napproaches require large engineering efforts to optimise for innately variable\nwarehouse configurations. In contrast, multi-agent reinforcement learning\n(MARL) can be flexibly applied to diverse warehouse configurations (e.g. size,\nlayout, number/types of workers, item replenishment frequency), and different\ntypes of order-picking paradigms (e.g. Goods-to-Person and Person-to-Goods), as\nthe agents can learn how to cooperate optimally through experience. We develop\nhierarchical MARL algorithms in which a manager agent assigns goals to worker\nagents, and the policies of the manager and workers are co-trained toward\nmaximising a global objective (e.g. pick rate). Our hierarchical algorithms\nachieve significant gains in sample efficiency over baseline MARL algorithms\nand overall pick rates over multiple established industry heuristics in a\ndiverse set of warehouse configurations and different order-picking paradigms.\n","authors":["Aleksandar Krnjaic","Raul D. Steleac","Jonathan D. Thomas","Georgios Papoudakis","Lukas Schäfer","Andrew Wing Keung To","Kuan-Ho Lao","Murat Cubuktepe","Matthew Haley","Peter Börsting","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2212.11498v3.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2024"},{"id":"http://arxiv.org/abs/2402.19037v2","updated":"2024-08-30T13:58:55Z","published":"2024-02-29T11:02:47Z","title":"A Deep-Learning Technique to Locate Cryptographic Operations in\n Side-Channel Traces","summary":" Side-channel attacks allow extracting secret information from the execution\nof cryptographic primitives by correlating the partially known computed data\nand the measured side-channel signal. However, to set up a successful\nside-channel attack, the attacker has to perform i) the challenging task of\nlocating the time instant in which the target cryptographic primitive is\nexecuted inside a side-channel trace and then ii)the time-alignment of the\nmeasured data on that time instant. This paper presents a novel deep-learning\ntechnique to locate the time instant in which the target computed cryptographic\noperations are executed in the side-channel trace. In contrast to\nstate-of-the-art solutions, the proposed methodology works even in the presence\nof trace deformations obtained through random delay insertion techniques. 
We\nvalidated our proposal through a successful attack against a variety of\nunprotected and protected cryptographic primitives that have been executed on\nan FPGA-implemented system-on-chip featuring a RISC-V CPU.\n","authors":["Giuseppe Chiari","Davide Galli","Francesco Lattari","Matteo Matteucci","Davide Zoni"],"pdf_url":"https://arxiv.org/pdf/2402.19037v2.pdf","comment":"6 pages, 3 figures. Presented at DATE24"},{"id":"http://arxiv.org/abs/2408.17298v1","updated":"2024-08-30T13:55:19Z","published":"2024-08-30T13:55:19Z","title":"Accelerating the discovery of steady-states of planetary interior\n dynamics with machine learning","summary":" Simulating mantle convection often requires reaching a computationally\nexpensive steady-state, crucial for deriving scaling laws for thermal and\ndynamical flow properties and benchmarking numerical solutions. The strong\ntemperature dependence of the rheology of mantle rocks causes viscosity\nvariations of several orders of magnitude, leading to a slow-evolving stagnant\nlid where heat conduction dominates, overlying a rapidly-evolving and strongly\nconvecting region. Time-stepping methods, while effective for fluids with\nconstant viscosity, are hindered by the Courant criterion, which restricts the\ntime step based on the system's maximum velocity and grid size. Consequently,\nachieving steady-state requires a large number of time steps due to the\ndisparate time scales governing the stagnant and convecting regions.\n We present a concept for accelerating mantle convection simulations using\nmachine learning. We generate a dataset of 128 two-dimensional simulations with\nmixed basal and internal heating, and pressure- and temperature-dependent\nviscosity. We train a feedforward neural network on 97 simulations to predict\nsteady-state temperature profiles. These can then be used to initialize\nnumerical time stepping methods for different simulation parameters. Compared\nto typical initializations, the number of time steps required to reach\nsteady-state is reduced by a median factor of 3.75. The benefit of this method\nlies in requiring very few simulations to train on, providing a solution with\nno prediction error as we initialize a numerical method, and posing minimal\ncomputational overhead at inference time. We demonstrate the effectiveness of\nour approach and discuss the potential implications for accelerated simulations\nfor advancing mantle convection research.\n","authors":["Siddhant Agarwal","Nicola Tosi","Christian Hüttig","David S. Greenberg","Ali Can Bekar"],"pdf_url":"https://arxiv.org/pdf/2408.17298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17286v1","updated":"2024-08-30T13:33:18Z","published":"2024-08-30T13:33:18Z","title":"Stationary Policies are Optimal in Risk-averse Total-reward MDPs with\n EVaR","summary":" Optimizing risk-averse objectives in discounted MDPs is challenging because\nmost models do not admit direct dynamic programming equations and require\ncomplex history-dependent policies. In this paper, we show that the risk-averse\n{\\em total reward criterion}, under the Entropic Risk Measure (ERM) and\nEntropic Value at Risk (EVaR) risk measures, can be optimized by a stationary\npolicy, making it simple to analyze, interpret, and deploy. We propose\nexponential value iteration, policy iteration, and linear programming to\ncompute optimal policies. 
In comparison with prior work, our results only\nrequire the relatively mild condition of transient MDPs and allow for {\\em\nboth} positive and negative rewards. Our results indicate that the total reward\ncriterion may be preferable to the discounted criterion in a broad range of\nrisk-averse reinforcement learning domains.\n","authors":["Xihong Su","Marek Petrik","Julien Grand-Clément"],"pdf_url":"https://arxiv.org/pdf/2408.17286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17285v1","updated":"2024-08-30T13:33:07Z","published":"2024-08-30T13:33:07Z","title":"Image-Perfect Imperfections: Safety, Bias, and Authenticity in the\n Shadow of Text-To-Image Model Evolution","summary":" Text-to-image models, such as Stable Diffusion (SD), undergo iterative\nupdates to improve image quality and address concerns such as safety.\nImprovements in image quality are straightforward to assess. However, how model\nupdates resolve existing concerns and whether they raise new questions remain\nunexplored. This study takes an initial step in investigating the evolution of\ntext-to-image models from the perspectives of safety, bias, and authenticity.\nOur findings, centered on Stable Diffusion, indicate that model updates paint a\nmixed picture. While updates progressively reduce the generation of unsafe\nimages, the bias issue, particularly in gender, intensifies. We also find that\nnegative stereotypes either persist within the same Non-White race group or\nshift towards other Non-White race groups through SD updates, yet with minimal\nassociation of these traits with the White race group. Additionally, our\nevaluation reveals a new concern stemming from SD updates: State-of-the-art\nfake image detectors, initially trained for earlier SD versions, struggle to\nidentify fake images generated by updated versions. We show that fine-tuning\nthese detectors on fake images generated by updated versions achieves at least\n96.6\\% accuracy across various SD versions, addressing this issue. Our insights\nhighlight the importance of continued efforts to mitigate biases and\nvulnerabilities in evolving text-to-image models.\n","authors":["Yixin Wu","Yun Shen","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.17285v1.pdf","comment":"To Appear in the ACM Conference on Computer and Communications\n Security, October 14-18, 2024"},{"id":"http://arxiv.org/abs/2401.05735v3","updated":"2024-08-30T13:28:38Z","published":"2024-01-11T08:36:15Z","title":"Object-Centric Diffusion for Efficient Video Editing","summary":" Diffusion-based video editing have reached impressive quality and can\ntransform either the global style, local structure, and attributes of given\nvideo inputs, following textual edit prompts. However, such solutions typically\nincur heavy memory and computational costs to generate temporally-coherent\nframes, either in the form of diffusion inversion and/or cross-frame attention.\nIn this paper, we conduct an analysis of such inefficiencies, and suggest\nsimple yet effective modifications that allow significant speed-ups whilst\nmaintaining quality. Moreover, we introduce Object-Centric Diffusion, to fix\ngeneration artifacts and further reduce latency by allocating more computations\ntowards foreground edited regions, arguably more important for perceptual\nquality. 
We achieve this by two novel proposals: i) Object-Centric Sampling,\ndecoupling the diffusion steps spent on salient or background regions and\nspending most on the former, and ii) Object-Centric Token Merging, which\nreduces cost of cross-frame attention by fusing redundant tokens in unimportant\nbackground regions. Both techniques are readily applicable to a given video\nediting model without retraining, and can drastically reduce its memory and\ncomputational cost. We evaluate our proposals on inversion-based and\ncontrol-signal-based editing pipelines, and show a latency reduction up to 10x\nfor a comparable synthesis quality. Project page:\nqualcomm-ai-research.github.io/object-centric-diffusion.\n","authors":["Kumara Kahatapitiya","Adil Karjauv","Davide Abati","Fatih Porikli","Yuki M. Asano","Amirhossein Habibian"],"pdf_url":"https://arxiv.org/pdf/2401.05735v3.pdf","comment":"ECCV24"},{"id":"http://arxiv.org/abs/2408.17276v1","updated":"2024-08-30T13:22:08Z","published":"2024-08-30T13:22:08Z","title":"Minimax and Communication-Efficient Distributed Best Subset Selection\n with Oracle Property","summary":" The explosion of large-scale data in fields such as finance, e-commerce, and\nsocial media has outstripped the processing capabilities of single-machine\nsystems, driving the need for distributed statistical inference methods.\nTraditional approaches to distributed inference often struggle with achieving\ntrue sparsity in high-dimensional datasets and involve high computational\ncosts. We propose a novel, two-stage, distributed best subset selection\nalgorithm to address these issues. Our approach starts by efficiently\nestimating the active set while adhering to the $\\ell_0$ norm-constrained\nsurrogate likelihood function, effectively reducing dimensionality and\nisolating key variables. A refined estimation within the active set follows,\nensuring sparse estimates and matching the minimax $\\ell_2$ error bound. We\nintroduce a new splicing technique for adaptive parameter selection to tackle\nsubproblems under $\\ell_0$ constraints and a Generalized Information Criterion\n(GIC). Our theoretical and numerical studies show that the proposed algorithm\ncorrectly finds the true sparsity pattern, has the oracle property, and greatly\nlowers communication costs. This is a big step forward in distributed sparse\nestimation.\n","authors":["Jingguo Lan","Hongmei Lin","Xueqin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.17276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17274v1","updated":"2024-08-30T13:19:20Z","published":"2024-08-30T13:19:20Z","title":"The Transferability of Downsampling Sparse Graph Convolutional Networks","summary":" In this paper, we propose a large-scale sparse graph downsampling method\nbased on a sparse random graph model, which allows for the adjustment of\ndifferent sparsity levels. We combine sparsity and topological similarity: the\nsparse graph model reduces the node connection probability as the graph size\nincreases, while the downsampling method preserves a specific topological\nconnection pattern during this change. 
Based on the downsampling method, we\nderive a theoretical transferability bound about downsampling sparse graph\nconvolutional networks (GCNs), that higher sampling rates, greater average\ndegree expectations, and smaller initial graph sizes lead to better\ndownsampling transferability performance.\n","authors":["Qinji Shu","Hang Sheng","Hui Feng","Bo Hu"],"pdf_url":"https://arxiv.org/pdf/2408.17274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17271v1","updated":"2024-08-30T13:17:57Z","published":"2024-08-30T13:17:57Z","title":"Equation identification for fluid flows via physics-informed neural\n networks","summary":" Scientific machine learning (SciML) methods such as physics-informed neural\nnetworks (PINNs) are used to estimate parameters of interest from governing\nequations and small quantities of data. However, there has been little work in\nassessing how well PINNs perform for inverse problems across wide ranges of\ngoverning equations across the mathematical sciences. We present a new and\nchallenging benchmark problem for inverse PINNs based on a parametric sweep of\nthe 2D Burgers' equation with rotational flow. We show that a novel strategy\nthat alternates between first- and second-order optimization proves superior to\ntypical first-order strategies for estimating parameters. In addition, we\npropose a novel data-driven method to characterize PINN effectiveness in the\ninverse setting. PINNs' physics-informed regularization enables them to\nleverage small quantities of data more efficiently than the data-driven\nbaseline. However, both PINNs and the baseline can fail to recover parameters\nfor highly inviscid flows, motivating the need for further development of PINN\nmethods.\n","authors":["Alexander New","Marisel Villafañe-Delgado","Charles Shugert"],"pdf_url":"https://arxiv.org/pdf/2408.17271v1.pdf","comment":"Published at ICML 2024 AI4Science:\n https://openreview.net/forum?id=XsvCLEYH3O"},{"id":"http://arxiv.org/abs/2404.08981v2","updated":"2024-08-30T13:06:28Z","published":"2024-04-13T12:09:37Z","title":"Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active\n Image Classification","summary":" Deep active learning (AL) seeks to minimize the annotation costs for training\ndeep neural networks. BAIT, a recently proposed AL strategy based on the Fisher\nInformation, has demonstrated impressive performance across various datasets.\nHowever, BAIT's high computational and memory requirements hinder its\napplicability on large-scale classification tasks, resulting in current\nresearch neglecting BAIT in their evaluation. This paper introduces two methods\nto enhance BAIT's computational efficiency and scalability. Notably, we\nsignificantly reduce its time complexity by approximating the Fisher\nInformation. In particular, we adapt the original formulation by i) taking the\nexpectation over the most probable classes, and ii) constructing a binary\nclassification task, leading to an alternative likelihood for gradient\ncomputations. Consequently, this allows the efficient use of BAIT on\nlarge-scale datasets, including ImageNet. 
Our unified and comprehensive\nevaluation across a variety of datasets demonstrates that our approximations\nachieve strong performance with considerably reduced time complexity.\nFurthermore, we provide an extensive open-source toolbox that implements recent\nstate-of-the-art AL strategies, available at\nhttps://github.com/dhuseljic/dal-toolbox.\n","authors":["Denis Huseljic","Paul Hahn","Marek Herde","Lukas Rauch","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2404.08981v2.pdf","comment":"Accepted at ECML PKDD 2024"},{"id":"http://arxiv.org/abs/2408.17258v1","updated":"2024-08-30T12:56:17Z","published":"2024-08-30T12:56:17Z","title":"Joint Estimation and Prediction of City-wide Delivery Demand: A Large\n Language Model Empowered Graph-based Learning Approach","summary":" The proliferation of e-commerce and urbanization has significantly\nintensified delivery operations in urban areas, boosting the volume and\ncomplexity of delivery demand. Data-driven predictive methods, especially those\nutilizing machine learning techniques, have emerged to handle these\ncomplexities in urban delivery demand management problems. One particularly\npressing problem that has not yet been sufficiently studied is the joint\nestimation and prediction of city-wide delivery demand. To this end, we\nformulate this problem as a graph-based spatiotemporal learning task. First, a\nmessage-passing neural network model is formalized to capture the interaction\nbetween demand patterns of associated regions. Second, by exploiting recent\nadvances in large language models, we extract general geospatial knowledge\nencodings from the unstructured locational data and integrate them into the\ndemand predictor. Last, to encourage the cross-city transferability of the\nmodel, an inductive training scheme is developed in an end-to-end routine.\nExtensive empirical results on two real-world delivery datasets, including\neight cities in China and the US, demonstrate that our model significantly\noutperforms state-of-the-art baselines in these challenging tasks.\n","authors":["Tong Nie","Junlin He","Yuewen Mei","Guoyang Qin","Guilong Li","Jian Sun","Wei Ma"],"pdf_url":"https://arxiv.org/pdf/2408.17258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17255v1","updated":"2024-08-30T12:53:40Z","published":"2024-08-30T12:53:40Z","title":"Self-supervised learning for crystal property prediction via denoising","summary":" Accurate prediction of the properties of crystalline materials is crucial for\ntargeted discovery, and this prediction is increasingly done with data-driven\nmodels. However, for many properties of interest, the number of materials for\nwhich a specific property has been determined is much smaller than the number\nof known materials. To overcome this disparity, we propose a novel\nself-supervised learning (SSL) strategy for material property prediction. Our\napproach, crystal denoising self-supervised learning (CDSSL), pretrains\npredictive models (e.g., graph networks) with a pretext task based on\nrecovering valid material structures when given perturbed versions of these\nstructures. We demonstrate that CDSSL models out-perform models trained without\nSSL, across material types, properties, and dataset sizes.\n","authors":["Alexander New","Nam Q. Le","Michael J. Pekala","Christopher D. 
Stiles"],"pdf_url":"https://arxiv.org/pdf/2408.17255v1.pdf","comment":"Published at ICML 2024 AI4Science:\n https://openreview.net/forum?id=yML9ufAEoV"},{"id":"http://arxiv.org/abs/2304.01762v3","updated":"2024-08-30T12:51:53Z","published":"2023-04-04T12:51:35Z","title":"Incorporating Unlabelled Data into Bayesian Neural Networks","summary":" Conventional Bayesian Neural Networks (BNNs) are unable to leverage\nunlabelled data to improve their predictions. To overcome this limitation, we\nintroduce Self-Supervised Bayesian Neural Networks, which use unlabelled data\nto learn models with suitable prior predictive distributions. This is achieved\nby leveraging contrastive pretraining techniques and optimising a variational\nlower bound. We then show that the prior predictive distributions of\nself-supervised BNNs capture problem semantics better than conventional BNN\npriors. In turn, our approach offers improved predictive performance over\nconventional BNNs, especially in low-budget regimes.\n","authors":["Mrinank Sharma","Tom Rainforth","Yee Whye Teh","Vincent Fortuin"],"pdf_url":"https://arxiv.org/pdf/2304.01762v3.pdf","comment":"Published in the Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2408.17246v1","updated":"2024-08-30T12:40:12Z","published":"2024-08-30T12:40:12Z","title":"Learning and Verifying Maximal Taylor-Neural Lyapunov functions","summary":" We introduce a novel neural network architecture, termed Taylor-neural\nLyapunov functions, designed to approximate Lyapunov functions with formal\ncertification. This architecture innovatively encodes local approximations and\nextends them globally by leveraging neural networks to approximate the\nresiduals. Our method recasts the problem of estimating the largest region of\nattraction - specifically for maximal Lyapunov functions - into a learning\nproblem, ensuring convergence around the origin through robust control theory.\nPhysics-informed machine learning techniques further refine the estimation of\nthe largest region of attraction. Remarkably, this method is versatile,\noperating effectively even without simulated data points. We validate the\nefficacy of our approach by providing numerical certificates of convergence\nacross multiple examples. Our proposed methodology not only competes closely\nwith state-of-the-art approaches, such as sum-of-squares and LyZNet, but also\nachieves comparable results even in the absence of simulated data. This work\nrepresents a significant advancement in control theory, with broad potential\napplications in the design of stable control systems and beyond.\n","authors":["Matthieu Barreau","Nicola Bastianello"],"pdf_url":"https://arxiv.org/pdf/2408.17246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12614v4","updated":"2024-08-30T12:40:04Z","published":"2024-06-18T13:43:22Z","title":"EUvsDisinfo: A Dataset for Multilingual Detection of Pro-Kremlin\n Disinformation in News Articles","summary":" This work introduces EUvsDisinfo, a multilingual dataset of disinformation\narticles originating from pro-Kremlin outlets, along with trustworthy articles\nfrom credible / less biased sources. It is sourced directly from the debunk\narticles written by experts leading the EUvsDisinfo project. Our dataset is the\nlargest to-date resource in terms of the overall number of articles and\ndistinct languages. 
It also provides the largest topical and temporal coverage.\nUsing this dataset, we investigate the dissemination of pro-Kremlin\ndisinformation across different languages, uncovering language-specific\npatterns targeting certain disinformation topics. We further analyse the\nevolution of topic distribution over an eight-year period, noting a significant\nsurge in disinformation content before the full-scale invasion of Ukraine in\n2022. Lastly, we demonstrate the dataset's applicability in training models to\neffectively distinguish between disinformation and trustworthy content in\nmultilingual settings.\n","authors":["João A. Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2406.12614v4.pdf","comment":"Published at CIKM 2024"},{"id":"http://arxiv.org/abs/2408.17244v1","updated":"2024-08-30T12:36:00Z","published":"2024-08-30T12:36:00Z","title":"Categorical data clustering: 25 years beyond K-modes","summary":" The clustering of categorical data is a common and important task in computer\nscience, offering profound implications across a spectrum of applications.\nUnlike purely numerical datasets, categorical data often lack inherent ordering\nas in nominal data, or have varying levels of order as in ordinal data, thus\nrequiring specialized methodologies for efficient organization and analysis.\nThis review provides a comprehensive synthesis of categorical data clustering\nin the past twenty-five years, starting from the introduction of K-modes. It\nelucidates the pivotal role of categorical data clustering in diverse fields\nsuch as health sciences, natural sciences, social sciences, education,\nengineering and economics. Practical comparisons are conducted for algorithms\nhaving public implementations, highlighting distinguishing clustering\nmethodologies and revealing the performance of recent algorithms on several\nbenchmark categorical datasets. Finally, challenges and opportunities in the\nfield are discussed.\n","authors":["Tai Dinh","Wong Hauchi","Philippe Fournier-Viger","Daniil Lisik","Minh-Quyet Ha","Hieu-Chi Dam","Van-Nam Huynh"],"pdf_url":"https://arxiv.org/pdf/2408.17244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17240v1","updated":"2024-08-30T12:31:25Z","published":"2024-08-30T12:31:25Z","title":"Using Quantum Solved Deep Boltzmann Machines to Increase the Data\n Efficiency of RL Agents","summary":" Deep Learning algorithms, such as those used in Reinforcement Learning, often\nrequire large quantities of data to train effectively. In most cases, the\navailability of data is not a significant issue. However, for some contexts,\nsuch as in autonomous cyber defence, we require data efficient methods.\nRecently, Quantum Machine Learning and Boltzmann Machines have been proposed as\nsolutions to this challenge. In this work we build upon the pre-existing work\nto extend the use of Deep Boltzmann Machines to the cutting edge algorithm\nProximal Policy Optimisation in a Reinforcement Learning cyber defence\nenvironment. We show that this approach, when solved using a D-WAVE quantum\nannealer, can lead to a two-fold increase in data efficiency. 
We therefore\nexpect it to be used by the machine learning and quantum communities who are\nhoping to capitalise on data-efficient Reinforcement Learning methods.\n","authors":["Daniel Kent","Clement O'Rourke","Jake Southall","Kirsty Duncan","Adrian Bedford"],"pdf_url":"https://arxiv.org/pdf/2408.17240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17235v1","updated":"2024-08-30T12:26:23Z","published":"2024-08-30T12:26:23Z","title":"AI-Driven Intrusion Detection Systems (IDS) on the ROAD dataset: A\n Comparative Analysis for automotive Controller Area Network (CAN)","summary":" The integration of digital devices in modern vehicles has revolutionized\nautomotive technology, enhancing safety and the overall driving experience. The\nController Area Network (CAN) bus is a central system for managing in-vehicle\ncommunication between the electronic control units (ECUs). However, the CAN\nprotocol poses security challenges due to inherent vulnerabilities, lacking\nencryption and authentication, which, combined with an expanding attack\nsurface, necessitates robust security measures. In response to this challenge,\nnumerous Intrusion Detection Systems (IDS) have been developed and deployed.\nNonetheless, an open, comprehensive, and realistic dataset to test the\neffectiveness of such IDSs remains absent in the existing literature. This\npaper addresses this gap by considering the latest ROAD dataset, containing\nstealthy and sophisticated injections. The methodology involves dataset\nlabelling and the implementation of both state-of-the-art deep learning models\nand traditional machine learning models to show the discrepancy in performance\nbetween the datasets most commonly used in the literature and the ROAD dataset,\na more realistic alternative.\n","authors":["Lorenzo Guerra","Linhan Xu","Pavlo Mozharovskyi","Paolo Bellavista","Thomas Chapuis","Guillaume Duc","Van-Tam Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.17235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17221v1","updated":"2024-08-30T12:00:36Z","published":"2024-08-30T12:00:36Z","title":"Geometry of Lightning Self-Attention: Identifiability and Dimension","summary":" We consider function spaces defined by self-attention networks without\nnormalization, and theoretically analyze their geometry. Since these networks\nare polynomial, we rely on tools from algebraic geometry. In particular, we\nstudy the identifiability of deep attention by providing a description of the\ngeneric fibers of the parametrization for an arbitrary number of layers and, as\na consequence, compute the dimension of the function space. Additionally, for a\nsingle-layer model, we characterize the singular and boundary points. Finally,\nwe formulate a conjectural extension of our results to normalized\nself-attention networks, prove it for a single layer, and numerically verify it\nin the deep case.\n","authors":["Nathan W. Henry","Giovanni Luca Marchetti","Kathlén Kohn"],"pdf_url":"https://arxiv.org/pdf/2408.17221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04295v2","updated":"2024-08-30T11:57:47Z","published":"2024-07-05T06:57:30Z","title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey","summary":" Large Language Models (LLMs) have performed exceptionally in various\ntext-generative tasks, including question answering, translation, code\ncompletion, etc. 
However, the over-assistance of LLMs has raised the challenge\nof \"jailbreaking\", which induces the model to generate malicious responses\nagainst the usage policy and society by designing adversarial prompts. With the\nemergence of jailbreak attack methods exploiting different vulnerabilities in\nLLMs, the corresponding safety alignment measures are also evolving. In this\npaper, we propose a comprehensive and detailed taxonomy of jailbreak attack and\ndefense methods. For instance, the attack methods are divided into black-box\nand white-box attacks based on the transparency of the target model. Meanwhile,\nwe classify defense methods into prompt-level and model-level defenses.\nAdditionally, we further subdivide these attack and defense methods into\ndistinct sub-classes and present a coherent diagram illustrating their\nrelationships. We also conduct an investigation into the current evaluation\nmethods and compare them from different perspectives. Our findings aim to\ninspire future research and practical implementations in safeguarding LLMs\nagainst adversarial attacks. Above all, although jailbreak remains a\nsignificant concern within the community, we believe that our work enhances the\nunderstanding of this domain and provides a foundation for developing more\nsecure LLMs.\n","authors":["Sibo Yi","Yule Liu","Zhen Sun","Tianshuo Cong","Xinlei He","Jiaxing Song","Ke Xu","Qi Li"],"pdf_url":"https://arxiv.org/pdf/2407.04295v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17216v1","updated":"2024-08-30T11:46:39Z","published":"2024-08-30T11:46:39Z","title":"Democratizing AI in Africa: FL for Low-Resource Edge Devices","summary":" Africa faces significant challenges in healthcare delivery due to limited\ninfrastructure and access to advanced medical technologies. This study explores\nthe use of federated learning to overcome these barriers, focusing on perinatal\nhealth. We trained a fetal plane classifier using perinatal data from five\nAfrican countries: Algeria, Ghana, Egypt, Malawi, and Uganda, along with data\nfrom Spanish hospitals. To incorporate the lack of computational resources in\nthe analysis, we considered a heterogeneous set of devices, including a\nRaspberry Pi and several laptops, for model training. We demonstrate\ncomparative performance between a centralized and a federated model, despite\nthe compute limitations, and a significant improvement in model\ngeneralizability when compared to models trained only locally. These results\nshow the potential for a future implementation at a large scale of a federated\nlearning platform to bridge the accessibility gap and improve model\ngeneralizability with very little requirements.\n","authors":["Jorge Fabila","Víctor M. Campello","Carlos Martín-Isla","Johnes Obungoloch","Kinyera Leo","Amodoi Ronald","Karim Lekadir"],"pdf_url":"https://arxiv.org/pdf/2408.17216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15874v2","updated":"2024-08-30T11:18:08Z","published":"2024-08-28T15:44:34Z","title":"Robust Statistical Scaling of Outlier Scores: Improving the Quality of\n Outlier Probabilities for Outliers (Extended Version)","summary":" Outlier detection algorithms typically assign an outlier score to each\nobservation in a dataset, indicating the degree to which an observation is an\noutlier. However, these scores are often not comparable across algorithms and\ncan be difficult for humans to interpret. 
Statistical scaling addresses this\nproblem by transforming outlier scores into outlier probabilities without using\nground-truth labels, thereby improving interpretability and comparability\nacross algorithms. However, the quality of this transformation can be different\nfor outliers and inliers. Missing outliers in scenarios where they are of\nparticular interest - such as healthcare, finance, or engineering - can be\ncostly or dangerous. Thus, ensuring good probabilities for outliers is\nessential. This paper argues that statistical scaling, as commonly used in the\nliterature, does not produce equally good probabilities for outliers as for\ninliers. Therefore, we propose robust statistical scaling, which uses robust\nestimators to improve the probabilities for outliers. We evaluate several\nvariants of our method against other outlier score transformations for\nreal-world datasets and outlier detection algorithms, where it can improve the\nprobabilities for outliers.\n","authors":["Philipp Röchner","Henrique O. Marques","Ricardo J. G. B. Campello","Arthur Zimek","Franz Rothlauf"],"pdf_url":"https://arxiv.org/pdf/2408.15874v2.pdf","comment":"15 pages, 4 figures, extended version of an original article accepted\n for publication in SISAP 2024 by Springer Nature"},{"id":"http://arxiv.org/abs/2303.11428v3","updated":"2024-08-30T11:15:27Z","published":"2023-03-20T20:18:04Z","title":"Lamarr: LHCb ultra-fast simulation based on machine learning models\n deployed within Gauss","summary":" About 90% of the computing resources available to the LHCb experiment has\nbeen spent to produce simulated data samples for Run 2 of the Large Hadron\nCollider at CERN. The upgraded LHCb detector will be able to collect larger\ndata samples, requiring many more simulated events to analyze the data to be\ncollected in Run 3. Simulation is a key necessity of analysis to interpret\nsignal, reject background and measure efficiencies. The needed simulation will\nfar exceed the pledged resources, requiring an evolution in technologies and\ntechniques to produce these simulated data samples. In this contribution, we\ndiscuss Lamarr, a Gaudi-based framework to speed-up the simulation production\nparameterizing both the detector response and the reconstruction algorithms of\nthe LHCb experiment. Deep Generative Models powered by several algorithms and\nstrategies are employed to effectively parameterize the high-level response of\nthe single components of the LHCb detector, encoding within neural networks the\nexperimental errors and uncertainties introduced in the detection and\nreconstruction phases. Where possible, models are trained directly on real\ndata, statistically subtracting any background components by applying\nappropriate reweighing procedures. Embedding Lamarr in the general LHCb Gauss\nSimulation framework allows to combine its execution with any of the available\ngenerators in a seamless way. The resulting software package enables a\nsimulation process independent of the detailed simulation used to date.\n","authors":["Matteo Barbetti"],"pdf_url":"https://arxiv.org/pdf/2303.11428v3.pdf","comment":"To be published in Journal of Physics: Conference Series (ACAT 2022)"},{"id":"http://arxiv.org/abs/2408.12594v3","updated":"2024-08-30T10:55:58Z","published":"2024-08-22T17:57:31Z","title":"Non-Homophilic Graph Pre-Training and Prompt Learning","summary":" Graphs are ubiquitous for modeling complex relationships between objects\nacross various fields. 
Graph neural networks (GNNs) have become a mainstream\ntechnique for graph-based applications, but their performance heavily relies on\nabundant labeled data. To reduce labeling requirement, pre-training and prompt\nlearning has become a popular alternative. However, most existing prompt\nmethods do not differentiate homophilic and heterophilic characteristics of\nreal-world graphs. In particular, many real-world graphs are non-homophilic,\nnot strictly or uniformly homophilic with mixing homophilic and heterophilic\npatterns, exhibiting varying non-homophilic characteristics across graphs and\nnodes. In this paper, we propose ProNoG, a novel pre-training and prompt\nlearning framework for such non-homophilic graphs. First, we analyze existing\ngraph pre-training methods, providing theoretical insights into the choice of\npre-training tasks. Second, recognizing that each node exhibits unique\nnon-homophilic characteristics, we propose a conditional network to\ncharacterize the node-specific patterns in downstream tasks. Finally, we\nthoroughly evaluate and analyze ProNoG through extensive experiments on ten\npublic datasets.\n","authors":["Xingtong Yu","Jie Zhang","Yuan Fang","Renhe Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.12594v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.17198v1","updated":"2024-08-30T10:52:18Z","published":"2024-08-30T10:52:18Z","title":"Towards Symbolic XAI -- Explanation Through Human Understandable Logical\n Relationships Between Features","summary":" Explainable Artificial Intelligence (XAI) plays a crucial role in fostering\ntransparency and trust in AI systems, where traditional XAI approaches\ntypically offer one level of abstraction for explanations, often in the form of\nheatmaps highlighting single or multiple input features. However, we ask\nwhether abstract reasoning or problem-solving strategies of a model may also be\nrelevant, as these align more closely with how humans approach solutions to\nproblems. We propose a framework, called Symbolic XAI, that attributes\nrelevance to symbolic queries expressing logical relationships between input\nfeatures, thereby capturing the abstract reasoning behind a model's\npredictions. The methodology is built upon a simple yet general multi-order\ndecomposition of model predictions. This decomposition can be specified using\nhigher-order propagation-based relevance methods, such as GNN-LRP, or\nperturbation-based explanation methods commonly used in XAI. The effectiveness\nof our framework is demonstrated in the domains of natural language processing\n(NLP), vision, and quantum chemistry (QC), where abstract symbolic domain\nknowledge is abundant and of significant interest to users. The Symbolic XAI\nframework provides an understanding of the model's decision-making process that\nis both flexible for customization by the user and human-readable through\nlogical formulas.\n","authors":["Thomas Schnake","Farnoush Rezaei Jafaria","Jonas Lederer","Ping Xiong","Shinichi Nakajima","Stefan Gugler","Grégoire Montavon","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2408.17198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01001v4","updated":"2024-08-30T10:41:55Z","published":"2023-05-31T05:04:50Z","title":"DiffLoad: Uncertainty Quantification in Electrical Load Forecasting with\n Diffusion Model","summary":" Electrical load forecasting plays a crucial role in decision-making for power\nsystems, including unit commitment and economic dispatch. 
The integration of\nrenewable energy sources and the occurrence of external events, such as the\nCOVID-19 pandemic, have rapidly increased uncertainties in load forecasting.\nThe uncertainties in load forecasting can be divided into two types: epistemic\nuncertainty and aleatoric uncertainty. Separating these types of uncertainties\ncan help decision-makers better understand where and to what extent the\nuncertainty is, thereby enhancing their confidence in the following\ndecision-making. This paper proposes a diffusion-based Seq2Seq structure to\nestimate epistemic uncertainty and employs the robust additive Cauchy\ndistribution to estimate aleatoric uncertainty. Our method not only ensures the\naccuracy of load forecasting but also demonstrates the ability to separate the\ntwo types of uncertainties and be applicable to different levels of loads. The\nrelevant code can be found at\n\\url{https://anonymous.4open.science/r/DiffLoad-4714/}.\n","authors":["Zhixian Wang","Qingsong Wen","Chaoli Zhang","Liang Sun","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01001v4.pdf","comment":"Accepted by IEEE Transactions on Power Systems, 2024"},{"id":"http://arxiv.org/abs/2301.05522v3","updated":"2024-08-30T10:39:34Z","published":"2023-01-13T12:57:48Z","title":"Hyperparameter Optimization as a Service on INFN Cloud","summary":" The simplest and often most effective way of parallelizing the training of\ncomplex machine learning models is to execute several training instances on\nmultiple machines, scanning the hyperparameter space to optimize the underlying\nstatistical model and the learning procedure. Often, such a meta-learning\nprocedure is limited by the ability of accessing securely a common database\norganizing the knowledge of the previous and ongoing trials. Exploiting\nopportunistic GPUs provided in different environments represents a further\nchallenge when designing such optimization campaigns. In this contribution, we\ndiscuss how a set of REST APIs can be used to access a dedicated service based\non INFN Cloud to monitor and coordinate multiple training instances, with\ngradient-less optimization techniques, via simple HTTP requests. The service,\ncalled Hopaas (Hyperparameter OPtimization As A Service), is made of a web\ninterface and sets of APIs implemented with a FastAPI backend running through\nUvicorn and NGINX in a virtual instance of INFN Cloud. The optimization\nalgorithms are currently based on Bayesian techniques as provided by Optuna. A\nPython frontend is also made available for quick prototyping. We present\napplications to hyperparameter optimization campaigns performed by combining\nprivate, INFN Cloud, and CINECA resources. Such multi-node multi-site\noptimization studies have given a significant boost to the development of a set\nof parameterizations for the ultra-fast simulation of the LHCb experiment.\n","authors":["Matteo Barbetti","Lucio Anderlini"],"pdf_url":"https://arxiv.org/pdf/2301.05522v3.pdf","comment":"To be published in Journal of Physics: Conference Series (ACAT 2022)"},{"id":"http://arxiv.org/abs/2408.17185v1","updated":"2024-08-30T10:35:59Z","published":"2024-08-30T10:35:59Z","title":"Short-term Wind Speed Forecasting for Power Integration in Smart Grids\n based on Hybrid LSSVM-SVMD Method","summary":" Owing to its minimal pollution and efficient energy use, wind energy has\nbecome one of the most widely exploited renewable energy resources. 
The\nsuccessful integration of wind power into the grid system is contingent upon\naccurate wind speed forecasting models. However, the task of wind speed\nforecasting is challenging due to the inherent intermittent characteristics of\nwind speed. In this paper, a hybrid machine learning approach is developed for\npredicting short-term wind speed. First, the wind data was decomposed into\nmodal components using Successive Variational Mode Decomposition (SVMD). Then,\neach sub-signal was fitted into a Least Squares Support Vector Machines (LSSVM)\nmodel, with its hyperparameter optimized by a novel variant of Quantum-behaved\nParticle Swarm Optimization (QPSO), QPSO with elitist breeding (EBQPSO).\nSecond, the residuals making up for the differences between the original wind\nseries and the aggregate of the SVMD modes were modeled using a long short-term\nmemory (LSTM) model. Then, the overall predicted values were computed using the\naggregate of the LSSVM and the LSTM models. Finally, the performance of the\nproposed model was compared against state-of-the-art benchmark models for\nforecasting wind speed using two separate data sets collected from a local wind\nfarm. Empirical results show significant improvement in performance by the\nproposed method, achieving a 1.21% to 32.76% reduction in root mean square\nerror (RMSE) and a 2.05% to 40.75% reduction in mean average error (MAE)\ncompared to the benchmark methods. The entire code implementation of this work\nis freely available on GitHub.\n","authors":["Ephrem Admasu Yekun","Alem H. Fitwib","Selvi Karpaga Subramaniand","Anubhav Kumard","Teshome Goa Tella"],"pdf_url":"https://arxiv.org/pdf/2408.17185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14058v2","updated":"2024-08-30T10:30:55Z","published":"2024-07-19T06:35:49Z","title":"On the Causal Sufficiency and Necessity of Multi-Modal Representation\n Learning","summary":" An effective paradigm of multi-modal learning (MML) is to learn unified\nrepresentations among modalities. From a causal perspective, constraining the\nconsistency between different modalities can mine causal representations that\nconvey primary events. However, such simple consistency may face the risk of\nlearning insufficient or unnecessary information: a necessary but insufficient\ncause is invariant across modalities but may not have the required accuracy; a\nsufficient but unnecessary cause tends to adapt well to specific modalities but\nmay be hard to adapt to new data. To address this issue, in this paper, we aim\nto learn representations that are both causally sufficient and necessary, i.e.,\nCausal Complete Cause ($C^3$), for MML. Firstly, we define the concept of $C^3$\nfor MML, which reflects the probability of causal sufficiency and\nnecessity. We also propose the identifiability and measurement of $C^3$, i.e.,\n$C^3$ risk, to ensure calculating the learned representations' $C^3$ scores in\npractice. Then, we theoretically prove the effectiveness of $C^3$ risk by\nestablishing the performance guarantee of MML with a tight generalization\nbound. Based on these theoretical results, we propose a plug-and-play method,\nnamely Causal Complete Cause Regularization ($C^3$R), to learn causal complete\nrepresentations by constraining the $C^3$ risk bound. 
Extensive experiments\nconducted on various benchmark datasets empirically demonstrate the\neffectiveness of $C^3$R.\n","authors":["Jingyao Wang","Wenwen Qiang","Jiangmeng Li","Lingyu Si","Changwen Zheng","Bing Su"],"pdf_url":"https://arxiv.org/pdf/2407.14058v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17180v1","updated":"2024-08-30T10:28:36Z","published":"2024-08-30T10:28:36Z","title":"Identifying and Clustering Counter Relationships of Team Compositions in\n PvP Games for Efficient Balance Analysis","summary":" How can balance be quantified in game settings? This question is crucial for\ngame designers, especially in player-versus-player (PvP) games, where analyzing\nthe strength relations among predefined team compositions-such as hero\ncombinations in multiplayer online battle arena (MOBA) games or decks in card\ngames-is essential for enhancing gameplay and achieving balance. We have\ndeveloped two advanced measures that extend beyond the simplistic win rate to\nquantify balance in zero-sum competitive scenarios. These measures are derived\nfrom win value estimations, which employ strength rating approximations via the\nBradley-Terry model and counter relationship approximations via vector\nquantization, significantly reducing the computational complexity associated\nwith traditional win value estimations. Throughout the learning process of\nthese models, we identify useful categories of compositions and pinpoint their\ncounter relationships, aligning with the experiences of human players without\nrequiring specific game knowledge. Our methodology hinges on a simple technique\nto enhance codebook utilization in discrete representation with a deterministic\nvector quantization process for an extremely small state space. Our framework\nhas been validated in popular online games, including Age of Empires II,\nHearthstone, Brawl Stars, and League of Legends. The accuracy of the observed\nstrength relations in these games is comparable to traditional pairwise win\nvalue predictions, while also offering a more manageable complexity for\nanalysis. Ultimately, our findings contribute to a deeper understanding of PvP\ngame dynamics and present a methodology that significantly improves game\nbalance evaluation and design.\n","authors":["Chiu-Chou Lin","Yu-Wei Shih","Kuei-Ting Kuo","Yu-Cheng Chen","Chien-Hua Chen","Wei-Chen Chiu","I-Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.17180v1.pdf","comment":"TMLR 09/2024 https://openreview.net/forum?id=2D36otXvBE"},{"id":"http://arxiv.org/abs/2408.17171v1","updated":"2024-08-30T10:17:37Z","published":"2024-08-30T10:17:37Z","title":"SafeTail: Efficient Tail Latency Optimization in Edge Service Scheduling\n via Computational Redundancy Management","summary":" Optimizing tail latency while efficiently managing computational resources is\ncrucial for delivering high-performance, latency-sensitive services in edge\ncomputing. Emerging applications, such as augmented reality, require\nlow-latency computing services with high reliability on user devices, which\noften have limited computational capabilities. Consequently, these devices\ndepend on nearby edge servers for processing. However, inherent uncertainties\nin network and computation latencies stemming from variability in wireless\nnetworks and fluctuating server loads make service delivery on time\nchallenging. 
Existing approaches often focus on optimizing median latency but\nfall short of addressing the specific challenges of tail latency in edge\nenvironments, particularly under uncertain network and computational\nconditions. Although some methods do address tail latency, they typically rely\non fixed or excessive redundancy and lack adaptability to dynamic network\nconditions, often being designed for cloud environments rather than the unique\ndemands of edge computing. In this paper, we introduce SafeTail, a framework\nthat meets both median and tail response time targets, with tail latency\ndefined as latency beyond the 90^th percentile threshold. SafeTail addresses\nthis challenge by selectively replicating services across multiple edge servers\nto meet target latencies. SafeTail employs a reward-based deep learning\nframework to learn optimal placement strategies, balancing the need to achieve\ntarget latencies with minimizing additional resource usage. Through\ntrace-driven simulations, SafeTail demonstrated near-optimal performance and\noutperformed most baseline strategies across three diverse services.\n","authors":["Jyoti Shokhanda","Utkarsh Pal","Aman Kumar","Soumi Chattopadhyay","Arani Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2408.17171v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2408.17166v1","updated":"2024-08-30T10:09:12Z","published":"2024-08-30T10:09:12Z","title":"Learning Multi-Target TDOA Features for Sound Event Localization and\n Detection","summary":" Sound event localization and detection (SELD) systems using audio recordings\nfrom a microphone array rely on spatial cues for determining the location of\nsound events. As a consequence, the localization performance of such systems is\nto a large extent determined by the quality of the audio features that are used\nas inputs to the system. We propose a new feature, based on neural generalized\ncross-correlations with phase-transform (NGCC-PHAT), that learns audio\nrepresentations suitable for localization. Using permutation invariant training\nfor the time-difference of arrival (TDOA) estimation problem enables NGCC-PHAT\nto learn TDOA features for multiple overlapping sound events. These features\ncan be used as a drop-in replacement for GCC-PHAT inputs to a SELD-network. We\ntest our method on the STARSS23 dataset and demonstrate improved localization\nperformance compared to using standard GCC-PHAT or SALSA-Lite input features.\n","authors":["Axel Berg","Johanna Engman","Jens Gulin","Karl Åström","Magnus Oskarsson"],"pdf_url":"https://arxiv.org/pdf/2408.17166v1.pdf","comment":"DCASE 2024"},{"id":"http://arxiv.org/abs/2408.17165v1","updated":"2024-08-30T10:08:21Z","published":"2024-08-30T10:08:21Z","title":"Efficient Testable Learning of General Halfspaces with Adversarial Label\n Noise","summary":" We study the task of testable learning of general -- not necessarily\nhomogeneous -- halfspaces with adversarial label noise with respect to the\nGaussian distribution. In the testable learning framework, the goal is to\ndevelop a tester-learner such that if the data passes the tester, then one can\ntrust the output of the robust learner on the data.Our main result is the first\npolynomial time tester-learner for general halfspaces that achieves\ndimension-independent misclassification error. 
At the heart of our approach is\na new methodology to reduce testable learning of general halfspaces to testable\nlearning of nearly homogeneous halfspaces that may be of broader interest.\n","authors":["Ilias Diakonikolas","Daniel M. Kane","Sihan Liu","Nikos Zarifis"],"pdf_url":"https://arxiv.org/pdf/2408.17165v1.pdf","comment":"Presented to COLT'24"},{"id":"http://arxiv.org/abs/2408.17163v1","updated":"2024-08-30T10:06:26Z","published":"2024-08-30T10:06:26Z","title":"The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by\n Leveraging Second-Order Information","summary":" The rising footprint of machine learning has led to a focus on imposing\n\\emph{model sparsity} as a means of reducing computational and memory costs.\nFor deep neural networks (DNNs), the state-of-the-art accuracy-vs-sparsity is\nachieved by heuristics inspired by the classical Optimal Brain Surgeon (OBS)\nframework~\\citep{lecun90brain, hassibi1992second, hassibi1993optimal}, which\nleverages loss curvature information to make better pruning decisions. Yet,\nthese results still lack a solid theoretical understanding, and it is unclear\nwhether they can be improved by leveraging connections to the wealth of work on\nsparse recovery algorithms. In this paper, we draw new connections between\nthese two areas and present new sparse recovery algorithms inspired by the OBS\nframework that come with theoretical guarantees under reasonable assumptions\nand have strong practical performance. Specifically, our work starts from the\nobservation that we can leverage curvature information in an OBS-like fashion in\nthe projection step of classic iterative sparse recovery algorithms such as\nIHT. We show for the first time that this leads to improved convergence\nbounds under standard assumptions. Furthermore, we present extensions of this\napproach to the practical task of obtaining accurate sparse DNNs, and validate\nit experimentally at scale for Transformer-based models on vision and language\ntasks.\n","authors":["Diyuan Wu","Ionut-Vlad Modoranu","Mher Safaryan","Denis Kuznedelev","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2408.17163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17162v1","updated":"2024-08-30T10:05:24Z","published":"2024-08-30T10:05:24Z","title":"Deep Feature Embedding for Tabular Data","summary":" Tabular data learning has extensive applications in deep learning, but its\nexisting embedding techniques for numerical and categorical features are\nlimited, such as in the inability to capture complex relationships and in\nfeature engineering. This\npaper proposes a novel deep embedding framework that leverages lightweight deep\nneural networks to generate effective feature embeddings for tabular data in\nmachine learning research. For numerical features, a two-step feature expansion\nand deep transformation technique is used to capture copious semantic\ninformation. For categorical features, a unique identification vector for each\nentity is retrieved from a compact lookup table, with a parameterized deep\nembedding function to unify the embedding dimensions, and transformed\ninto an embedding vector using a deep neural network. Experiments are conducted on\nreal-world datasets for performance evaluation.\n","authors":["Yuqian Wu","Hengyi Luo","Raymond S. T. 
Lee"],"pdf_url":"https://arxiv.org/pdf/2408.17162v1.pdf","comment":"15 pages, 2figures, accepted to ICONIP 2024, Paper ID: 1399"},{"id":"http://arxiv.org/abs/2401.10726v3","updated":"2024-08-30T10:04:21Z","published":"2024-01-19T14:43:04Z","title":"Empowering Aggregators with Practical Data-Driven Tools: Harnessing\n Aggregated and Disaggregated Flexibility for Demand Response","summary":" This study explores the interaction between aggregators and building\noccupants in activating flexibility through Demand Response (DR) programs, with\na focus on reinforcing the resilience of the energy system considering the\nuncertainties presented by Renewable Energy Sources (RES). Firstly, it\nintroduces a methodology of optimizing aggregated flexibility provision\nstrategies in environments with limited data, utilizing Discrete Fourier\nTransformation (DFT) and clustering techniques to identify building occupants'\nactivity patterns. Secondly, the study assesses the disaggregated flexibility\nprovision of Heating Ventilation and Air Conditioning (HVAC) systems during DR\nevents, employing machine learning and optimization techniques for precise,\ndevice-level analysis. The first approach offers a non-intrusive pathway for\naggregators to provide flexibility services in environments of a single smart\nmeter for the whole building's consumption, while the second approach maximizes\nthe amount of flexibility in the case of dedicated metering devices to the HVAC\nsystems by carefully considering building occupants' thermal comfort profiles.\nThrough the application of data-driven techniques and encompassing case studies\nfrom both industrial and residential buildings, this paper not only unveils\npivotal opportunities for aggregators in the balancing and emerging flexibility\nmarkets but also successfully develops and demonstrates end-to-end practical\ntools for aggregators.\n","authors":["Costas Mylonas","Donata Boric","Leila Luttenberger Maric","Alexandros Tsitsanis","Eleftheria Petrianou","Magda Foti"],"pdf_url":"https://arxiv.org/pdf/2401.10726v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00877v3","updated":"2024-08-30T10:02:22Z","published":"2024-05-01T21:42:38Z","title":"Markov flow policy -- deep MC","summary":" Discounted algorithms often encounter evaluation errors due to their reliance\non short-term estimations, which can impede their efficacy in addressing\nsimple, short-term tasks and impose undesired temporal discounts (\\(\\gamma\\)).\nInterestingly, these algorithms are often tested without applying a discount, a\nphenomenon we refer as the \\textit{train-test bias}. In response to these\nchallenges, we propose the Markov Flow Policy, which utilizes a non-negative\nneural network flow to enable comprehensive forward-view predictions. Through\nintegration into the TD7 codebase and evaluation using the MuJoCo benchmark, we\nobserve significant performance improvements, positioning MFP as a\nstraightforward, practical, and easily implementable solution within the domain\nof average rewards algorithms.\n","authors":["Nitsan Soffair","Gilad Katz"],"pdf_url":"https://arxiv.org/pdf/2405.00877v3.pdf","comment":"Paper has been not finished"},{"id":"http://arxiv.org/abs/2211.15411v6","updated":"2024-08-30T10:01:25Z","published":"2022-11-09T08:11:16Z","title":"Solving Collaborative Dec-POMDPs with Deep Reinforcement Learning\n Heuristics","summary":" WQMIX, QMIX, QTRAN, and VDN are SOTA algorithms for Dec-POMDP. All of them\ncannot solve complex agents' cooperation domains. 
We give an algorithm to solve\nsuch problems. In the first stage, we solve a single-agent problem and get a\npolicy. In the second stage, we solve the multi-agent problem with the\nsingle-agent policy. SA2MA has a clear advantage over all competitors in\ncomplex agents' cooperative domains.\n","authors":["Nitsan Soffair"],"pdf_url":"https://arxiv.org/pdf/2211.15411v6.pdf","comment":"Paper has been not finished"},{"id":"http://arxiv.org/abs/2408.17151v1","updated":"2024-08-30T09:40:52Z","published":"2024-08-30T09:40:52Z","title":"Investigating Privacy Leakage in Dimensionality Reduction Methods via\n Reconstruction Attack","summary":" This study investigates privacy leakage in dimensionality reduction methods\nthrough a novel machine learning-based reconstruction attack. Employing an\n\\emph{informed adversary} threat model, we develop a neural network capable of\nreconstructing high-dimensional data from low-dimensional embeddings.\n We evaluate six popular dimensionality reduction techniques: PCA, sparse\nrandom projection (SRP), multidimensional scaling (MDS), Isomap, $t$-SNE, and\nUMAP. Using both MNIST and NIH Chest X-ray datasets, we perform a qualitative\nanalysis to identify key factors affecting reconstruction quality. Furthermore,\nwe assess the effectiveness of an additive noise mechanism in mitigating these\nreconstruction attacks.\n","authors":["Chayadon Lumbut","Donlapark Ponnoprat"],"pdf_url":"https://arxiv.org/pdf/2408.17151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17148v1","updated":"2024-08-30T09:38:51Z","published":"2024-08-30T09:38:51Z","title":"The Many Faces of Optimal Weak-to-Strong Learning","summary":" Boosting is an extremely successful idea, allowing one to combine multiple\nlow accuracy classifiers into a much more accurate voting classifier. In this\nwork, we present a new and surprisingly simple Boosting algorithm that obtains\na provably optimal sample complexity. Sample optimal Boosting algorithms have\nonly recently been developed, and our new algorithm has the fastest runtime\namong all such algorithms and is the simplest to describe: Partition your\ntraining data into 5 disjoint pieces of equal size, run AdaBoost on each, and\ncombine the resulting classifiers via a majority vote. In addition to this\ntheoretical contribution, we also perform the first empirical comparison of the\nproposed sample optimal Boosting algorithms. Our pilot empirical study suggests\nthat our new algorithm might outperform previous algorithms on large data sets.\n","authors":["Mikael Møller Høgsgaard","Kasper Green Larsen","Markus Engelund Mathiasen"],"pdf_url":"https://arxiv.org/pdf/2408.17148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17145v1","updated":"2024-08-30T09:35:36Z","published":"2024-08-30T09:35:36Z","title":"Towards Hyper-parameter-free Federated Learning","summary":" The adaptive synchronization techniques in federated learning (FL) for scaled\nglobal model updates show superior performance over the vanilla federated\naveraging (FedAvg) scheme. However, existing methods employ additional tunable\nhyperparameters on the server to determine the scaling factor. A contrasting\napproach is automated scaling analogous to tuning-free step-size schemes in\nstochastic gradient descent (SGD) methods, which offer competitive convergence\nrates and exhibit good empirical performance. In this work, we introduce two\nalgorithms for automated scaling of global model updates. 
In our first\nalgorithm, we establish that a descent-ensuring step-size regime at the clients\nensures descent for the server objective. We show that such a scheme enables\nlinear convergence for strongly convex federated objectives. Our second\nalgorithm shows that the average of objective values of sampled clients is a\npractical and effective substitute for the objective function value at the\nserver required for computing the scaling factor, whose computation is\notherwise not permitted. Our extensive empirical results show that the proposed\nmethods perform at par or better than the popular federated learning algorithms\nfor both convex and non-convex problems. Our work takes a step towards\ndesigning hyper-parameter-free federated learning.\n","authors":[" Geetika","Drishya Uniyal","Bapi Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2408.17145v1.pdf","comment":"28 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.15799v5","updated":"2024-08-30T09:33:53Z","published":"2023-03-28T08:07:28Z","title":"FedAgg: Adaptive Federated Learning with Aggregated Gradients","summary":" Federated Learning (FL) has emerged as a crucial distributed training\nparadigm, enabling discrete devices to collaboratively train a shared model\nunder the coordination of a central server, while leveraging their locally\nstored private data. Nonetheless, the\nnon-independent-and-identically-distributed (Non-IID) data generated on\nheterogeneous clients and the incessant information exchange among participants\nmay significantly impede training efficacy, retard the model convergence rate\nand increase the risk of privacy leakage. To alleviate the divergence between\nthe local and average model parameters and obtain a fast model convergence\nrate, we propose an adaptive FEDerated learning algorithm called FedAgg by\nrefining the conventional stochastic gradient descent (SGD) methodology with an\nAGgregated Gradient term at each local training epoch and adaptively adjusting\nthe learning rate based on a penalty term that quantifies the local model\ndeviation. To tackle the challenge of information exchange among clients during\nlocal training and design a decentralized adaptive learning rate for each\nclient, we introduce two mean-field terms to approximate the average local\nparameters and gradients over time. Through rigorous theoretical analysis, we\ndemonstrate the existence and convergence of the mean-field terms and provide a\nrobust upper bound on the convergence of our proposed algorithm. The extensive\nexperimental results on real-world datasets substantiate the superiority of our\nframework in comparison with existing state-of-the-art FL strategies for\nenhancing model performance and accelerating convergence rate under IID and\nNon-IID datasets.\n","authors":["Wenhao Yuan","Xuehe Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15799v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17139v1","updated":"2024-08-30T09:28:32Z","published":"2024-08-30T09:28:32Z","title":"Flow Matching for Optimal Reaction Coordinates of Biomolecular System","summary":" We present Flow Matching for Reaction Coordinates (FMRC), a novel deep\nlearning algorithm designed to identify optimal reaction coordinates (RC) in\nbiomolecular reversible dynamics. FMRC is based on the mathematical principles\nof lumpability and decomposability, which we reformulate into a conditional\nprobability framework for efficient data-driven optimization using deep\ngenerative models. 
While FMRC does not explicitly learn the well-established\ntransfer operator or its eigenfunctions, it can effectively encode the dynamics\nof leading eigenfunctions of the system transfer operator into its\nlow-dimensional RC space. We further quantitatively compare its performance\nwith several state-of-the-art algorithms by evaluating the quality of Markov\nState Models (MSM) constructed in their respective RC spaces, demonstrating the\nsuperiority of FMRC in three increasingly complex biomolecular systems.\nFinally, we discuss its potential applications in downstream applications such\nas enhanced sampling methods and MSM construction.\n","authors":["Mingyuan Zhang","Zhicheng Zhang","Yong Wang","Hao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.17139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17129v1","updated":"2024-08-30T09:14:38Z","published":"2024-08-30T09:14:38Z","title":"Controllable Edge-Type-Specific Interpretation in Multi-Relational Graph\n Neural Networks for Drug Response Prediction","summary":" Graph Neural Networks have been widely applied in critical decision-making\nareas that demand interpretable predictions, leading to the flourishing\ndevelopment of interpretability algorithms. However, current graph\ninterpretability algorithms tend to emphasize generality and often overlook\nbiological significance, thereby limiting their applicability in predicting\ncancer drug responses. In this paper, we propose a novel post-hoc\ninterpretability algorithm for cancer drug response prediction, CETExplainer,\nwhich incorporates a controllable edge-type-specific weighting mechanism. It\nconsiders the mutual information between subgraphs and predictions, proposing a\nstructural scoring approach to provide fine-grained, biologically meaningful\nexplanations for predictive models. We also introduce a method for constructing\nground truth based on real-world datasets to quantitatively evaluate the\nproposed interpretability algorithm. Empirical analysis on the real-world\ndataset demonstrates that CETExplainer achieves superior stability and improves\nexplanation quality compared to leading algorithms, thereby offering a robust\nand insightful tool for cancer drug prediction.\n","authors":["Xiaodi Li","Jianfeng Gui","Qian Gao","Haoyuan Shi","Zhenyu Yue"],"pdf_url":"https://arxiv.org/pdf/2408.17129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02895v2","updated":"2024-08-30T09:14:18Z","published":"2022-12-06T11:38:22Z","title":"Training Neural Networks on Data Sources with Unknown Reliability","summary":" When data is generated by multiple sources, conventional training methods\nupdate models assuming equal reliability for each source and do not consider\ntheir individual data quality during training. However, in many applications,\nsources have varied levels of reliability that can have negative effects on the\nperformance of a neural network. A key issue is that often the quality of data\nfor individual sources is not known during training. Focusing on supervised\nlearning, this work presents a solution that aims to train neural networks on\neach data source for a number of steps proportional to the source's estimated\nrelative reliability. This way, we allow training on all sources during the\nwarm-up, and reduce learning on less reliable sources during the final training\nstages, when it has been shown models overfit to noise. 
We show through diverse\nexperiments, this can significantly improve model performance when trained on\nmixtures of reliable and unreliable data sources, and maintain performance when\nmodels are trained on reliable sources only.\n","authors":["Alexander Capstick","Francesca Palermo","Tianyu Cui","Payam Barnaghi"],"pdf_url":"https://arxiv.org/pdf/2212.02895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17118v1","updated":"2024-08-30T09:01:04Z","published":"2024-08-30T09:01:04Z","title":"Efficient Estimation of Unique Components in Independent Component\n Analysis by Matrix Representation","summary":" Independent component analysis (ICA) is a widely used method in various\napplications of signal processing and feature extraction. It extends principal\ncomponent analysis (PCA) and can extract important and complicated components\nwith small variances. One of the major problems of ICA is that the uniqueness\nof the solution is not guaranteed, unlike PCA. That is because there are many\nlocal optima in optimizing the objective function of ICA. It has been shown\npreviously that the unique global optimum of ICA can be estimated from many\nrandom initializations by handcrafted thread computation. In this paper, the\nunique estimation of ICA is highly accelerated by reformulating the algorithm\nin matrix representation and reducing redundant calculations. Experimental\nresults on artificial datasets and EEG data verified the efficiency of the\nproposed method.\n","authors":["Yoshitatsu Matsuda","Kazunori Yamaguch"],"pdf_url":"https://arxiv.org/pdf/2408.17118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17108v1","updated":"2024-08-30T08:49:27Z","published":"2024-08-30T08:49:27Z","title":"Sparse Uncertainty-Informed Sampling from Federated Streaming Data","summary":" We present a numerically robust, computationally efficient approach for\nnon-I.I.D. data stream sampling in federated client systems, where resources\nare limited and labeled data for local model adaptation is sparse and\nexpensive. The proposed method identifies relevant stream observations to\noptimize the underlying client model, given a local labeling budget, and\nperforms instantaneous labeling decisions without relying on any memory\nbuffering strategies. Our experiments show enhanced training batch diversity\nand an improved numerical robustness of the proposal compared to existing\nstrategies over large-scale data streams, making our approach an effective and\nconvenient solution in FL environments.\n","authors":["Manuel Röder","Frank-Michael Schleif"],"pdf_url":"https://arxiv.org/pdf/2408.17108v1.pdf","comment":"Preprint, 6 pages, 3 figures, Accepted for ESANN 2024"},{"id":"http://arxiv.org/abs/2408.17095v1","updated":"2024-08-30T08:26:55Z","published":"2024-08-30T08:26:55Z","title":"RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation\n and Retrieval-Guidance","summary":" Diffusion-based models demonstrate impressive generation capabilities.\nHowever, they also have a massive number of parameters, resulting in enormous\nmodel sizes, thus making them unsuitable for deployment on resource-constraint\ndevices. Block-wise generation can be a promising alternative for designing\ncompact-sized (parameter-efficient) deep generative models since the model can\ngenerate one block at a time instead of generating the whole image at once.\nHowever, block-wise generation is also considerably challenging because\nensuring coherence across generated blocks can be non-trivial. 
To this end, we\ndesign a retrieval-augmented generation (RAG) approach and leverage the\ncorresponding blocks of the images retrieved by the RAG module to condition the\ntraining and generation stages of a block-wise denoising diffusion model. Our\nconditioning schemes ensure coherence across the different blocks during\ntraining and, consequently, during generation. While we showcase our approach\nusing the latent diffusion model (LDM) as the base model, it can be used with\nother variants of denoising diffusion models. We validate the solution of the\ncoherence problem through the proposed approach by reporting substantive\nexperiments to demonstrate our approach's effectiveness in compact model size\nand excellent generation quality.\n","authors":["Avideep Mukherjee","Soumya Banerjee","Vinay P. Namboodiri","Piyush Rai"],"pdf_url":"https://arxiv.org/pdf/2408.17095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17090v1","updated":"2024-08-30T08:22:30Z","published":"2024-08-30T08:22:30Z","title":"FissionVAE: Federated Non-IID Image Generation with Latent Space and\n Decoder Decomposition","summary":" Federated learning is a machine learning paradigm that enables decentralized\nclients to collaboratively learn a shared model while keeping all the training\ndata local. While considerable research has focused on federated image\ngeneration, particularly Generative Adversarial Networks, Variational\nAutoencoders have received less attention. In this paper, we address the\nchallenges of non-IID (independently and identically distributed) data\nenvironments featuring multiple groups of images of different types.\nSpecifically, heterogeneous data distributions can lead to difficulties in\nmaintaining a consistent latent space and can also result in local generators\nwith disparate texture features being blended during aggregation. We introduce\na novel approach, FissionVAE, which decomposes the latent space and constructs\ndecoder branches tailored to individual client groups. This method allows for\ncustomized learning that aligns with the unique data distributions of each\ngroup. Additionally, we investigate the incorporation of hierarchical VAE\narchitectures and demonstrate the use of heterogeneous decoder architectures\nwithin our model. We also explore strategies for setting the latent prior\ndistributions to enhance the decomposition process. To evaluate our approach,\nwe assemble two composite datasets: the first combines MNIST and FashionMNIST;\nthe second comprises RGB datasets of cartoon and human faces, wild animals,\nmarine vessels, and remote sensing images of Earth. Our experiments demonstrate\nthat FissionVAE greatly improves generation quality on these datasets compared\nto baseline federated VAE models.\n","authors":["Chen Hu","Jingjing Deng","Xianghua Xie","Xiaoke Ma"],"pdf_url":"https://arxiv.org/pdf/2408.17090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02606v2","updated":"2024-08-30T08:12:23Z","published":"2023-10-04T06:42:33Z","title":"Mending of Spatio-Temporal Dependencies in Block Adjacency Matrix","summary":" In the realm of applications where data dynamically evolves across spatial\nand temporal dimensions, Graph Neural Networks (GNNs) are often complemented by\nsequence modeling architectures, such as RNNs and transformers, to effectively\nmodel temporal changes. These hybrid models typically arrange the spatial and\ntemporal learning components in series. 
A pioneering effort to jointly model\nthe spatio-temporal dependencies using only GNNs was the introduction of the\nBlock Adjacency Matrix \\(\\mathbf{A_B}\\) \\cite{1}, which was constructed by\ndiagonally concatenating adjacency matrices from graphs at different time\nsteps. This approach resulted in a single graph encompassing complete\nspatio-temporal data; however, the graphs from different time steps remained\ndisconnected, limiting GNN message-passing to spatially connected nodes only.\nAddressing this critical challenge, we propose a novel end-to-end learning\narchitecture specifically designed to mend the temporal dependencies, resulting\nin a well-connected graph. Thus, we provide a framework for the learnable\nrepresentation of spatio-temporal data as graphs. Our methodology demonstrates\nsuperior performance on benchmark datasets, such as SurgVisDom and C2D2,\nsurpassing existing state-of-the-art graph models in terms of accuracy. Our\nmodel also achieves significantly lower computational complexity, having far\nfewer parameters than methods reliant on CLIP and 3D CNN architectures.\n","authors":["Osama Ahmad","Omer Abdul Jalil","Usman Nazir","Murtaza Taj"],"pdf_url":"https://arxiv.org/pdf/2310.02606v2.pdf","comment":"Accepted at ICONIP 2024"},{"id":"http://arxiv.org/abs/2406.04099v2","updated":"2024-08-30T08:05:08Z","published":"2024-06-06T14:15:12Z","title":"Enhancing Weather Predictions: Super-Resolution via Deep Diffusion\n Models","summary":" This study investigates the application of deep-learning diffusion models for\nthe super-resolution of weather data, a novel approach aimed at enhancing the\nspatial resolution and detail of meteorological variables. Leveraging the\ncapabilities of diffusion models, specifically the SR3 and ResDiff\narchitectures, we present a methodology for transforming low-resolution weather\ndata into high-resolution outputs. Our experiments, conducted using the\nWeatherBench dataset, focus on the super-resolution of the two-meter\ntemperature variable, demonstrating the models' ability to generate detailed\nand accurate weather maps. The results indicate that the ResDiff model, further\nimproved by incorporating physics-based modifications, significantly\noutperforms traditional SR3 methods in terms of Mean Squared Error (MSE),\nStructural Similarity Index (SSIM), and Peak Signal-to-Noise Ratio (PSNR). This\nresearch highlights the potential of diffusion models in meteorological\napplications, offering insights into their effectiveness, challenges, and\nprospects for future advancements in weather prediction and climate analysis.\n","authors":["Jan Martinů","Petr Šimánek"],"pdf_url":"https://arxiv.org/pdf/2406.04099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17064v1","updated":"2024-08-30T07:49:35Z","published":"2024-08-30T07:49:35Z","title":"Instant Adversarial Purification with Adversarial Consistency\n Distillation","summary":" Neural networks, despite their remarkable performance in widespread\napplications, including image classification, are also known to be vulnerable\nto subtle adversarial noise. Although some diffusion-based purification methods\nhave been proposed, for example, DiffPure, those methods are time-consuming. In\nthis paper, we propose One Step Control Purification (OSCP), a diffusion-based\npurification model that can purify the adversarial image in one Neural Function\nEvaluation (NFE) in diffusion models. We use Latent Consistency Model (LCM) and\nControlNet for our one-step purification. 
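(Illustrative note: the block adjacency construction A_B described above can be reproduced in a few lines; the example graphs are made up, and the point of the sketch is only that the diagonal blocks leave time steps disconnected, which is exactly the gap the proposed architecture mends.)

# Block adjacency matrix: adjacency matrices from successive time steps are
# placed on the diagonal, so nodes from different time steps stay
# disconnected unless temporal edges are added afterwards.
import numpy as np
from scipy.sparse import block_diag

A_t0 = np.array([[0, 1, 0],
                 [1, 0, 1],
                 [0, 1, 0]])   # graph at time step 0 (made-up example)
A_t1 = np.array([[0, 1],
                 [1, 0]])      # graph at time step 1 (made-up example)

A_B = block_diag((A_t0, A_t1)).toarray()
print(A_B)
# Message passing on A_B alone never crosses the diagonal blocks; learning
# the missing inter-block (temporal) links is what the paper addresses.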
OSCP is computationally friendly and\ntime efficient compared to other diffusion-based purification methods; we\nachieve defense success rate of 74.19\\% on ImageNet, only requiring 0.1s for\neach purification. Moreover, there is a fundamental incongruence between\nconsistency distillation and adversarial perturbation. To address this\nontological dissonance, we propose Gaussian Adversarial Noise Distillation\n(GAND), a novel consistency distillation framework that facilitates a more\nnuanced reconciliation of the latent space dynamics, effectively bridging the\nnatural and adversarial manifolds. Our experiments show that the GAND does not\nneed a Full Fine Tune (FFT); PEFT, e.g., LoRA is sufficient.\n","authors":["Chun Tong Lei","Hon Ming Yam","Zhongliang Guo","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.17064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17059v1","updated":"2024-08-30T07:38:28Z","published":"2024-08-30T07:38:28Z","title":"A Survey of the Self Supervised Learning Mechanisms for Vision\n Transformers","summary":" Deep supervised learning models require high volume of labeled data to attain\nsufficiently good results. Although, the practice of gathering and annotating\nsuch big data is costly and laborious. Recently, the application of self\nsupervised learning (SSL) in vision tasks has gained significant attention. The\nintuition behind SSL is to exploit the synchronous relationships within the\ndata as a form of self-supervision, which can be versatile. In the current big\ndata era, most of the data is unlabeled, and the success of SSL thus relies in\nfinding ways to improve this vast amount of unlabeled data available. Thus its\nbetter for deep learning algorithms to reduce reliance on human supervision and\ninstead focus on self-supervision based on the inherent relationships within\nthe data. With the advent of ViTs, which have achieved remarkable results in\ncomputer vision, it is crucial to explore and understand the various SSL\nmechanisms employed for training these models specifically in scenarios where\nthere is less label data available. In this survey we thus develop a\ncomprehensive taxonomy of systematically classifying the SSL techniques based\nupon their representations and pre-training tasks being applied. Additionally,\nwe discuss the motivations behind SSL, review popular pre-training tasks, and\nhighlight the challenges and advancements in this field. Furthermore, we\npresent a comparative analysis of different SSL methods, evaluate their\nstrengths and limitations, and identify potential avenues for future research.\n","authors":["Asifullah Khan","Anabia Sohail","Mustansar Fiaz","Mehdi Hassan","Tariq Habib Afridi","Sibghat Ullah Marwat","Farzeen Munir","Safdar Ali","Hannan Naseem","Muhammad Zaigham Zaheer","Kamran Ali","Tangina Sultana","Ziaurrehman Tanoli","Naeem Akhter"],"pdf_url":"https://arxiv.org/pdf/2408.17059v1.pdf","comment":"34 Pages, 5 Figures, 7 Tables"},{"id":"http://arxiv.org/abs/2405.11432v2","updated":"2024-08-30T07:37:25Z","published":"2024-05-19T03:27:31Z","title":"On Robust Reinforcement Learning with Lipschitz-Bounded Policy Networks","summary":" This paper presents a study of robust policy networks in deep reinforcement\nlearning. We investigate the benefits of policy parameterizations that\nnaturally satisfy constraints on their Lipschitz bound, analyzing their\nempirical performance and robustness on two representative problems: pendulum\nswing-up and Atari Pong. 
We illustrate that policy networks with smaller\nLipschitz bounds are more robust to disturbances, random noise, and targeted\nadversarial attacks than unconstrained policies composed of vanilla multi-layer\nperceptrons or convolutional neural networks. However, the structure of the\nLipschitz layer is important. We find that the widely-used method of spectral\nnormalization is too conservative and severely impacts clean performance,\nwhereas more expressive Lipschitz layers such as the recently-proposed Sandwich\nlayer can achieve improved robustness without sacrificing clean performance.\n","authors":["Nicholas H. Barbara","Ruigang Wang","Ian R. Manchester"],"pdf_url":"https://arxiv.org/pdf/2405.11432v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17053v1","updated":"2024-08-30T07:23:59Z","published":"2024-08-30T07:23:59Z","title":"Estimating Conditional Average Treatment Effects via Sufficient\n Representation Learning","summary":" Estimating the conditional average treatment effects (CATE) is very important\nin causal inference and has a wide range of applications across many fields. In\nthe estimation process of CATE, the unconfoundedness assumption is typically\nrequired to ensure the identifiability of the regression problems. When\nestimating CATE using high-dimensional data, there have been many variable\nselection methods and neural network approaches based on representation\nlearning, while these methods do not provide a way to verify whether the subset\nof variables after dimensionality reduction or the learned representations\nstill satisfy the unconfoundedness assumption during the estimation process,\nwhich can lead to ineffective estimates of the treatment effects. Additionally,\nthese methods typically use data from only the treatment or control group when\nestimating the regression functions for each group. This paper proposes a novel\nneural network approach named \\textbf{CrossNet} to learn a sufficient\nrepresentation for the features, based on which we then estimate the CATE,\nwhere cross indicates that in estimating the regression functions, we used data\nfrom their own group as well as cross-utilized data from another group.\nNumerical simulations and empirical results demonstrate that our method\noutperforms the competitive approaches.\n","authors":["Pengfei Shi","Wei Zhong","Xinyu Zhang","Ningtao Wang","Xing Fu","Weiqiang Wang","Yin Jin"],"pdf_url":"https://arxiv.org/pdf/2408.17053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13192v3","updated":"2024-08-30T06:49:51Z","published":"2024-01-24T02:36:52Z","title":"Generative Design of Crystal Structures by Point Cloud Representations\n and Diffusion Model","summary":" Efficiently generating energetically stable crystal structures has long been\na challenge in material design, primarily due to the immense arrangement of\natoms in a crystal lattice. To facilitate the discovery of stable material, we\npresent a framework for the generation of synthesizable materials, leveraging a\npoint cloud representation to encode intricate structural information. At the\nheart of this framework lies the introduction of a diffusion model as its\nfoundational pillar. To gauge the efficacy of our approach, we employ it to\nreconstruct input structures from our training datasets, rigorously validating\nits high reconstruction performance. Furthermore, we demonstrate the profound\npotential of Point Cloud-Based Crystal Diffusion (PCCD) by generating entirely\nnew materials, emphasizing their synthesizability. 
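(Illustrative note: a common way to obtain a Lipschitz-bounded policy of the kind compared above is to wrap each linear layer in spectral normalization, which the paper finds overly conservative relative to more expressive Lipschitz layers such as the Sandwich layer. The sketch assumes the PyTorch spectral_norm parametrization and an arbitrary small MLP; it is background, not the paper's code.)

# Spectral normalization divides each weight matrix by an estimate of its
# largest singular value, so every layer is approximately 1-Lipschitz and,
# with 1-Lipschitz activations such as tanh, so is the whole policy network.
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import spectral_norm

def lipschitz_mlp(obs_dim, act_dim, hidden=64):
    return nn.Sequential(
        spectral_norm(nn.Linear(obs_dim, hidden)),
        nn.Tanh(),
        spectral_norm(nn.Linear(hidden, hidden)),
        nn.Tanh(),
        spectral_norm(nn.Linear(hidden, act_dim)),
    )

policy = lipschitz_mlp(obs_dim=3, act_dim=1)   # e.g. a pendulum-like task
action = policy(torch.randn(1, 3))
print(action.shape)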
Our research stands as a\nnoteworthy contribution to the advancement of materials design and synthesis\nthrough the cutting-edge avenue of generative design instead of the\nconventional substitution or experience-based discovery.\n","authors":["Zhelin Li","Rami Mrad","Runxian Jiao","Guan Huang","Jun Shan","Shibing Chu","Yuanping Chen"],"pdf_url":"https://arxiv.org/pdf/2401.13192v3.pdf","comment":"I have submitted to a journal"},{"id":"http://arxiv.org/abs/2408.15545v2","updated":"2024-08-30T06:42:36Z","published":"2024-08-28T05:41:52Z","title":"SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding","summary":" Scientific literature understanding is crucial for extracting targeted\ninformation and garnering insights, thereby significantly advancing scientific\ndiscovery. Despite the remarkable success of Large Language Models (LLMs), they\nface challenges in scientific literature understanding, primarily due to (1) a\nlack of scientific knowledge and (2) unfamiliarity with specialized scientific\ntasks.\n To develop an LLM specialized in scientific literature understanding, we\npropose a hybrid strategy that integrates continual pre-training (CPT) and\nsupervised fine-tuning (SFT), to simultaneously infuse scientific domain\nknowledge and enhance instruction-following capabilities for domain-specific\ntasks.cIn this process, we identify two key challenges: (1) constructing\nhigh-quality CPT corpora, and (2) generating diverse SFT instructions. We\naddress these challenges through a meticulous pipeline, including PDF text\nextraction, parsing content error correction, quality filtering, and synthetic\ninstruction creation. Applying this strategy, we present a suite of LLMs:\nSciLitLLM, specialized in scientific literature understanding. These models\ndemonstrate promising performance on scientific literature understanding\nbenchmarks.\n Our contributions are threefold: (1) We present an effective framework that\nintegrates CPT and SFT to adapt LLMs to scientific literature understanding,\nwhich can also be easily adapted to other domains. (2) We propose an LLM-based\nsynthesis method to generate diverse and high-quality scientific instructions,\nresulting in a new instruction set -- SciLitIns -- for supervised fine-tuning\nin less-represented scientific domains. (3) SciLitLLM achieves promising\nperformance improvements on scientific literature understanding benchmarks.\n","authors":["Sihang Li","Jin Huang","Jiaxi Zhuang","Yaorui Shi","Xiaochen Cai","Mingjun Xu","Xiang Wang","Linfeng Zhang","Guolin Ke","Hengxing Cai"],"pdf_url":"https://arxiv.org/pdf/2408.15545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15268v2","updated":"2024-08-30T06:18:45Z","published":"2024-08-12T14:23:42Z","title":"Anomaly Detection in Time Series of EDFA Pump Currents to Monitor\n Degeneration Processes using Fuzzy Clustering","summary":" This article proposes a novel fuzzy clustering based anomaly detection method\nfor pump current time series of EDFA systems. The proposed change detection\nframework (CDF) strategically combines the advantages of entropy analysis (EA)\nand principle component analysis (PCA) with fuzzy clustering procedures. In the\nframework, EA is applied for dynamic selection of features for reduction of the\nfeature space and increase of computational performance. Furthermore, PCA is\nutilized to extract features from the raw feature space to enable\ngeneralization capability of the subsequent fuzzy clustering procedures. 
Three\ndifferent fuzzy clustering methods, more precisely the fuzzy clustering\nalgorithm, a probabilistic clustering algorithm and a possibilistic clustering\nalgorithm are evaluated for performance and generalization. Hence, the proposed\nframework has the innovative feature to detect changes in pump current time\nseries at an early stage for arbitrary points of operation, compared to\nstate-of-the-art predefined alarms in commercially used EDFAs. Moreover, the\napproach is implemented and tested using experimental data. In addition, the\nproposed framework enables further approaches of applying decentralized\npredictive maintenance for optical fiber networks.\n","authors":["Dominic Schneider","Lutz Rapp","Christoph Ament"],"pdf_url":"https://arxiv.org/pdf/2408.15268v2.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.19121v3","updated":"2024-08-30T06:17:46Z","published":"2024-06-27T12:05:55Z","title":"Towards Learning Abductive Reasoning using VSA Distributed\n Representations","summary":" We introduce the Abductive Rule Learner with Context-awareness (ARLC), a\nmodel that solves abstract reasoning tasks based on Learn-VRF. ARLC features a\nnovel and more broadly applicable training objective for abductive reasoning,\nresulting in better interpretability and higher accuracy when solving Raven's\nprogressive matrices (RPM). ARLC allows both programming domain knowledge and\nlearning the rules underlying a data distribution. We evaluate ARLC on the\nI-RAVEN dataset, showcasing state-of-the-art accuracy across both\nin-distribution and out-of-distribution (unseen attribute-rule pairs) tests.\nARLC surpasses neuro-symbolic and connectionist baselines, including large\nlanguage models, despite having orders of magnitude fewer parameters. We show\nARLC's robustness to post-programming training by incrementally learning from\nexamples on top of programmed knowledge, which only improves its performance\nand does not result in catastrophic forgetting of the programmed solution. We\nvalidate ARLC's seamless transfer learning from a 2x2 RPM constellation to\nunseen constellations. Our code is available at\nhttps://github.com/IBM/abductive-rule-learner-with-context-awareness.\n","authors":["Giacomo Camposampiero","Michael Hersche","Aleksandar Terzić","Roger Wattenhofer","Abu Sebastian","Abbas Rahimi"],"pdf_url":"https://arxiv.org/pdf/2406.19121v3.pdf","comment":"Accepted at the 18th International Conference on Neural-Symbolic\n Learning and Reasoning (NeSy) 2024 [Spotlight]"},{"id":"http://arxiv.org/abs/2310.13019v4","updated":"2024-08-30T05:50:56Z","published":"2023-10-18T18:50:39Z","title":"Tailoring Adversarial Attacks on Deep Neural Networks for Targeted Class\n Manipulation Using DeepFool Algorithm","summary":" The susceptibility of deep neural networks (DNNs) to adversarial attacks\nundermines their reliability across numerous applications, underscoring the\nnecessity for an in-depth exploration of these vulnerabilities and the\nformulation of robust defense strategies. The DeepFool algorithm by\nMoosavi-Dezfooli et al. (2016) represents a pivotal step in identifying minimal\nperturbations required to induce misclassification of input images.\nNonetheless, its generic methodology falls short in scenarios necessitating\ntargeted interventions. 
Additionally, previous research studies have\npredominantly concentrated on the success rate of attacks without adequately\naddressing the consequential distortion of images, the maintenance of image\nquality, or the confidence threshold required for misclassification. To bridge\nthese gaps, we introduce the Enhanced Targeted DeepFool (ET DeepFool)\nalgorithm, an evolution of DeepFool that not only facilitates the specification\nof desired misclassification targets but also incorporates a configurable\nminimum confidence score. Our empirical investigations demonstrate the\nsuperiority of this refined approach in maintaining the integrity of images and\nminimizing perturbations across a variety of DNN architectures. Unlike previous\niterations, such as the Targeted DeepFool by Gajjar et al. (2022), our method\ngrants unparalleled control over the perturbation process, enabling precise\nmanipulation of model responses. Preliminary outcomes reveal that certain\nmodels, including AlexNet and the advanced Vision Transformer, display\ncommendable robustness to such manipulations. This discovery of varying levels\nof model robustness, as unveiled through our confidence level adjustments,\ncould have far-reaching implications for the field of image recognition. Our\ncode will be made public upon acceptance of the paper.\n","authors":["S. M. Fazle Rabby Labib","Joyanta Jyoti Mondal","Meem Arafat Manab","Sarfaraz Newaz","Xi Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.13019v4.pdf","comment":"18 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.17016v1","updated":"2024-08-30T05:13:11Z","published":"2024-08-30T05:13:11Z","title":"Error-controlled non-additive interaction discovery in machine learning\n models","summary":" Machine learning (ML) models are powerful tools for detecting complex\npatterns within data, yet their \"black box\" nature limits their\ninterpretability, hindering their use in critical domains like healthcare and\nfinance. To address this challenge, interpretable ML methods have been\ndeveloped to explain how features influence model predictions. However, these\nmethods often focus on univariate feature importance, overlooking the complex\ninteractions between features that ML models are capable of capturing.\nRecognizing this limitation, recent efforts have aimed to extend these methods\nto discover feature interactions, but existing approaches struggle with\nrobustness and error control, especially under data perturbations. In this\nstudy, we introduce Diamond, a novel method for trustworthy feature interaction\ndiscovery. Diamond uniquely integrates the model-X knockoffs framework to\ncontrol the false discovery rate (FDR), ensuring that the proportion of falsely\ndiscovered interactions remains low. We further address the challenges of using\noff-the-shelf interaction importance measures by proposing a calibration\nprocedure that refines these measures to maintain the desired FDR. Diamond's\napplicability spans a wide range of ML models, including deep neural networks,\ntree-based models, and factorization-based models. 
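(Illustrative note: DeepFool's starting point is the closed-form minimal perturbation for an affine binary classifier, sketched below with synthetic weights; the Enhanced Targeted DeepFool algorithm described above extends this idea with target classes and a configurable confidence threshold, which this sketch does not implement.)

# For f(x) = w.x + b, the smallest L2 perturbation reaching the decision
# boundary is r = -f(x) / ||w||^2 * w; a small overshoot crosses it.
import numpy as np

rng = np.random.default_rng(0)
w, b = rng.normal(size=5), 0.3
x = rng.normal(size=5)

fx = w @ x + b
r = -fx / np.dot(w, w) * w        # minimal perturbation onto the boundary
x_adv = x + 1.02 * r              # slight overshoot to flip the prediction

print(np.sign(fx), np.sign(w @ x_adv + b))   # the two signs differ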
Our empirical evaluations on\nboth simulated and real datasets across various biomedical studies demonstrate\nDiamond's utility in enabling more reliable data-driven scientific discoveries.\nThis method represents a significant step forward in the deployment of ML\nmodels for scientific innovation and hypothesis generation.\n","authors":["Winston Chen","Yifan Jiang","William Stafford Noble","Yang Young Lu"],"pdf_url":"https://arxiv.org/pdf/2408.17016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15478v3","updated":"2024-08-30T05:02:12Z","published":"2024-02-23T18:12:53Z","title":"Transformers are Expressive, But Are They Expressive Enough for\n Regression?","summary":" Transformers have become pivotal in Natural Language Processing,\ndemonstrating remarkable success in applications like Machine Translation and\nSummarization. Given their widespread adoption, several works have attempted to\nanalyze the expressivity of Transformers. Expressivity of a neural network is\nthe class of functions it can approximate. A neural network is fully expressive\nif it can act as a universal function approximator. We attempt to analyze the\nsame for Transformers. Contrary to existing claims, our findings reveal that\nTransformers struggle to reliably approximate smooth functions, relying on\npiecewise constant approximations with sizable intervals. The central question\nemerges as: ''Are Transformers truly Universal Function Approximators?'' To\naddress this, we conduct a thorough investigation, providing theoretical\ninsights and supporting evidence through experiments. Theoretically, we prove\nthat Transformer Encoders cannot approximate smooth functions. Experimentally,\nwe complement our theory and show that the full Transformer architecture cannot\napproximate smooth functions. By shedding light on these challenges, we\nadvocate a refined understanding of Transformers' capabilities. Code Link:\nhttps://github.com/swaroop-nath/transformer-expressivity.\n","authors":["Swaroop Nath","Harshad Khadilkar","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2402.15478v3.pdf","comment":"18 pages, 17 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.17011v1","updated":"2024-08-30T04:51:19Z","published":"2024-08-30T04:51:19Z","title":"Disease Classification and Impact of Pretrained Deep Convolution Neural\n Networks on Diverse Medical Imaging Datasets across Imaging Modalities","summary":" Imaging techniques such as Chest X-rays, whole slide images, and optical\ncoherence tomography serve as the initial screening and detection for a wide\nvariety of medical pulmonary and ophthalmic conditions respectively. This paper\ninvestigates the intricacies of using pretrained deep convolutional neural\nnetworks with transfer learning across diverse medical imaging datasets with\nvarying modalities for binary and multiclass classification. We conducted a\ncomprehensive performance analysis with ten network architectures and model\nfamilies each with pretraining and random initialization. Our finding showed\nthat the use of pretrained models as fixed feature extractors yields poor\nperformance irrespective of the datasets. Contrary, histopathology microscopy\nwhole slide images have better performance. It is also found that deeper and\nmore complex architectures did not necessarily result in the best performance.\nThis observation implies that the improvements in ImageNet are not parallel to\nthe medical imaging tasks. 
Within a medical domain, the performance of the\nnetwork architectures varies within model families with shifts in datasets.\nThis indicates that the performance of models within a specific modality may\nnot be conclusive for another modality within the same domain. This study\nprovides a deeper understanding of the applications of deep learning techniques\nin medical imaging and highlights the impact of pretrained networks across\ndifferent medical imaging datasets under five different experimental settings.\n","authors":["Jutika Borah","Kumaresh Sarmah","Hidam Kumarjit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.17011v1.pdf","comment":"15 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.17010v1","updated":"2024-08-30T04:50:27Z","published":"2024-08-30T04:50:27Z","title":"Improving Time Series Classification with Representation Soft Label\n Smoothing","summary":" Previous research has indicated that deep neural network based models for\ntime series classification (TSC) tasks are prone to overfitting. This issue can\nbe mitigated by employing strategies that prevent the model from becoming\noverly confident in its predictions, such as label smoothing and confidence\npenalty. Building upon the concept of label smoothing, we propose a novel\napproach to generate more reliable soft labels, which we refer to as\nrepresentation soft label smoothing. We apply label smoothing, confidence\npenalty, and our method representation soft label smoothing to several TSC\nmodels and compare their performance with baseline method which only uses hard\nlabels for training. Our results demonstrate that the use of these enhancement\ntechniques yields competitive results compared to the baseline method.\nImportantly, our method demonstrates strong performance across models with\nvarying structures and complexities.\n","authors":["Hengyi Ma","Weitong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.17010v1.pdf","comment":"14 pages,6 figures"},{"id":"http://arxiv.org/abs/2408.17008v1","updated":"2024-08-30T04:40:35Z","published":"2024-08-30T04:40:35Z","title":"Evaluation of Table Representations to Answer Questions from Tables in\n Documents : A Case Study using 3GPP Specifications","summary":" With the ubiquitous use of document corpora for question answering, one\nimportant aspect which is especially relevant for technical documents is the\nability to extract information from tables which are interspersed with text.\nThe major challenge in this is that unlike free-flow text or isolated set of\ntables, the representation of a table in terms of what is a relevant chunk is\nnot obvious. We conduct a series of experiments examining various\nrepresentations of tabular data interspersed with text to understand the\nrelative benefits of different representations. We choose a corpus of $3^{rd}$\nGeneration Partnership Project (3GPP) documents since they are heavily\ninterspersed with tables. We create expert curated dataset of question answers\nto evaluate our approach. 
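(Illustrative note: the baseline technique that representation soft label smoothing builds on is ordinary label smoothing, sketched below; the epsilon value and the eps/(K-1) convention are assumptions for the example, and the paper's proposed soft labels are derived differently.)

# Standard label smoothing: for K classes and smoothing factor eps, the
# target puts 1-eps on the true class and eps/(K-1) on every other class.
import numpy as np

def smooth_labels(y, num_classes, eps=0.1):
    targets = np.full((len(y), num_classes), eps / (num_classes - 1))
    targets[np.arange(len(y)), y] = 1.0 - eps
    return targets

y = np.array([0, 2, 1])
print(smooth_labels(y, num_classes=3, eps=0.1))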
We conclude that row level representations with\ncorresponding table header information being included in every cell improves\nthe performance of the retrieval, thus leveraging the structural information\npresent in the tabular data.\n","authors":["Sujoy Roychowdhury","Sumit Soman","HG Ranjani","Avantika Sharma","Neeraj Gunda","Sai Krishna Bala"],"pdf_url":"https://arxiv.org/pdf/2408.17008v1.pdf","comment":"10 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.16999v1","updated":"2024-08-30T04:11:35Z","published":"2024-08-30T04:11:35Z","title":"A Tighter Convergence Proof of Reverse Experience Replay","summary":" In reinforcement learning, Reverse Experience Replay (RER) is a recently\nproposed algorithm that attains better sample complexity than the classic\nexperience replay method. RER requires the learning algorithm to update the\nparameters through consecutive state-action-reward tuples in reverse order.\nHowever, the most recent theoretical analysis only holds for a minimal learning\nrate and short consecutive steps, which converge slower than those large\nlearning rate algorithms without RER. In view of this theoretical and empirical\ngap, we provide a tighter analysis that mitigates the limitation on the\nlearning rate and the length of consecutive steps. Furthermore, we show\ntheoretically that RER converges with a larger learning rate and a longer\nsequence.\n","authors":["Nan Jiang","Jinzhao Li","Yexiang Xue"],"pdf_url":"https://arxiv.org/pdf/2408.16999v1.pdf","comment":"This paper is accepted at RLC 2024"},{"id":"http://arxiv.org/abs/2408.16993v1","updated":"2024-08-30T03:43:37Z","published":"2024-08-30T03:43:37Z","title":"A Scalable k-Medoids Clustering via Whale Optimization Algorithm","summary":" Unsupervised clustering has emerged as a critical tool for uncovering hidden\npatterns and insights from vast, unlabeled datasets. However, traditional\nmethods like Partitioning Around Medoids (PAM) struggle with scalability due to\ntheir quadratic computational complexity. To address this limitation, we\nintroduce WOA-kMedoids, a novel unsupervised clustering method that\nincorporates the Whale Optimization Algorithm (WOA), a nature-inspired\nmetaheuristic inspired by the hunting strategies of humpback whales. By\noptimizing centroid selection, WOA-kMedoids reduces computational complexity of\nthe k-medoids algorithm from quadratic to near-linear with respect to the\nnumber of observations. This improvement in efficiency enables WOA-kMedoids to\nbe scalable to large datasets while maintaining high clustering accuracy. We\nevaluated the performance of WOA-kMedoids on 25 diverse time series datasets\nfrom the UCR archive. Our empirical results demonstrate that WOA-kMedoids\nmaintains clustering accuracy similar to PAM. While WOA-kMedoids exhibited\nslightly higher runtime than PAM on small datasets (less than 300\nobservations), it outperformed PAM in computational efficiency on larger\ndatasets. The scalability of WOA-kMedoids, combined with its consistently high\naccuracy, positions it as a promising and practical choice for unsupervised\nclustering in big data applications. 
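(Illustrative note: the row-level representation found most effective above can be approximated by emitting one retrieval chunk per table row with the column header repeated beside every cell value, as in the sketch below; the table contents are invented and only stand in for 3GPP-style parameter tables.)

# One chunk per row, header repeated next to each cell value.
def rows_to_chunks(headers, rows):
    chunks = []
    for row in rows:
        cells = [f"{h}: {v}" for h, v in zip(headers, row)]
        chunks.append(" | ".join(cells))
    return chunks

headers = ["Parameter", "Value", "Unit"]          # made-up example table
rows = [["maxNrofSCSs", "4", "-"],
        ["t-Reordering", "100", "ms"]]
for chunk in rows_to_chunks(headers, rows):
    print(chunk)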
WOA-kMedoids has implications for\nefficient knowledge discovery in massive, unlabeled datasets across various\ndomains.\n","authors":["Huang Chenan","Narumasa Tsutsumida"],"pdf_url":"https://arxiv.org/pdf/2408.16993v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.16987v1","updated":"2024-08-30T03:22:35Z","published":"2024-08-30T03:22:35Z","title":"From Model Explanation to Data Misinterpretation: Uncovering the\n Pitfalls of Post Hoc Explainers in Business Research","summary":" Machine learning models have been increasingly used in business research.\nHowever, most state-of-the-art machine learning models, such as deep neural\nnetworks and XGBoost, are black boxes in nature. Therefore, post hoc explainers\nthat provide explanations for machine learning models by, for example,\nestimating numerical importance of the input features, have been gaining wide\nusage. Despite the intended use of post hoc explainers being explaining machine\nlearning models, we found a growing trend in business research where post hoc\nexplanations are used to draw inferences about the data. In this work, we\ninvestigate the validity of such use. Specifically, we investigate with\nextensive experiments whether the explanations obtained by the two most popular\npost hoc explainers, SHAP and LIME, provide correct information about the true\nmarginal effects of X on Y in the data, which we call data-alignment. We then\nidentify what factors influence the alignment of explanations. Finally, we\npropose a set of mitigation strategies to improve the data-alignment of\nexplanations and demonstrate their effectiveness with real-world data in an\neconometric context. In spite of this effort, we nevertheless conclude that it\nis often not appropriate to infer data insights from post hoc explanations. We\narticulate appropriate alternative uses, the most important of which is to\nfacilitate the proposition and subsequent empirical investigation of\nhypotheses. The ultimate goal of this paper is to caution business researchers\nagainst translating post hoc explanations of machine learning models into\npotentially false insights and understanding of data.\n","authors":["Ronilo Ragodos","Tong Wang","Lu Feng"," Yu"," Hu"],"pdf_url":"https://arxiv.org/pdf/2408.16987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06051v2","updated":"2024-08-30T03:19:26Z","published":"2024-08-12T10:55:42Z","title":"Perceptual Similarity for Measuring Decision-Making Style and Policy\n Diversity in Games","summary":" Defining and measuring decision-making styles, also known as playstyles, is\ncrucial in gaming, where these styles reflect a broad spectrum of individuality\nand diversity. However, finding a universally applicable measure for these\nstyles poses a challenge. Building on Playstyle Distance, the first\nunsupervised metric to measure playstyle similarity based on game screens and\nraw actions, we introduce three enhancements to increase accuracy: multiscale\nanalysis with varied state granularity, a perceptual kernel rooted in\npsychology, and the utilization of the intersection-over-union method for\nefficient evaluation. These innovations not only advance measurement precision\nbut also offer insights into human cognition of similarity. 
Across two racing\ngames and seven Atari games, our techniques significantly improve the precision\nof zero-shot playstyle classification, achieving an accuracy exceeding 90\npercent with fewer than 512 observation-action pairs, which is less than half\nan episode of these games. Furthermore, our experiments with 2048 and Go\ndemonstrate the potential of discrete playstyle measures in puzzle and board\ngames. We also develop an algorithm for assessing decision-making diversity\nusing these measures. Our findings improve the measurement of end-to-end game\nanalysis and the evolution of artificial intelligence for diverse playstyles.\n","authors":["Chiu-Chou Lin","Wei-Chen Chiu","I-Chen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.06051v2.pdf","comment":"TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49"},{"id":"http://arxiv.org/abs/2408.16981v1","updated":"2024-08-30T03:03:03Z","published":"2024-08-30T03:03:03Z","title":"The Sample-Communication Complexity Trade-off in Federated Q-Learning","summary":" We consider the problem of federated Q-learning, where $M$ agents aim to\ncollaboratively learn the optimal Q-function of an unknown infinite-horizon\nMarkov decision process with finite state and action spaces. We investigate the\ntrade-off between sample and communication complexities for the widely used\nclass of intermittent communication algorithms. We first establish the converse\nresult, where it is shown that a federated Q-learning algorithm that offers any\nspeedup with respect to the number of agents in the per-agent sample complexity\nneeds to incur a communication cost of at least an order of\n$\\frac{1}{1-\\gamma}$ up to logarithmic factors, where $\\gamma$ is the discount\nfactor. We also propose a new algorithm, called Fed-DVR-Q, which is the first\nfederated Q-learning algorithm to simultaneously achieve order-optimal sample\nand communication complexities. Thus, together these results provide a complete\ncharacterization of the sample-communication complexity trade-off in federated\nQ-learning.\n","authors":["Sudeep Salgia","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2408.16981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16725v2","updated":"2024-08-30T02:53:48Z","published":"2024-08-29T17:18:53Z","title":"Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming","summary":" Recent advances in language models have achieved significant progress.\nGPT-4o, as a new milestone, has enabled real-time conversations with humans,\ndemonstrating near-human natural fluency. Such human-computer interaction\nnecessitates models with the capability to perform reasoning directly with the\naudio modality and generate output in streaming. However, this remains beyond\nthe reach of current academic models, as they typically depend on extra TTS\nsystems for speech synthesis, resulting in undesirable latency. This paper\nintroduces the Mini-Omni, an audio-based end-to-end conversational model,\ncapable of real-time speech interaction. To achieve this capability, we propose\na text-instructed speech generation method, along with batch-parallel\nstrategies during inference to further boost the performance. Our method also\nhelps to retain the original model's language capabilities with minimal\ndegradation, enabling other works to establish real-time interaction\ncapabilities. We call this training method \"Any Model Can Talk\". We also\nintroduce the VoiceAssistant-400K dataset to fine-tune models optimized for\nspeech output. 
To our best knowledge, Mini-Omni is the first fully end-to-end,\nopen-source model for real-time speech interaction, offering valuable potential\nfor future research.\n","authors":["Zhifei Xie","Changqiao Wu"],"pdf_url":"https://arxiv.org/pdf/2408.16725v2.pdf","comment":"Technical report, work in progress. Demo and code:\n https://github.com/gpt-omni/mini-omni"},{"id":"http://arxiv.org/abs/2402.01138v3","updated":"2024-08-30T02:53:24Z","published":"2024-02-02T04:30:58Z","title":"Graph Neural Networks in EEG-based Emotion Recognition: A Survey","summary":" Compared to other modalities, EEG-based emotion recognition can intuitively\nrespond to the emotional patterns in the human brain and, therefore, has become\none of the most concerning tasks in the brain-computer interfaces field. Since\ndependencies within brain regions are closely related to emotion, a significant\ntrend is to develop Graph Neural Networks (GNNs) for EEG-based emotion\nrecognition. However, brain region dependencies in emotional EEG have\nphysiological bases that distinguish GNNs in this field from those in other\ntime series fields. Besides, there is neither a comprehensive review nor\nguidance for constructing GNNs in EEG-based emotion recognition. In the survey,\nour categorization reveals the commonalities and differences of existing\napproaches under a unified framework of graph construction. We analyze and\ncategorize methods from three stages in the framework to provide clear guidance\non constructing GNNs in EEG-based emotion recognition. In addition, we discuss\nseveral open challenges and future directions, such as Temporal full-connected\ngraph and Graph condensation.\n","authors":["Chenyu Liu","Xinliang Zhou","Yihao Wu","Ruizhi Yang","Zhongruo Wang","Liming Zhai","Ziyu Jia","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01138v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04261v2","updated":"2024-08-30T02:47:43Z","published":"2024-03-07T06:52:51Z","title":"Advancing Chinese biomedical text mining with community challenges","summary":" Objective: This study aims to review the recent advances in community\nchallenges for biomedical text mining in China. Methods: We collected\ninformation of evaluation tasks released in community challenges of biomedical\ntext mining, including task description, dataset description, data source, task\ntype and related links. A systematic summary and comparative analysis were\nconducted on various biomedical natural language processing tasks, such as\nnamed entity recognition, entity normalization, attribute extraction, relation\nextraction, event extraction, text classification, text similarity, knowledge\ngraph construction, question answering, text generation, and large language\nmodel evaluation. Results: We identified 39 evaluation tasks from 6 community\nchallenges that spanned from 2017 to 2023. Our analysis revealed the diverse\nrange of evaluation task types and data sources in biomedical text mining. We\nexplored the potential clinical applications of these community challenge tasks\nfrom a translational biomedical informatics perspective. We compared with their\nEnglish counterparts, and discussed the contributions, limitations, lessons and\nguidelines of these community challenges, while highlighting future directions\nin the era of large language models. 
Conclusion: Community challenge evaluation\ncompetitions have played a crucial role in promoting technology innovation and\nfostering interdisciplinary collaboration in the field of biomedical text\nmining. These challenges provide valuable platforms for researchers to develop\nstate-of-the-art solutions.\n","authors":["Hui Zong","Rongrong Wu","Jiaxue Cha","Weizhe Feng","Erman Wu","Jiakun Li","Aibin Shao","Liang Tao","Zuofeng Li","Buzhou Tang","Bairong Shen"],"pdf_url":"https://arxiv.org/pdf/2403.04261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16978v1","updated":"2024-08-30T02:44:26Z","published":"2024-08-30T02:44:26Z","title":"Training Ultra Long Context Language Model with Fully Pipelined\n Distributed Transformer","summary":" Large Language Models (LLMs) with long context capabilities are integral to\ncomplex tasks in natural language processing and computational biology, such as\ntext generation and protein sequence analysis. However, training LLMs directly\non extremely long contexts demands considerable GPU resources and increased\nmemory, leading to higher costs and greater complexity. Alternative approaches\nthat introduce long context capabilities via downstream finetuning or\nadaptations impose significant design limitations. In this paper, we propose\nFully Pipelined Distributed Transformer (FPDT) for efficiently training\nlong-context LLMs with extreme hardware efficiency. For GPT and Llama models,\nwe achieve a 16x increase in sequence length that can be trained on the same\nhardware compared to current state-of-the-art solutions. With our dedicated\nsequence chunk pipeline design, we can now train 8B LLM with 2 million sequence\nlength on only 4 GPUs, while also maintaining over 55% of MFU. Our proposed\nFPDT is agnostic to existing training techniques and is proven to work\nefficiently across different LLM models.\n","authors":["Jinghan Yao","Sam Ade Jacobs","Masahiro Tanaka","Olatunji Ruwase","Aamir Shafi","Hari Subramoni","Dhabaleswar K. Panda"],"pdf_url":"https://arxiv.org/pdf/2408.16978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16975v1","updated":"2024-08-30T02:36:36Z","published":"2024-08-30T02:36:36Z","title":"Technical Report of HelixFold3 for Biomolecular Structure Prediction","summary":" The AlphaFold series has transformed protein structure prediction with\nremarkable accuracy, often matching experimental methods. AlphaFold2,\nAlphaFold-Multimer, and the latest AlphaFold3 represent significant strides in\npredicting single protein chains, protein complexes, and biomolecular\nstructures. While AlphaFold2 and AlphaFold-Multimer are open-sourced,\nfacilitating rapid and reliable predictions, AlphaFold3 remains partially\naccessible through a limited online server and has not been open-sourced,\nrestricting further development. To address these challenges, the PaddleHelix\nteam is developing HelixFold3, aiming to replicate AlphaFold3's capabilities.\nUsing insights from previous models and extensive datasets, HelixFold3 achieves\nan accuracy comparable to AlphaFold3 in predicting the structures of\nconventional ligands, nucleic acids, and proteins. The initial release of\nHelixFold3 is available as open source on GitHub for academic research,\npromising to advance biomolecular research and accelerate discoveries. 
We also\nprovide online service at PaddleHelix website at\nhttps://paddlehelix.baidu.com/app/all/helixfold3/forecast.\n","authors":["Lihang Liu","Shanzhuo Zhang","Yang Xue","Xianbin Ye","Kunrui Zhu","Yuxin Li","Yang Liu","Xiaonan Zhang","Xiaomin Fang"],"pdf_url":"https://arxiv.org/pdf/2408.16975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08994v2","updated":"2024-08-30T02:26:29Z","published":"2024-08-16T19:52:53Z","title":"Model-based RL as a Minimalist Approach to Horizon-Free and Second-Order\n Bounds","summary":" Learning a transition model via Maximum Likelihood Estimation (MLE) followed\nby planning inside the learned model is perhaps the most standard and simplest\nModel-based Reinforcement Learning (RL) framework. In this work, we show that\nsuch a simple Model-based RL scheme, when equipped with optimistic and\npessimistic planning procedures, achieves strong regret and sample complexity\nbounds in online and offline RL settings. Particularly, we demonstrate that\nunder the conditions where the trajectory-wise reward is normalized between\nzero and one and the transition is time-homogenous, it achieves horizon-free\nand second-order bounds. Horizon-free means that our bounds have no polynomial\ndependence on the horizon of the Markov Decision Process. A second-order bound\nis a type of instance-dependent bound that scales with respect to the variances\nof the returns of the policies which can be small when the system is nearly\ndeterministic and (or) the optimal policy has small values. We highlight that\nour algorithms are simple, fairly standard, and indeed have been extensively\nstudied in the RL literature: they learn a model via MLE, build a version space\naround the MLE solution, and perform optimistic or pessimistic planning\ndepending on whether operating in the online or offline mode. These algorithms\ndo not rely on additional specialized algorithmic designs such as learning\nvariances and performing variance-weighted learning and thus can leverage rich\nfunction approximations that are significantly beyond linear or tabular\nstructures. The simplicity of the algorithms also implies that our horizon-free\nand second-order regret analysis is actually standard and mainly follows the\ngeneral framework of optimism/pessimism in the face of uncertainty.\n","authors":["Zhiyong Wang","Dongruo Zhou","John C. S. Lui","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2408.08994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16969v1","updated":"2024-08-30T02:07:13Z","published":"2024-08-30T02:07:13Z","title":"Point Neuron Learning: A New Physics-Informed Neural Network\n Architecture","summary":" Machine learning and neural networks have advanced numerous research domains,\nbut challenges such as large training data requirements and inconsistent model\nperformance hinder their application in certain scientific problems. To\novercome these challenges, researchers have investigated integrating physics\nprinciples into machine learning models, mainly through: (i) physics-guided\nloss functions, generally termed as physics-informed neural networks, and (ii)\nphysics-guided architectural design. While both approaches have demonstrated\nsuccess across multiple scientific disciplines, they have limitations including\nbeing trapped to a local minimum, poor interpretability, and restricted\ngeneralizability. 
This paper proposes a new physics-informed neural network\n(PINN) architecture that combines the strengths of both approaches by embedding\nthe fundamental solution of the wave equation into the network architecture,\nenabling the learned model to strictly satisfy the wave equation. The proposed\npoint neuron learning method can model an arbitrary sound field based on\nmicrophone observations without any dataset. Compared to other PINN methods,\nour approach directly processes complex numbers and offers better\ninterpretability and generalizability. We evaluate the versatility of the\nproposed architecture by a sound field reconstruction problem in a reverberant\nenvironment. Results indicate that the point neuron method outperforms two\ncompeting methods and can efficiently handle noisy environments with sparse\nmicrophone observations.\n","authors":["Hanwen Bi","Thushara D. Abhayapala"],"pdf_url":"https://arxiv.org/pdf/2408.16969v1.pdf","comment":"under the review process of EURASIP Journal on Audio, Speech, and\n Music Processing"},{"id":"http://arxiv.org/abs/2408.16966v1","updated":"2024-08-30T01:56:57Z","published":"2024-08-30T01:56:57Z","title":"UserSumBench: A Benchmark Framework for Evaluating User Summarization\n Approaches","summary":" Large language models (LLMs) have shown remarkable capabilities in generating\nuser summaries from a long list of raw user activity data. These summaries\ncapture essential user information such as preferences and interests, and\ntherefore are invaluable for LLM-based personalization applications, such as\nexplainable recommender systems. However, the development of new summarization\ntechniques is hindered by the lack of ground-truth labels, the inherent\nsubjectivity of user summaries, and human evaluation which is often costly and\ntime-consuming. To address these challenges, we introduce \\UserSumBench, a\nbenchmark framework designed to facilitate iterative development of LLM-based\nsummarization approaches. This framework offers two key components: (1) A\nreference-free summary quality metric. We show that this metric is effective\nand aligned with human preferences across three diverse datasets (MovieLens,\nYelp and Amazon Review). (2) A novel robust summarization method that leverages\ntime-hierarchical summarizer and self-critique verifier to produce high-quality\nsummaries while eliminating hallucination. This method serves as a strong\nbaseline for further innovation in summarization techniques.\n","authors":["Chao Wang","Neo Wu","Lin Ning","Luyang Liu","Jun Xie","Shawn O'Banion","Bradley Green"],"pdf_url":"https://arxiv.org/pdf/2408.16966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14520v2","updated":"2024-08-30T01:26:43Z","published":"2024-08-26T06:36:42Z","title":"Towards Graph Prompt Learning: A Survey and Beyond","summary":" Large-scale \"pre-train and prompt learning\" paradigms have demonstrated\nremarkable adaptability, enabling broad applications across diverse domains\nsuch as question answering, image recognition, and multimodal retrieval. This\napproach fully leverages the potential of large-scale pre-trained models,\nreducing downstream data requirements and computational costs while enhancing\nmodel applicability across various tasks. Graphs, as versatile data structures\nthat capture relationships between entities, play pivotal roles in fields such\nas social network analysis, recommender systems, and biological graphs. 
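(Illustrative note: one way to see what embedding the fundamental solution of the wave equation buys is the sketch below, where a sound field is modeled as a sum of point sources with complex weights and fit to microphone observations by least squares, so the reconstruction satisfies the wave equation by construction; the geometry, frequency, and linear solver are assumptions for the example and not the proposed point neuron architecture.)

# Each "point source" contributes the free-space Green's function of the
# Helmholtz equation, exp(i*k*r) / (4*pi*r); complex weights over candidate
# source positions are fit to synthetic microphone measurements.
import numpy as np

def green(src, mic, k):
    r = np.linalg.norm(mic - src, axis=-1)
    return np.exp(1j * k * r) / (4 * np.pi * r)

k = 2 * np.pi * 500 / 343.0                      # wavenumber at 500 Hz
rng = np.random.default_rng(0)
mics = rng.uniform(-1, 1, size=(32, 3))          # microphone positions
cands = rng.uniform(-3, 3, size=(20, 3)) + 5.0   # candidate source positions

# Synthetic "measurements" produced by one true source among the candidates.
p_meas = green(cands[3], mics, k)

# Design matrix: column j is the field of candidate source j at all mics.
G = np.stack([green(c, mics, k) for c in cands], axis=1)
w, *_ = np.linalg.lstsq(G, p_meas, rcond=None)
print(np.abs(w).round(3))                        # weight ~1 at index 3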
Despite\nthe success of pre-train and prompt learning paradigms in Natural Language\nProcessing (NLP) and Computer Vision (CV), their application in graph domains\nremains nascent. In graph-structured data, not only do the node and edge\nfeatures often have disparate distributions, but the topological structures\nalso differ significantly. This diversity in graph data can lead to\nincompatible patterns or gaps between pre-training and fine-tuning on\ndownstream graphs. We aim to bridge this gap by summarizing methods for\nalleviating these disparities. This includes exploring prompt design\nmethodologies, comparing related techniques, assessing application scenarios\nand datasets, and identifying unresolved problems and challenges. This survey\ncategorizes over 100 relevant works in this field, summarizing general design\nprinciples and the latest applications, including text-attributed graphs,\nmolecules, proteins, and recommendation systems. Through this extensive review,\nwe provide a foundational understanding of graph prompt learning, aiming to\nimpact not only the graph mining community but also the broader Artificial\nGeneral Intelligence (AGI) community.\n","authors":["Qingqing Long","Yuchen Yan","Peiyan Zhang","Chen Fang","Wentao Cui","Zhiyuan Ning","Meng Xiao","Ning Cao","Xiao Luo","Lingjun Xu","Shiyue Jiang","Zheng Fang","Chong Chen","Xian-Sheng Hua","Yuanchun Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.14520v2.pdf","comment":"19 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.07000v2","updated":"2024-08-30T01:19:42Z","published":"2024-07-09T16:13:26Z","title":"Etalon: Holistic Performance Evaluation Framework for LLM Inference\n Systems","summary":" Serving large language models (LLMs) in production can incur substantial\ncosts, which has prompted recent advances in inference system optimizations.\nToday, these systems are evaluated against conventional latency and throughput\nmetrics (eg. TTFT, TBT, Normalised Latency and TPOT). However, these metrics\nfail to fully capture the nuances of LLM inference, leading to an incomplete\nassessment of user-facing performance crucial for real-time applications such\nas chat and translation. In this paper, we first identify the pitfalls of\ncurrent performance metrics in evaluating LLM inference systems. We then\npropose Etalon, a comprehensive performance evaluation framework that includes\nfluidity-index -- a novel metric designed to reflect the intricacies of the LLM\ninference process and its impact on real-time user experience. Finally, we\nevaluate various existing open-source platforms and model-as-a-service\nofferings using Etalon, discussing their strengths and weaknesses. Etalon is\navailable at https://github.com/project-etalon/etalon.\n","authors":["Amey Agrawal","Anmol Agarwal","Nitin Kedia","Jayashree Mohan","Souvik Kundu","Nipun Kwatra","Ramachandran Ramjee","Alexey Tumanov"],"pdf_url":"https://arxiv.org/pdf/2407.07000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16958v1","updated":"2024-08-30T01:09:32Z","published":"2024-08-30T01:09:32Z","title":"Discovery of False Data Injection Schemes on Frequency Controllers with\n Reinforcement Learning","summary":" While inverter-based distributed energy resources (DERs) play a crucial role\nin integrating renewable energy into the power system, they concurrently\ndiminish the grid's system inertia, elevating the risk of frequency\ninstabilities. 
Furthermore, smart inverters, interfaced via communication\nnetworks, pose a potential vulnerability to cyber threats if not diligently\nmanaged. To proactively fortify the power grid against sophisticated cyber\nattacks, we propose to employ reinforcement learning (RL) to identify potential\nthreats and system vulnerabilities. This study concentrates on analyzing\nadversarial strategies for false data injection, specifically targeting smart\ninverters involved in primary frequency control. Our findings demonstrate that\nan RL agent can adeptly discern optimal false data injection methods to\nmanipulate inverter settings, potentially causing catastrophic consequences.\n","authors":["Romesh Prasad","Malik Hassanaly","Xiangyu Zhang","Abhijeet Sahu"],"pdf_url":"https://arxiv.org/pdf/2408.16958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15991v2","updated":"2024-08-30T01:00:09Z","published":"2023-10-24T16:39:06Z","title":"WhiteFox: White-Box Compiler Fuzzing Empowered by Large Language Models","summary":" Compiler correctness is crucial, as miscompilation can falsify program\nbehaviors, leading to serious consequences. Fuzzing has been studied to uncover\ncompiler defects. However, compiler fuzzing remains challenging: Existing arts\nfocus on black- and grey-box fuzzing, which generates tests without sufficient\nunderstanding of internal compiler behaviors. Meanwhile, traditional white-box\ntechniques, like symbolic execution, are computationally inapplicable to the\ngiant codebase of compilers. Recent advances demonstrate that Large Language\nModels (LLMs) excel in code generation/understanding tasks. Nonetheless,\nguiding LLMs with compiler source-code information remains a missing piece of\nresearch in compiler testing.\n To this end, we propose WhiteFox, the first white-box compiler fuzzer using\nLLMs with source-code information to test compiler optimization, with a\nspotlight on detecting deep logic bugs in the deep learning (DL) compilers.\nWhiteFox adopts a multi-agent framework: an LLM-based analysis agent examines\nthe low-level optimization source code and produces requirements on the\nhigh-level test programs that can trigger the optimization; an LLM-based\ngeneration agent produces test programs based on the summarized requirements.\nAdditionally, optimization-triggering tests are used as feedback to enhance the\ngeneration on the fly. Our evaluation on the three most popular DL compilers\n(i.e., PyTorch Inductor, TensorFlow-XLA, and TensorFlow Lite) shows WhiteFox\ncan generate high-quality test programs to exercise deep optimizations,\npracticing up to 8X more than state-of-the-art fuzzers. WhiteFox has found 101\nbugs for the DL compilers, with 92 confirmed as previously unknown and 70\nfixed. WhiteFox has been acknowledged by the PyTorch team and is being\nincorporated into its development workflow. Beyond DL compilers, WhiteFox can\nalso be adapted for compilers in different domains.\n","authors":["Chenyuan Yang","Yinlin Deng","Runyu Lu","Jiayi Yao","Jiawei Liu","Reyhaneh Jabbarvand","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.15991v2.pdf","comment":"Published in OOPSLA 2024"},{"id":"http://arxiv.org/abs/2304.06496v2","updated":"2024-08-30T00:47:06Z","published":"2023-03-27T12:02:33Z","title":"EEGMatch: Learning with Incomplete Labels for Semi-Supervised EEG-based\n Cross-Subject Emotion Recognition","summary":" Electroencephalography (EEG) is an objective tool for emotion recognition and\nshows promising performance. 
However, the label scarcity problem is a main\nchallenge in this field, which limits the wide application of EEG-based emotion\nrecognition. In this paper, we propose a novel semi-supervised learning\nframework (EEGMatch) to leverage both labeled and unlabeled EEG data. First, an\nEEG-Mixup based data augmentation method is developed to generate more valid\nsamples for model learning. Second, a semi-supervised two-step pairwise\nlearning method is proposed to bridge prototype-wise and instance-wise pairwise\nlearning, where the prototype-wise pairwise learning measures the global\nrelationship between EEG data and the prototypical representation of each\nemotion class and the instance-wise pairwise learning captures the local\nintrinsic relationship among EEG data. Third, a semi-supervised multi-domain\nadaptation is introduced to align the data representation among multiple\ndomains (labeled source domain, unlabeled source domain, and target domain),\nwhere the distribution mismatch is alleviated. Extensive experiments are\nconducted on two benchmark databases (SEED and SEED-IV) under a cross-subject\nleave-one-subject-out cross-validation evaluation protocol. The results show\nthe proposed EEGmatch performs better than the state-of-the-art methods under\ndifferent incomplete label conditions (with 6.89% improvement on SEED and 1.44%\nimprovement on SEED-IV), which demonstrates the effectiveness of the proposed\nEEGMatch in dealing with the label scarcity problem in emotion recognition\nusing EEG signals. The source code is available at\nhttps://github.com/KAZABANA/EEGMatch.\n","authors":["Rushuang Zhou","Weishan Ye","Zhiguo Zhang","Yanyang Luo","Li Zhang","Linling Li","Gan Huang","Yining Dong","Yuan-Ting Zhang","Zhen Liang"],"pdf_url":"https://arxiv.org/pdf/2304.06496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16947v1","updated":"2024-08-30T00:06:29Z","published":"2024-08-30T00:06:29Z","title":"An Empirical Study of Scaling Laws for Transfer","summary":" We present a limited empirical study of scaling laws for transfer learning in\ntransformer models. More specifically, we examine a scaling law that\nincorporates a \"transfer gap\" term, indicating the effectiveness of\npre-training on one distribution when optimizing for downstream performance on\nanother distribution. When the transfer gap is low, pre-training is a\ncost-effective strategy for improving downstream performance. Conversely, when\nthe gap is high, collecting high-quality fine-tuning data becomes relatively\nmore cost effective. Fitting the scaling law to experiments from diverse\ndatasets reveals significant variations in the transfer gap across\ndistributions. In theory, the scaling law can inform optimal data allocation\nstrategies and highlights how the scarcity of downstream data can bottleneck\nperformance. Our findings contribute to a principled way to measure transfer\nlearning efficiency and understand how data availability affects capabilities.\n","authors":["Matthew Barnett"],"pdf_url":"https://arxiv.org/pdf/2408.16947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.05958v3","updated":"2024-08-30T13:35:31Z","published":"2021-06-10T17:54:21Z","title":"High Probability Complexity Bounds for Non-Smooth Stochastic\n Optimization with Heavy-Tailed Noise","summary":" Stochastic first-order methods are standard for training large-scale machine\nlearning models. 
Random behavior may cause a particular run of an algorithm to\nresult in a highly suboptimal objective value, whereas theoretical guarantees\nare usually proved for the expectation of the objective value. Thus, it is\nessential to theoretically guarantee that algorithms provide small objective\nresidual with high probability. Existing methods for non-smooth stochastic\nconvex optimization have complexity bounds with the dependence on the\nconfidence level that is either negative-power or logarithmic but under an\nadditional assumption of sub-Gaussian (light-tailed) noise distribution that\nmay not hold in practice. In our paper, we resolve this issue and derive the\nfirst high-probability convergence results with logarithmic dependence on the\nconfidence level for non-smooth convex stochastic optimization problems with\nnon-sub-Gaussian (heavy-tailed) noise. To derive our results, we propose novel\nstepsize rules for two stochastic methods with gradient clipping. Moreover, our\nanalysis works for generalized smooth objectives with H\\\"older-continuous\ngradients, and for both methods, we provide an extension for strongly convex\nproblems. Finally, our results imply that the first (accelerated) method we\nconsider also has optimal iteration and oracle complexity in all the regimes,\nand the second one is optimal in the non-smooth setting.\n","authors":["Eduard Gorbunov","Marina Danilova","Innokentiy Shibaev","Pavel Dvurechensky","Alexander Gasnikov"],"pdf_url":"https://arxiv.org/pdf/2106.05958v3.pdf","comment":"61 pages, 12 figures. Changes in V2: different presentation of the\n results, different structure, new experiments. Changes in V3: some typos were\n fixed"},{"id":"http://arxiv.org/abs/2107.04857v2","updated":"2024-08-30T10:43:08Z","published":"2021-07-10T15:14:19Z","title":"Dense-Sparse Deep Convolutional Neural Networks Training for Image\n Denoising","summary":" Recently, deep learning methods such as the convolutional neural networks\nhave gained prominence in the area of image denoising. This is owing to their\nproven ability to surpass state-of-the-art classical image denoising algorithms\nsuch as block-matching and 3D filtering algorithm. Deep denoising convolutional\nneural networks use many feed-forward convolution layers with added\nregularization methods of batch normalization and residual learning to speed up\ntraining and improve denoising performance significantly. However, this comes\nat the expense of a huge number of trainable parameters. In this paper, we show\nthat by employing an enhanced dense-sparse-dense network training procedure to\nthe deep denoising convolutional neural networks, comparable denoising\nperformance level can be achieved at a significantly reduced number of\ntrainable parameters. We derive motivation from the fact that networks trained\nusing the dense-sparse-dense approach have been shown to attain performance\nboost with reduced number of parameters. The proposed reduced deep denoising\nconvolutional neural networks network is an efficient denoising model with\nsignificantly reduced parameters and comparable performance to the deep\ndenoising convolutional neural networks. Additionally, denoising was achieved\nat significantly reduced processing time.\n","authors":["Basit O. 
Alawode","Mudassir Masood"],"pdf_url":"https://arxiv.org/pdf/2107.04857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03886v2","updated":"2024-08-30T23:40:05Z","published":"2023-12-06T20:24:17Z","title":"On The Fairness Impacts of Hardware Selection in Machine Learning","summary":" In the machine learning ecosystem, hardware selection is often regarded as a\nmere utility, overshadowed by the spotlight on algorithms and data. This\noversight is particularly problematic in contexts like ML-as-a-service\nplatforms, where users often lack control over the hardware used for model\ndeployment. How does the choice of hardware impact generalization properties?\nThis paper investigates the influence of hardware on the delicate balance\nbetween model performance and fairness. We demonstrate that hardware choices\ncan exacerbate existing disparities, attributing these discrepancies to\nvariations in gradient flows and loss surfaces across different demographic\ngroups. Through both theoretical and empirical analysis, the paper not only\nidentifies the underlying factors but also proposes an effective strategy for\nmitigating hardware-induced performance imbalances.\n","authors":["Sree Harsha Nelaturu","Nishaanth Kanna Ravichandran","Cuong Tran","Sara Hooker","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2312.03886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00012v4","updated":"2024-08-30T22:49:32Z","published":"2023-06-21T19:34:16Z","title":"FlakyFix: Using Large Language Models for Predicting Flaky Test Fix\n Categories and Test Code Repair","summary":" Flaky tests are problematic because they non-deterministically pass or fail\nfor the same software version under test, causing confusion and wasting\ndevelopment effort. While machine learning models have been used to predict\nflakiness and its root causes, there is much less work on providing support to\nfix the problem. To address this gap, in this paper, we focus on predicting the\ntype of fix that is required to remove flakiness and then repair the test code\non that basis. We do this for a subset of flaky tests where the root cause of\nflakiness is in the test itself and not in the production code. One key idea is\nto guide the repair process with additional knowledge about the test's\nflakiness in the form of its predicted fix category. Thus, we first propose a\nframework that automatically generates labeled datasets for 13 fix categories\nand trains models to predict the fix category of a flaky test by analyzing the\ntest code only. Our experimental results using code models and few-shot\nlearning show that we can correctly predict most of the fix categories. To show\nthe usefulness of such fix category labels for automatically repairing\nflakiness, we augment the prompts of GPT-3.5 Turbo, a Large Language Model\n(LLM), with such extra knowledge to request repair suggestions. The results\nshow that our suggested fix category labels, complemented with in-context\nlearning, significantly enhance the capability of GPT-3.5 Turbo in generating\nfixes for flaky tests. Based on the execution and analysis of a sample of\nGPT-repaired flaky tests, we estimate that a large percentage of such repairs\n(roughly between 51% and 83%) can be expected to pass. 
For the failing repaired\ntests, on average, 16% of the test code needs to be further changed for them to\npass.\n","authors":["Sakina Fatima","Hadi Hemmati","Lionel Briand"],"pdf_url":"https://arxiv.org/pdf/2307.00012v4.pdf","comment":"26 pages, 20 Figures"},{"id":"http://arxiv.org/abs/2407.14056v2","updated":"2024-08-30T22:46:55Z","published":"2024-07-19T06:33:10Z","title":"Rasa: Building Expressive Speech Synthesis Systems for Indian Languages\n in Low-resource Settings","summary":" We release Rasa, the first multilingual expressive TTS dataset for any Indian\nlanguage, which contains 10 hours of neutral speech and 1-3 hours of expressive\nspeech for each of the 6 Ekman emotions covering 3 languages: Assamese,\nBengali, & Tamil. Our ablation studies reveal that just 1 hour of neutral and\n30 minutes of expressive data can yield a Fair system as indicated by MUSHRA\nscores. Increasing neutral data to 10 hours, with minimal expressive data,\nsignificantly enhances expressiveness. This offers a practical recipe for\nresource-constrained languages, prioritizing easily obtainable neutral data\nalongside smaller amounts of expressive data. We show the importance of\nsyllabically balanced data and pooling emotions to enhance expressiveness. We\nalso highlight challenges in generating specific emotions, e.g., fear and\nsurprise.\n","authors":["Praveen Srinivasa Varadhan","Ashwin Sankar","Giri Raju","Mitesh M. Khapra"],"pdf_url":"https://arxiv.org/pdf/2407.14056v2.pdf","comment":"Accepted at INTERSPEECH 2024. First two authors listed contributed\n equally"},{"id":"http://arxiv.org/abs/2311.11180v2","updated":"2024-08-30T21:55:16Z","published":"2023-11-18T23:06:33Z","title":"Nonsmooth Projection-Free Optimization with Functional Constraints","summary":" This paper presents a subgradient-based algorithm for constrained nonsmooth\nconvex optimization that does not require projections onto the feasible set.\nWhile the well-established Frank-Wolfe algorithm and its variants already avoid\nprojections, they are primarily designed for smooth objective functions. In\ncontrast, our proposed algorithm can handle nonsmooth problems with general\nconvex functional inequality constraints. It achieves an $\\epsilon$-suboptimal\nsolution in $\\mathcal{O}(\\epsilon^{-2})$ iterations, with each iteration\nrequiring only a single (potentially inexact) Linear Minimization Oracle (LMO)\ncall and a (possibly inexact) subgradient computation. This performance is\nconsistent with existing lower bounds. Similar performance is observed when\ndeterministic subgradients are replaced with stochastic subgradients. In the\nspecial case where there are no functional inequality constraints, our\nalgorithm competes favorably with a recent nonsmooth projection-free method\ndesigned for constraint-free problems. Our approach utilizes a simple\nseparation scheme in conjunction with a new Lagrange multiplier update rule.\n","authors":["Kamiar Asgari","Michael J. Neely"],"pdf_url":"https://arxiv.org/pdf/2311.11180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05746v2","updated":"2024-08-30T21:51:31Z","published":"2024-04-03T14:33:23Z","title":"Causality for Earth Science -- A Review on Time-series and\n Spatiotemporal Causality Methods","summary":" This survey paper covers the breadth and depth of time-series and\nspatiotemporal causality methods, and their applications in Earth Science. 
More\nspecifically, the paper presents an overview of causal discovery and causal\ninference, explains the underlying causal assumptions, and enlists evaluation\ntechniques and key terminologies of the domain area. The paper elicits the\nvarious state-of-the-art methods introduced for time-series and spatiotemporal\ncausal analysis along with their strengths and limitations. The paper further\ndescribes the existing applications of several methods for answering specific\nEarth Science questions such as extreme weather events, sea level rise,\nteleconnections etc. This survey paper can serve as a primer for Data Science\nresearchers interested in data-driven causal study as we share a list of\nresources, such as Earth Science datasets (synthetic, simulated and\nobservational data) and open source tools for causal analysis. It will equally\nbenefit the Earth Science community interested in taking an AI-driven approach\nto study the causality of different dynamic and thermodynamic processes as we\npresent the open challenges and opportunities in performing causality-based\nEarth Science study.\n","authors":["Sahara Ali","Uzma Hasan","Xingyan Li","Omar Faruque","Akila Sampath","Yiyi Huang","Md Osman Gani","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09950v2","updated":"2024-08-30T21:17:11Z","published":"2024-07-13T17:15:23Z","title":"PSO Fuzzy XGBoost Classifier Boosted with Neural Gas Features on EEG\n Signals in Emotion Recognition","summary":" Emotion recognition is the technology-driven process of identifying and\ncategorizing human emotions from various data sources, such as facial\nexpressions, voice patterns, body motion, and physiological signals, such as\nEEG. These physiological indicators, though rich in data, present challenges\ndue to their complexity and variability, necessitating sophisticated feature\nselection and extraction methods. NGN, an unsupervised learning algorithm,\neffectively adapts to input spaces without predefined grid structures,\nimproving feature extraction from physiological data. Furthermore, the\nincorporation of fuzzy logic enables the handling of fuzzy data by introducing\nreasoning that mimics human decision-making. The combination of PSO with\nXGBoost aids in optimizing model performance through efficient hyperparameter\ntuning and decision process optimization. This study explores the integration\nof Neural-Gas Network (NGN), XGBoost, Particle Swarm Optimization (PSO), and\nfuzzy logic to enhance emotion recognition using physiological signals. Our\nresearch addresses three critical questions concerning the improvement of\nXGBoost with PSO and fuzzy logic, NGN's effectiveness in feature selection, and\nthe performance comparison of the PSO-fuzzy XGBoost classifier with standard\nbenchmarks. 
Acquired results indicate that our methodologies enhance the\naccuracy of emotion recognition systems and outperform other feature selection\ntechniques using the majority of classifiers, offering significant implications\nfor both theoretical advancement and practical application in emotion\nrecognition technology.\n","authors":["Seyed Muhammad Hossein Mousavi"],"pdf_url":"https://arxiv.org/pdf/2407.09950v2.pdf","comment":"PSO, Fuzzy, XGBoost, Neural Gas Network (NGN), Feature Selection, EEG\n Signals, Emotion Recognition"},{"id":"http://arxiv.org/abs/2408.04104v2","updated":"2024-08-30T21:01:59Z","published":"2024-08-07T21:45:01Z","title":"Hardware-Assisted Virtualization of Neural Processing Units for Cloud\n Platforms","summary":" Cloud platforms today have been deploying hardware accelerators like neural\nprocessing units (NPUs) for powering machine learning (ML) inference services.\nTo maximize the resource utilization while ensuring reasonable quality of\nservice, a natural approach is to virtualize NPUs for efficient resource\nsharing for multi-tenant ML services. However, virtualizing NPUs for modern\ncloud platforms is not easy. This is not only due to the lack of system\nabstraction support for NPU hardware, but also due to the lack of architectural\nand ISA support for enabling fine-grained dynamic operator scheduling for\nvirtualized NPUs.\n We present Neu10, a holistic NPU virtualization framework. We investigate\nvirtualization techniques for NPUs across the entire software and hardware\nstack. Neu10 consists of (1) a flexible NPU abstraction called vNPU, which\nenables fine-grained virtualization of the heterogeneous compute units in a\nphysical NPU (pNPU); (2) a vNPU resource allocator that enables pay-as-you-go\ncomputing model and flexible vNPU-to-pNPU mappings for improved resource\nutilization and cost-effectiveness; (3) an ISA extension of modern NPU\narchitecture for facilitating fine-grained tensor operator scheduling for\nmultiple vNPUs. We implement Neu10 based on a production-level NPU simulator.\nOur experiments show that Neu10 improves the throughput of ML inference\nservices by up to 1.4$\\times$ and reduces the tail latency by up to\n4.6$\\times$, while improving the NPU utilization by 1.2$\\times$ on average,\ncompared to state-of-the-art NPU sharing approaches.\n","authors":["Yuqi Xue","Yiqi Liu","Lifeng Nai","Jian Huang"],"pdf_url":"https://arxiv.org/pdf/2408.04104v2.pdf","comment":"Accepted to MICRO'24"},{"id":"http://arxiv.org/abs/2408.13402v2","updated":"2024-08-30T20:57:39Z","published":"2024-08-23T23:00:19Z","title":"LLaVaOLMoBitnet1B: Ternary LLM goes Multimodal!","summary":" Multimodal Large Language Models (MM-LLMs) have seen significant advancements\nin the last year, demonstrating impressive performance across tasks. However,\nto truly democratize AI, models must exhibit strong capabilities and be able to\nrun efficiently on small compute footprints accessible by most. Part of this\nquest, we introduce LLaVaOLMoBitnet1B - the first Ternary Multimodal LLM\ncapable of accepting Image(s)+Text inputs to produce coherent textual\nresponses. The model is fully open-sourced along with training scripts to\nencourage further research in this space. This accompanying technical report\nhighlights the training process, evaluation details, challenges associated with\nternary models and future opportunities. 
Link to the model:\nhttps://huggingface.co/IntelLabs/LlavaOLMoBitnet1B\n","authors":["Jainaveen Sundaram","Ravi Iyer"],"pdf_url":"https://arxiv.org/pdf/2408.13402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02946v2","updated":"2024-08-30T20:22:18Z","published":"2024-08-06T04:14:29Z","title":"Scaling Laws for Data Poisoning in LLMs","summary":" Recent work shows that LLMs are vulnerable to data poisoning, in which they\nare trained on partially corrupted or harmful data. Poisoned data is hard to\ndetect, breaks guardrails, and leads to undesirable and harmful behavior. Given\nthe intense efforts by leading labs to train and deploy increasingly larger and\nmore capable LLMs, it is critical to ask if the risk of data poisoning will be\nnaturally mitigated by scale, or if it is an increasing threat. We consider\nthree threat models by which data poisoning can occur: malicious fine-tuning,\nimperfect data curation, and intentional data contamination. Our experiments\nevaluate the effects of data poisoning on 23 frontier LLMs ranging from 1.5-72\nbillion parameters on three datasets which speak to each of our threat models.\nWe find that larger LLMs are increasingly vulnerable, learning harmful behavior\nsignificantly more quickly than smaller LLMs with even minimal data poisoning.\nThese results underscore the need for robust safeguards against data poisoning\nin larger LLMs.\n","authors":["Dillon Bowen","Brendan Murphy","Will Cai","David Khachaturov","Adam Gleave","Kellin Pelrine"],"pdf_url":"https://arxiv.org/pdf/2408.02946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10204v2","updated":"2024-08-30T20:19:10Z","published":"2024-08-19T17:58:03Z","title":"Criticality Leveraged Adversarial Training (CLAT) for Boosted\n Performance via Parameter Efficiency","summary":" Adversarial training enhances neural network robustness but suffers from a\ntendency to overfit and increased generalization errors on clean data. This\nwork introduces CLAT, an innovative approach that mitigates adversarial\noverfitting by introducing parameter efficiency into the adversarial training\nprocess, improving both clean accuracy and adversarial robustness. Instead of\ntuning the entire model, CLAT identifies and fine-tunes robustness-critical\nlayers - those predominantly learning non-robust features - while freezing the\nremaining model to enhance robustness. It employs dynamic critical layer\nselection to adapt to changes in layer criticality throughout the fine-tuning\nprocess. 
Empirically, CLAT can be applied on top of existing adversarial\ntraining methods, significantly reduces the number of trainable parameters by\napproximately 95%, and achieves more than a 2% improvement in adversarial\nrobustness compared to baseline methods.\n","authors":["Bhavna Gopal","Huanrui Yang","Jingyang Zhang","Mark Horton","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10204v2.pdf","comment":"9 pages + appendix/ additional experiments"},{"id":"http://arxiv.org/abs/2406.11799v2","updated":"2024-08-30T20:10:51Z","published":"2024-06-17T17:47:44Z","title":"Mix-Domain Contrastive Learning for Unpaired H&E-to-IHC Stain\n Translation","summary":" H&E-to-IHC stain translation techniques offer a promising solution for\nprecise cancer diagnosis, especially in low-resource regions where there is a\nshortage of health professionals and limited access to expensive equipment.\nConsidering the pixel-level misalignment of H&E-IHC image pairs, current\nresearch explores the pathological consistency between patches from the same\npositions of the image pair. However, most of them overemphasize the\ncorrespondence between domains or patches, overlooking the side information\nprovided by the non-corresponding objects. In this paper, we propose a\nMix-Domain Contrastive Learning (MDCL) method to leverage the supervision\ninformation in unpaired H&E-to-IHC stain translation. Specifically, the\nproposed MDCL method aggregates the inter-domain and intra-domain pathology\ninformation by estimating the correlation between the anchor patch and all the\npatches from the matching images, encouraging the network to learn additional\ncontrastive knowledge from mixed domains. With the mix-domain pathology\ninformation aggregation, MDCL enhances the pathological consistency between the\ncorresponding patches and the component discrepancy of the patches from the\ndifferent positions of the generated IHC image. Extensive experiments on two\nH&E-to-IHC stain translation datasets, namely MIST and BCI, demonstrate that\nthe proposed method achieves state-of-the-art performance across multiple\nmetrics.\n","authors":["Song Wang","Zhong Zhang","Huan Yan","Ming Xu","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2406.11799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04634v4","updated":"2024-08-30T19:58:02Z","published":"2024-05-07T19:37:22Z","title":"FRACTAL: An Ultra-Large-Scale Aerial Lidar Dataset for 3D Semantic\n Segmentation of Diverse Landscapes","summary":" Mapping agencies are increasingly adopting Aerial Lidar Scanning (ALS) as a\nnew tool to map buildings and other above-ground structures. Processing ALS\ndata at scale requires efficient point classification methods that perform well\nover highly diverse territories. Large annotated Lidar datasets are needed to\nevaluate these classification methods, however, current Lidar benchmarks have\nrestricted scope and often cover a single urban area. To bridge this data gap,\nwe introduce the FRench ALS Clouds from TArgeted Landscapes (FRACTAL) dataset:\nan ultra-large-scale aerial Lidar dataset made of 100,000 dense point clouds\nwith high quality labels for 7 semantic classes and spanning 250 km$^2$.\nFRACTAL achieves high spatial and semantic diversity by explicitly sampling\nrare classes and challenging landscapes from five different regions of France.\nWe describe the data collection, annotation, and curation process of the\ndataset. 
We provide baseline semantic segmentation results using a state of the\nart 3D point cloud classification model. FRACTAL aims to support the\ndevelopment of 3D deep learning approaches for large-scale land monitoring.\n","authors":["Charles Gaydon","Michel Daab","Floryne Roche"],"pdf_url":"https://arxiv.org/pdf/2405.04634v4.pdf","comment":"9 (body) + 2 (bibliography) + 8 (appendices) pages | Dataset is\n available at https://huggingface.co/datasets/IGNF/FRACTAL | Trained model is\n available at https://huggingface.co/IGNF/FRACTAL-LidarHD_7cl_randlanet | Deep\n learning code repository is on Gihtub at https://github.com/IGNF/myria3d |\n Data engineering code repository is on Github at\n https://github.com/IGNF/pacasam"},{"id":"http://arxiv.org/abs/2310.15111v2","updated":"2024-08-30T19:21:36Z","published":"2023-10-23T17:20:01Z","title":"Matryoshka Diffusion Models","summary":" Diffusion models are the de facto approach for generating high-quality images\nand videos, but learning high-dimensional models remains a formidable task due\nto computational and optimization challenges. Existing methods often resort to\ntraining cascaded models in pixel space or using a downsampled latent space of\na separately trained auto-encoder. In this paper, we introduce Matryoshka\nDiffusion Models(MDM), an end-to-end framework for high-resolution image and\nvideo synthesis. We propose a diffusion process that denoises inputs at\nmultiple resolutions jointly and uses a NestedUNet architecture where features\nand parameters for small-scale inputs are nested within those of large scales.\nIn addition, MDM enables a progressive training schedule from lower to higher\nresolutions, which leads to significant improvements in optimization for\nhigh-resolution generation. We demonstrate the effectiveness of our approach on\nvarious benchmarks, including class-conditioned image generation,\nhigh-resolution text-to-image, and text-to-video applications. Remarkably, we\ncan train a single pixel-space model at resolutions of up to 1024x1024 pixels,\ndemonstrating strong zero-shot generalization using the CC12M dataset, which\ncontains only 12 million images. Our code is released at\nhttps://github.com/apple/ml-mdm\n","authors":["Jiatao Gu","Shuangfei Zhai","Yizhe Zhang","Josh Susskind","Navdeep Jaitly"],"pdf_url":"https://arxiv.org/pdf/2310.15111v2.pdf","comment":"Accepted by ICLR2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.17057v1","updated":"2024-08-30T07:32:19Z","published":"2024-08-30T07:32:19Z","title":"LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality\n Assessment Model","summary":" Recent advancements in the field of No-Reference Image Quality Assessment\n(NR-IQA) using deep learning techniques demonstrate high performance across\nmultiple open-source datasets. However, such models are typically very large\nand complex making them not so suitable for real-world deployment, especially\non resource- and battery-constrained mobile devices. To address this\nlimitation, we propose a compact, lightweight NR-IQA model that achieves\nstate-of-the-art (SOTA) performance on ECCV AIM UHD-IQA challenge validation\nand test datasets while being also nearly 5.7 times faster than the fastest\nSOTA model. Our model features a dual-branch architecture, with each branch\nseparately trained on synthetically and authentically distorted images which\nenhances the model's generalizability across different distortion types. 
To\nimprove robustness under diverse real-world visual conditions, we additionally\nincorporate multiple color spaces during the training process. We also\ndemonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks\n(KANs) for final quality regression as compared to the conventional Multi-Layer\nPerceptrons (MLPs). Our evaluation considering various open-source datasets\nhighlights the practical, high-accuracy, and robust performance of our proposed\nlightweight model. Code: https://github.com/nasimjamshidi/LAR-IQA.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildiyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.17057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16990v1","updated":"2024-08-30T03:36:22Z","published":"2024-08-30T03:36:22Z","title":"Video to Music Moment Retrieval","summary":" Adding proper background music helps complete a short video to be shared.\nTowards automating the task, previous research focuses on video-to-music\nretrieval (VMR), aiming to find amidst a collection of music the one best\nmatching the content of a given video. Since music tracks are typically much\nlonger than short videos, meaning the returned music has to be cut to a shorter\nmoment, there is a clear gap between the practical need and VMR. In order to\nbridge the gap, we propose in this paper video to music moment retrieval (VMMR)\nas a new task. To tackle the new task, we build a comprehensive dataset\nAd-Moment which contains 50K short videos annotated with music moments and\ndevelop a two-stage approach. In particular, given a test video, the most\nsimilar music is retrieved from a given collection. Then, a Transformer based\nmusic moment localization is performed. We term this approach Retrieval and\nLocalization (ReaL). Extensive experiments on real-world datasets verify the\neffectiveness of the proposed method for VMMR.\n","authors":["Zijie Xin","Minquan Wang","Ye Ma","Bo Wang","Quan Chen","Peng Jiang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2408.16990v1.pdf","comment":null}]},"2024-09-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.09075v2","updated":"2024-09-03T16:47:09Z","published":"2024-08-17T02:26:29Z","title":"Improving Rare Word Translation With Dictionaries and Attention Masking","summary":" In machine translation, rare words continue to be a problem for the dominant\nencoder-decoder architecture, especially in low-resource and out-of-domain\ntranslation settings. Human translators solve this problem with monolingual or\nbilingual dictionaries. In this paper, we propose appending definitions from a\nbilingual dictionary to source sentences and using attention masking to link\ntogether rare words with their definitions. We find that including definitions\nfor rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1.\n","authors":["Kenneth J. Sible","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2408.09075v2.pdf","comment":"11 pages, 3 figures, 3 tables. Accepted at AMTA 2024"},{"id":"http://arxiv.org/abs/2406.06385v3","updated":"2024-09-03T16:36:06Z","published":"2024-06-10T15:44:22Z","title":"Low-Rank Quantization-Aware Training for LLMs","summary":" Large language models (LLMs) are omnipresent, however their practical\ndeployment is challenging due to their ever increasing computational and memory\ndemands. Quantization is one of the most effective ways to make them more\ncompute and memory efficient. 
Quantization-aware training (QAT) methods,\ngenerally produce the best quantized performance, however it comes at the cost\nof potentially long training time and excessive memory usage, making it\nimpractical when applying for LLMs. Inspired by parameter-efficient fine-tuning\n(PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a\nlightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several\ncomponents to save memory without sacrificing predictive performance: (a)\nlow-rank auxiliary weights that are aware of the quantization grid; (b) a\ndowncasting operator using fixed-point or double-packed integers and (c)\ncheckpointing. Unlike most related work, our method (i) is inference-efficient,\nleading to no additional overhead compared to traditional PTQ; (ii) can be seen\nas a general extended pretraining framework, meaning that the resulting model\ncan still be utilized for any downstream task afterwards; (iii) can be applied\nacross a wide range of quantization settings, such as different choices\nquantization granularity, activation quantization, and seamlessly combined with\nmany PTQ techniques. We apply LR-QAT to LLaMA-1/2/3 and Mistral model families\nand validate its effectiveness on several downstream tasks. Our method\noutperforms common post-training quantization (PTQ) approaches and reaches the\nsame model performance as full-model QAT at the fraction of its memory usage.\nSpecifically, we can train a 7B LLM on a single consumer grade GPU with 24GB of\nmemory. Our source code is available at\nhttps://github.com/qualcomm-ai-research/LR-QAT\n","authors":["Yelysei Bondarenko","Riccardo Del Chiaro","Markus Nagel"],"pdf_url":"https://arxiv.org/pdf/2406.06385v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14340v3","updated":"2024-09-03T14:53:34Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. 
The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wenhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17024v2","updated":"2024-09-03T13:55:01Z","published":"2024-08-30T05:42:31Z","title":"InkubaLM: A small language model for low-resource African languages","summary":" High-resource language models often fall short in the African context, where\nthere is a critical need for models that are efficient, accessible, and locally\nrelevant, even amidst significant computing and data constraints. This paper\nintroduces InkubaLM, a small language model with 0.4 billion parameters, which\nachieves performance comparable to models with significantly larger parameter\ncounts and more extensive training data on tasks such as machine translation,\nquestion-answering, AfriMMLU, and the AfriXnli task. Notably, InkubaLM\noutperforms many larger models in sentiment analysis and demonstrates\nremarkable consistency across multiple languages. This work represents a\npivotal advancement in challenging the conventional paradigm that effective\nlanguage models must rely on substantial resources. Our model and datasets are\npublicly available at https://huggingface.co/lelapa to encourage research and\ndevelopment on low-resource languages.\n","authors":["Atnafu Lambebo Tonja","Bonaventure F. P. Dossou","Jessica Ojo","Jenalea Rajab","Fadel Thior","Eric Peter Wairagala","Anuoluwapo Aremu","Pelonomi Moiloa","Jade Abbott","Vukosi Marivate","Benjamin Rosman"],"pdf_url":"https://arxiv.org/pdf/2408.17024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02031v8","updated":"2024-09-03T10:19:52Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reasons are the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever large language\nmodel in the ocean domain, which is expert in various ocean science tasks. We\nalso propose OceanGPT, a novel framework to automatically obtain a large volume\nof ocean domain instruction data, which generates instructions based on\nmulti-agent collaboration. 
Additionally, we construct the first oceanography\nbenchmark, OceanBench, to evaluate the capabilities of LLMs in the ocean\ndomain. Though comprehensive experiments, OceanGPT not only shows a higher\nlevel of knowledge expertise for oceans science tasks but also gains\npreliminary embodied intelligence capabilities in ocean technology.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Daxiong Ji","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v8.pdf","comment":"ACL2024. Project Website: http://oceangpt.zjukg.cn/"},{"id":"http://arxiv.org/abs/2312.01082v2","updated":"2024-09-03T07:59:58Z","published":"2023-12-02T09:20:10Z","title":"A Survey on Stability of Learning with Limited Labelled Data and its\n Sensitivity to the Effects of Randomness","summary":" Learning with limited labelled data, such as prompting, in-context learning,\nfine-tuning, meta-learning or few-shot learning, aims to effectively train a\nmodel using only a small amount of labelled samples. However, these approaches\nhave been observed to be excessively sensitive to the effects of uncontrolled\nrandomness caused by non-determinism in the training process. The randomness\nnegatively affects the stability of the models, leading to large variances in\nresults across training runs. When such sensitivity is disregarded, it can\nunintentionally, but unfortunately also intentionally, create an imaginary\nperception of research progress. Recently, this area started to attract\nresearch attention and the number of relevant studies is continuously growing.\nIn this survey, we provide a comprehensive overview of 415 papers addressing\nthe effects of randomness on the stability of learning with limited labelled\ndata. We distinguish between four main tasks addressed in the papers\n(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness\neffects), providing findings for each one. Furthermore, we identify and discuss\nseven challenges and open problems together with possible directions to\nfacilitate further research. The ultimate goal of this survey is to emphasise\nthe importance of this growing research area, which so far has not received an\nappropriate level of attention, and reveal impactful directions for future\nresearch.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2312.01082v2.pdf","comment":"Accepted to ACM Comput. Surv. 2024"},{"id":"http://arxiv.org/abs/2406.01252v3","updated":"2024-09-03T07:07:59Z","published":"2024-06-03T12:10:26Z","title":"Towards Scalable Automated Alignment of LLMs: A Survey","summary":" Alignment is the most critical step in building large language models (LLMs)\nthat meet human needs. With the rapid development of LLMs gradually surpassing\nhuman capabilities, traditional alignment methods based on human-annotation are\nincreasingly unable to meet the scalability demands. Therefore, there is an\nurgent need to explore new sources of automated alignment signals and technical\napproaches. In this paper, we systematically review the recently emerging\nmethods of automated alignment, attempting to explore how to achieve effective,\nscalable, automated alignment once the capabilities of LLMs exceed those of\nhumans. Specifically, we categorize existing automated alignment methods into 4\nmajor categories based on the sources of alignment signals and discuss the\ncurrent status and potential development of each category. 
Additionally, we\nexplore the underlying mechanisms that enable automated alignment and discuss\nthe essential factors that make automated alignment technologies feasible and\neffective from the fundamental role of alignment.\n","authors":["Boxi Cao","Keming Lu","Xinyu Lu","Jiawei Chen","Mengjie Ren","Hao Xiang","Peilin Liu","Yaojie Lu","Ben He","Xianpei Han","Le Sun","Hongyu Lin","Bowen Yu"],"pdf_url":"https://arxiv.org/pdf/2406.01252v3.pdf","comment":"Paper List: https://github.com/cascip/awesome-auto-alignment"},{"id":"http://arxiv.org/abs/2311.13110v3","updated":"2024-09-03T06:31:48Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v3.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes and formatting improvements. 
V3: improvements from\n journal revisions"},{"id":"http://arxiv.org/abs/2406.10203v2","updated":"2024-09-03T06:05:32Z","published":"2024-06-14T17:38:21Z","title":"A Fundamental Trade-off in Aligned Language Models and its Relation to\n Sampling Adaptors","summary":" The relationship between the quality of a string, as judged by a human\nreader, and its probability, $p(\\boldsymbol{y})$ under a language model\nundergirds the development of better language models. For example, many popular\nalgorithms for sampling from a language model have been conceived with the goal\nof manipulating $p(\\boldsymbol{y})$ to place higher probability on strings that\nhumans deem of high quality. In this article, we examine the\nprobability--quality relationship in language models explicitly aligned to\nhuman preferences, e.g., through reinforcement learning through human feedback.\nWe show that, when sampling corpora from an aligned language model, there\nexists a trade-off between the strings' average reward and average\nlog-likelihood under the prior language model, i.e., the same model before\nalignment with human preferences. We provide a formal treatment of this\nphenomenon and demonstrate how a choice of sampling adaptor allows for a\nselection of how much likelihood we exchange for the reward.\n","authors":["Naaman Tan","Josef Valvoda","Tianyu Liu","Anej Svete","Yanxia Qin","Kan Min-Yen","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2406.10203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11169v4","updated":"2024-09-03T05:51:40Z","published":"2024-03-17T10:59:09Z","title":"Correcting misinformation on social media with a large language model","summary":" Real-world misinformation, often multimodal, can be partially or fully\nfactual but misleading using diverse tactics like conflating correlation with\ncausation. Such misinformation is severely understudied, challenging to\naddress, and harms various social domains, particularly on social media, where\nit can spread rapidly. High-quality and timely correction of misinformation\nthat identifies and explains its (in)accuracies effectively reduces false\nbeliefs. Despite the wide acceptance of manual correction, it is difficult to\nbe timely and scalable. While LLMs have versatile capabilities that could\naccelerate misinformation correction, they struggle due to a lack of recent\ninformation, a tendency to produce false content, and limitations in addressing\nmultimodal information. We propose MUSE, an LLM augmented with access to and\ncredibility evaluation of up-to-date information. By retrieving evidence as\nrefutations or supporting context, MUSE identifies and explains content\n(in)accuracies with references. It conducts multimodal retrieval and interprets\nvisual content to verify and correct multimodal content. Given the absence of a\ncomprehensive evaluation approach, we propose 13 dimensions of misinformation\ncorrection quality. Then, fact-checking experts evaluate responses to social\nmedia content that are not presupposed to be misinformation but broadly include\n(partially) incorrect and correct posts that may (not) be misleading. Results\ndemonstrate MUSE's ability to write high-quality responses to potential\nmisinformation--across modalities, tactics, domains, political leanings, and\nfor information that has not previously been fact-checked online--within\nminutes of its appearance on social media. Overall, MUSE outperforms GPT-4 by\n37% and even high-quality responses from laypeople by 29%. 
Our work provides a\ngeneral methodological and evaluative framework to correct misinformation at\nscale.\n","authors":["Xinyi Zhou","Ashish Sharma","Amy X. Zhang","Tim Althoff"],"pdf_url":"https://arxiv.org/pdf/2403.11169v4.pdf","comment":"50 pages"},{"id":"http://arxiv.org/abs/2405.01481v2","updated":"2024-09-03T05:47:42Z","published":"2024-05-02T17:13:40Z","title":"NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment","summary":" Aligning Large Language Models (LLMs) with human values and preferences is\nessential for making them helpful and safe. However, building efficient tools\nto perform alignment can be challenging, especially for the largest and most\ncompetent LLMs which often contain tens or hundreds of billions of parameters.\nWe create NeMo-Aligner, a toolkit for model alignment that can efficiently\nscale to a thousand GPUs for training the largest open-source LLMs such as\nNemotron 4 340B and Llama 3.1 405B. NeMo-Aligner comes with highly optimized\nand scalable implementations for major paradigms of model alignment such as:\nReinforcement Learning from Human Feedback (RLHF), Direct Preference\nOptimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally,\nour toolkit supports running most of the alignment techniques in a Parameter\nEfficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for\nextensibility, allowing support for other alignment techniques with minimal\neffort. It is open-sourced with Apache 2.0 License and we invite community\ncontributions at https://github.com/NVIDIA/NeMo-Aligner\n","authors":["Gerald Shen","Zhilin Wang","Olivier Delalleau","Jiaqi Zeng","Yi Dong","Daniel Egert","Shengyang Sun","Jimmy Zhang","Sahil Jain","Ali Taghibakhshi","Markel Sanz Ausin","Ashwath Aithal","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2405.01481v2.pdf","comment":"16 pages, 4 figures, Accepted to COLM 2024"},{"id":"http://arxiv.org/abs/2408.15518v2","updated":"2024-09-03T04:38:16Z","published":"2024-08-28T04:06:14Z","title":"Squid: Long Context as a New Modality for Energy-Efficient On-Device\n Language Models","summary":" This paper presents Dolphin, a novel decoder-decoder architecture for\nenergy-efficient processing of long contexts in language models. Our approach\naddresses the significant energy consumption and latency challenges inherent in\non-device models. Dolphin employs a compact 0.5B parameter decoder to distill\nextensive contextual information into a memory embedding, substantially\nreducing the input length for the primary 7B parameter decoder model. Inspired\nby vision-language models, we repurpose the image embedding projector to encode\nlong textual contexts, effectively treating extended context as a distinct\nmodality. This innovative method enables processing of substantially longer\ncontexts without the typical computational overhead associated with extended\ninput sequences. Empirical evaluations demonstrate a 10-fold improvement in\nenergy efficiency and a 5-fold reduction in latency compared to conventional\nfull-length context processing methods without losing quality of the response.\nOur work contributes to the development of more sustainable and scalable\nlanguage models for on-device applications, addressing the critical need for\nenergy-efficient and responsive AI technologies in resource-constrained\nenvironments while maintaining the accuracy to understand long contexts. 
This\nresearch has implications for the broader field of natural language processing,\nparticularly in the domain of efficient model design for resource-limited\nsettings. By enabling more sophisticated AI capabilities on edge devices,\nDolphin paves the way for advanced language processing in a wide range of\napplications where computational resources are at a premium. The Dolphin model\nis publicly available at https://huggingface.co/NexaAIDev/Dolphin.\n","authors":["Wei Chen","Zhiyuan Li","Shuo Xin","Yihao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06576v4","updated":"2024-09-03T02:11:01Z","published":"2024-06-04T04:17:40Z","title":"OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step","summary":" Despite significant advancements in text generation and reasoning, Large\nLanguage Models (LLMs) still face challenges in accurately performing complex\narithmetic operations. Language model systems often enable LLMs to generate\ncode for arithmetic operations to achieve accurate calculations. However, this\napproach compromises speed and security, and fine-tuning risks the language\nmodel losing prior capabilities. We propose a framework that enables exact\narithmetic in a single autoregressive step, providing faster, more secure, and\nmore interpretable LLM systems with arithmetic capabilities. We use the hidden\nstates of a LLM to control a symbolic architecture that performs arithmetic.\nOur implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama)\nachieves 100\\% accuracy on single arithmetic operations\n($+,-,\\times,\\div,\\sin{},\\cos{},\\log{},\\exp{},\\sqrt{}$), outperforming GPT 4o\nwith and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o\nwith and without a code interpreter on average across a range of mathematical\nproblem solving benchmarks, demonstrating that OccamLLMs can excel in\narithmetic tasks, even surpassing much larger models. We will make our code\npublic shortly.\n","authors":["Owen Dugan","Donato Manuel Jimenez Beneto","Charlotte Loh","Zhuo Chen","Rumen Dangovski","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2406.06576v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05219v3","updated":"2024-09-03T02:10:16Z","published":"2024-07-07T00:43:05Z","title":"Flood of Techniques and Drought of Theories: Emotion Mining in Disasters","summary":" Emotion mining has become a crucial tool for understanding human emotions\nduring disasters, leveraging the extensive data generated on social media\nplatforms. This paper aims to summarize existing research on emotion mining\nwithin disaster contexts, highlighting both significant discoveries and\npersistent issues. On the one hand, emotion mining techniques have achieved\nacceptable accuracy enabling applications such as rapid damage assessment and\nmental health surveillance. On the other hand, with many studies adopting\ndata-driven approaches, several methodological issues remain. These include\narbitrary emotion classification, ignoring biases inherent in data collection\nfrom social media, such as the overrepresentation of individuals from higher\nsocioeconomic status on Twitter, and the lack of application of theoretical\nframeworks like cross-cultural comparisons. These problems can be summarized as\na notable lack of theory-driven research and ignoring insights from social and\nbehavioral sciences. 
This paper underscores the need for interdisciplinary\ncollaboration between computer scientists and social scientists to develop more\nrobust and theoretically grounded approaches in emotion mining. By addressing\nthese gaps, we aim to enhance the effectiveness and reliability of emotion\nmining methodologies, ultimately contributing to improved disaster\npreparedness, response, and recovery.\n Keywords: emotion mining, sentiment analysis, natural disasters, psychology,\ntechnological disasters\n","authors":["Soheil Shapouri","Saber Soleymani","Saed Rezayi"],"pdf_url":"https://arxiv.org/pdf/2407.05219v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11807v3","updated":"2024-09-03T01:14:30Z","published":"2024-03-18T14:04:47Z","title":"How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming\n Ability in Multi-Agent Environments","summary":" Decision-making, a complicated task requiring various types of abilities,\npresents an excellent framework for assessing Large Language Models (LLMs). Our\nresearch investigates decision-making capabilities of LLMs through the lens of\nGame Theory. We focus specifically on games that support the simultaneous\nparticipation of more than two agents. We introduce GAMA($\\gamma$)-Bench, which\nevaluates LLMs' Gaming Ability in Multi-Agent environments. $\\gamma$-Bench\nincludes eight classical multi-agent games and a scoring scheme specially\ndesigned to quantitatively assess LLMs' performance. Leveraging $\\gamma$-Bench,\nwe investigate LLMs' robustness, generalizability, and strategies for\nenhancement. Results reveal that while GPT-3.5 shows satisfying robustness, its\ngeneralizability is relatively limited. However, its performance can be\nimproved through approaches such as Chain-of-Thought. Additionally, we evaluate\ntwelve versions from six models, including GPT-3.5, GPT-4, Gemini, LLaMA-3.1,\nMixtral, and Qwen-2. We find that Gemini-1.5-Pro outperforms other models with\na score of $63.8$ out of $100$, followed by LLaMA-3.1-70B and GPT-4 with scores\nof $60.9$ and $60.5$, respectively. The code and experimental results are made\npublicly available via https://github.com/CUHK-ARISE/GAMABench.\n","authors":["Jen-tse Huang","Eric John Li","Man Ho Lam","Tian Liang","Wenxuan Wang","Youliang Yuan","Wenxiang Jiao","Xing Wang","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.11807v3.pdf","comment":"11 pages of main text. 20 pages of appendices. 12 figures, 9 tables.\n Added models: Gemini-1.5-Pro, LLaMA-3.1-{7, 70, 405}B, Mixtral-8x{7, 22}B,\n Qwen-2-72B"},{"id":"http://arxiv.org/abs/2406.16746v3","updated":"2024-09-03T23:03:41Z","published":"2024-06-24T15:55:49Z","title":"The Responsible Foundation Model Development Cheatsheet: A Review of\n Tools & Resources","summary":" Foundation model development attracts a rapidly expanding body of\ncontributors, scientists, and applications. To help shape responsible\ndevelopment practices, we introduce the Foundation Model Development\nCheatsheet: a growing collection of 250+ tools and resources spanning text,\nvision, and speech modalities. We draw on a large body of prior work to survey\nresources (e.g. 
software, documentation, frameworks, guides, and practical\ntools) that support informed data selection, processing, and understanding,\nprecise and limitation-aware artifact documentation, efficient model training,\nadvance awareness of the environmental impact from training, careful model\nevaluation of capabilities, risks, and claims, as well as responsible model\nrelease, licensing and deployment practices. We hope this curated collection of\nresources helps guide more responsible development. The process of curating\nthis list, enabled us to review the AI development ecosystem, revealing what\ntools are critically missing, misused, or over-used in existing practices. We\nfind that (i) tools for data sourcing, model evaluation, and monitoring are\ncritically under-serving ethical and real-world needs, (ii) evaluations for\nmodel safety, capabilities, and environmental impact all lack reproducibility\nand transparency, (iii) text and particularly English-centric analyses continue\nto dominate over multilingual and multi-modal analyses, and (iv) evaluation of\nsystems, rather than just models, is needed so that capabilities and impact are\nassessed in context.\n","authors":["Shayne Longpre","Stella Biderman","Alon Albalak","Hailey Schoelkopf","Daniel McDuff","Sayash Kapoor","Kevin Klyman","Kyle Lo","Gabriel Ilharco","Nay San","Maribeth Rauh","Aviya Skowron","Bertie Vidgen","Laura Weidinger","Arvind Narayanan","Victor Sanh","David Adelani","Percy Liang","Rishi Bommasani","Peter Henderson","Sasha Luccioni","Yacine Jernite","Luca Soldaini"],"pdf_url":"https://arxiv.org/pdf/2406.16746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02326v1","updated":"2024-09-03T22:36:42Z","published":"2024-09-03T22:36:42Z","title":"Arctic-SnowCoder: Demystifying High-Quality Data in Code Pretraining","summary":" Recent studies have been increasingly demonstrating that high-quality data is\ncrucial for effective pretraining of language models. However, the precise\ndefinition of \"high-quality\" remains underexplored. Focusing on the code\ndomain, we introduce Arctic-SnowCoder-1.3B, a data-efficient base code model\npretrained on 555B tokens through three phases of progressively refined data:\n(1) general pretraining with 500B standard-quality code tokens, preprocessed\nthrough basic filtering, deduplication, and decontamination, (2) continued\npretraining with 50B high-quality tokens, selected from phase one by a\nBERT-style quality annotator trained to distinguish good code from random data,\nusing positive examples drawn from high-quality code files, along with\ninstruction data from Magicoder and StarCoder2-Instruct, and (3) enhanced\npretraining with 5B synthetic data created by Llama-3.1-70B using phase two\ndata as seeds, adapting the Magicoder approach for pretraining. Despite being\ntrained on a limited dataset, Arctic-SnowCoder achieves state-of-the-art\nperformance on BigCodeBench, a coding benchmark focusing on practical and\nchallenging programming tasks, compared to similarly sized models trained on no\nmore than 1T tokens, outperforming Phi-1.5-1.3B by 36%. Across all evaluated\nbenchmarks, Arctic-SnowCoder-1.3B beats StarCoderBase-3B pretrained on 1T\ntokens. Additionally, it matches the performance of leading small base code\nmodels trained on trillions of tokens. For example, Arctic-SnowCoder-1.3B\nsurpasses StarCoder2-3B, pretrained on over 3.3T tokens, on HumanEval+, a\nbenchmark that evaluates function-level code generation, and remains\ncompetitive on BigCodeBench. 
Our evaluation presents a comprehensive analysis\njustifying various design choices for Arctic-SnowCoder. Most importantly, we\nfind that the key to high-quality data is its alignment with the distribution\nof downstream applications.\n","authors":["Yuxiang Wei","Hojae Han","Rajhans Samdani"],"pdf_url":"https://arxiv.org/pdf/2409.02326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14452v3","updated":"2024-09-03T22:06:17Z","published":"2023-03-25T12:12:30Z","title":"COFFEE: A Contrastive Oracle-Free Framework for Event Extraction","summary":" Event extraction is a complex information extraction task that involves\nextracting events from unstructured text. Prior classification-based methods\nrequire comprehensive entity annotations for joint training, while newer\ngeneration-based methods rely on heuristic templates containing oracle\ninformation such as event type, which is often unavailable in real-world\nscenarios. In this study, we consider a more realistic setting of this task,\nnamely the Oracle-Free Event Extraction (OFEE) task, where only the input\ncontext is given without any oracle information, including event type, event\nontology and trigger word. To solve this task, we propose a new framework,\ncalled COFFEE, which extracts the events solely based on the document context\nwithout referring to any oracle information. In particular, a contrastive\nselection model is introduced in COFFEE to rectify the generated triggers and\nhandle multi-event instances. The proposed COFFEE outperforms state-of-the-art\napproaches under the oracle-free setting of the event extraction task, as\nevaluated on a public event extraction benchmark ACE05.\n","authors":["Meiru Zhang","Yixuan Su","Zaiqiao Meng","Zihao Fu","Nigel Collier"],"pdf_url":"https://arxiv.org/pdf/2303.14452v3.pdf","comment":"Accepted to MATCHING Workshop at ACL 2023"},{"id":"http://arxiv.org/abs/2403.06023v2","updated":"2024-09-03T20:09:57Z","published":"2024-03-09T22:18:26Z","title":"Persian Slang Text Conversion to Formal and Deep Learning of Persian\n Short Texts on Social Media for Sentiment Classification","summary":" The lack of a suitable tool for the analysis of conversational texts in the\nPersian language has made various analyses of these texts, including Sentiment\nAnalysis, difficult. In this research, we tried to make the understanding of\nthese texts easier for the machine by providing PSC, Persian Slang Converter, a\ntool for converting conversational texts into formal ones, and by using the\nmost up-to-date and best deep learning methods along with the PSC, the\nsentiment learning of short Persian language texts can be made better for the\nmachine. More than 10 million unlabeled texts from various social networks\nand movie subtitles (as Conversational texts) and about 10 million news texts\n(as formal texts) have been used for training unsupervised models and formal\nimplementation of the tool. 60,000 texts from the comments of Instagram social\nnetwork users with positive, negative, and neutral labels are considered\nsupervised data for training the emotion classification model of short texts.\nUsing the formal tool, 57% of the words of the corpus of conversation were\nconverted.
Finally, by using the formalizer, FastText model, and deep LSTM\nnetwork, an accuracy of 81.91 was obtained on the test data.\n","authors":["Mohsen Khazeni","Mohammad Heydari","Amir Albadvi"],"pdf_url":"https://arxiv.org/pdf/2403.06023v2.pdf","comment":"16 pages, 4 figures, 14 tables"},{"id":"http://arxiv.org/abs/2405.19534v2","updated":"2024-09-03T19:37:27Z","published":"2024-05-29T21:29:44Z","title":"Preference Learning Algorithms Do Not Learn Preference Rankings","summary":" Preference learning algorithms (e.g., RLHF and DPO) are frequently used to\nsteer LLMs to produce generations that are more preferred by humans, but our\nunderstanding of their inner workings is still limited. In this work, we study\nthe conventional wisdom that preference learning trains models to assign higher\nlikelihoods to more preferred outputs than less preferred outputs, measured via\n$\\textit{ranking accuracy}$. Surprisingly, we find that most state-of-the-art\npreference-tuned models achieve a ranking accuracy of less than 60% on common\npreference datasets. We furthermore derive the $\\textit{idealized ranking\naccuracy}$ that a preference-tuned LLM would achieve if it optimized the DPO or\nRLHF objective perfectly. We demonstrate that existing models exhibit a\nsignificant $\\textit{alignment gap}$ -- $\\textit{i.e.}$, a gap between the\nobserved and idealized ranking accuracies. We attribute this discrepancy to the\nDPO objective, which is empirically and theoretically ill-suited to fix even\nmild ranking errors in the reference model, and derive a simple and efficient\nformula for quantifying the difficulty of learning a given preference\ndatapoint. Finally, we demonstrate that ranking accuracy strongly correlates\nwith the empirically popular win rate metric when the model is close to the\nreference model used in the objective, shedding further light on the\ndifferences between on-policy (e.g., RLHF) and off-policy (e.g., DPO)\npreference learning algorithms.\n","authors":["Angelica Chen","Sadhika Malladi","Lily H. Zhang","Xinyi Chen","Qiuyi Zhang","Rajesh Ranganath","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2405.19534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02259v1","updated":"2024-09-03T19:34:25Z","published":"2024-09-03T19:34:25Z","title":"Optimal L-Systems for Stochastic L-system Inference Problems","summary":" This paper presents two novel theorems that address two open problems in\nstochastic Lindenmayer-system (L-system) inference, specifically focusing on\nthe construction of an optimal stochastic L-system capable of generating a\ngiven sequence of strings. The first theorem delineates a method for crafting a\nstochastic L-system that maximizes the likelihood of producing a given sequence\nof words through a singular derivation. Furthermore, the second theorem\ndetermines the stochastic L-systems with the highest probability of producing a\ngiven sequence of words with multiple possible derivations. From these, we\nintroduce an algorithm to infer an optimal stochastic L-system from a given\nsequence. This algorithm incorporates sophisticated optimization techniques,\nsuch as interior point methods, ensuring production of a stochastically optimal\nstochastic L-system suitable for generating the given sequence. 
This allows for\nthe use of stochastic L-systems as models for machine learning using only\npositive data for training.\n","authors":["Ali Lotfi","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2409.02259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02257v1","updated":"2024-09-03T19:31:03Z","published":"2024-09-03T19:31:03Z","title":"MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in\n LLMs","summary":" Existing benchmarks for large language models (LLMs) increasingly struggle to\ndifferentiate between top-performing models, underscoring the need for more\nchallenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced\nbenchmark building upon MMLU-Pro to assess shortcut learning and higher-order\nreasoning in LLMs. By incorporating questions with multiple correct answers\nacross diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex\nreasoning and resist simplistic problem-solving strategies. Our results show\nthat MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous\ntest of model discrimination, particularly in multi-correct answer scenarios.\nWe introduce novel metrics like shortcut selection ratio and correct pair\nidentification ratio, offering deeper insights into model behavior and\nanchoring bias. Evaluations of five state-of-the-art LLMs reveal significant\nperformance gaps, highlighting variations in reasoning abilities and bias\nsusceptibility. We release the dataset and evaluation codes at\n\\url{https://github.com/asgsaeid/mmlu-pro-plus}.\n","authors":["Saeid Asgari Taghanaki","Aliasgahr Khani","Amir Khasahmadi"],"pdf_url":"https://arxiv.org/pdf/2409.02257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16994v2","updated":"2024-09-03T19:28:39Z","published":"2024-07-24T04:27:55Z","title":"A Voter-Based Stochastic Rejection-Method Framework for Asymptotically\n Safe Language Model Outputs","summary":" This paper proposes a new method for preventing unsafe or otherwise low\nquality large language model (LLM) outputs, by leveraging the stochasticity of\nLLMs. We propose a system whereby LLM checkers vote on the acceptability of a\ngenerated output, regenerating it if a threshold of disapproval is reached,\nuntil sufficient checkers approve. We further propose estimators for cost and\nfailure rate, and based on those estimators and experimental data tailored to\nthe application, we propose an algorithm that achieves a desired failure rate\nat the least possible cost. We demonstrate that, under these models, failure\nrate decreases exponentially as a function of cost when voter count and\nthreshold are chosen according to the algorithm, and that the models reasonably\nestimate the actual performance of such a system in action, even with limited\ndata.\n","authors":["Jake R. Watts","Joel Sokol"],"pdf_url":"https://arxiv.org/pdf/2407.16994v2.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.00996v4","updated":"2024-09-03T19:27:19Z","published":"2023-10-02T08:58:29Z","title":"ARN: Analogical Reasoning on Narratives","summary":" As a core cognitive skill that enables the transferability of information\nacross domains, analogical reasoning has been extensively studied for both\nhumans and computational models. However, while cognitive theories of analogy\noften focus on narratives and study the distinction between surface,\nrelational, and system similarities, existing work in natural language\nprocessing has a narrower focus as far as relational analogies between word\npairs.
This gap brings a natural question: can state-of-the-art large language\nmodels (LLMs) detect system analogies between narratives? To gain insight into\nthis question and extend word-based relational analogies to relational system\nanalogies, we devise a comprehensive computational framework that\noperationalizes dominant theories of analogy, using narrative elements to\ncreate surface and system mappings. Leveraging the interplay between these\nmappings, we create a binary task and benchmark for Analogical Reasoning on\nNarratives (ARN), covering four categories of far (cross-domain)/near\n(within-domain) analogies and disanalogies. We show that while all LLMs can\nlargely recognize near analogies, even the largest ones struggle with far\nanalogies in a zero-shot setting, with GPT4.0 scoring below random. Guiding the\nmodels through solved examples and chain-of-thought reasoning enhances their\nanalogical reasoning ability. Yet, since even in the few-shot setting, the best\nmodel only performs halfway between random and humans, ARN opens exciting\ndirections for computational analogical reasoners.\n","authors":["Zhivar Sourati","Filip Ilievski","Pia Sommerauer","Yifan Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.00996v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02244v1","updated":"2024-09-03T19:19:13Z","published":"2024-09-03T19:19:13Z","title":"Therapy as an NLP Task: Psychologists' Comparison of LLMs and Human\n Peers in CBT","summary":" Wider access to therapeutic care is one of the biggest challenges in mental\nhealth treatment. Due to institutional barriers, some people seeking mental\nhealth support have turned to large language models (LLMs) for personalized\ntherapy, even though these models are largely unsanctioned and untested. We\ninvestigate the potential and limitations of using LLMs as providers of\nevidence-based therapy by using mixed methods clinical metrics. Using HELPERT,\na prompt run on a large language model using the same process and training as a\ncomparative group of peer counselors, we replicated publicly accessible mental\nhealth conversations rooted in Cognitive Behavioral Therapy (CBT) to compare\nsession dynamics and counselor's CBT-based behaviors between original peer\nsupport sessions and their reconstructed HELPERT sessions. Two licensed,\nCBT-trained clinical psychologists evaluated the sessions using the Cognitive\nTherapy Rating Scale and provided qualitative feedback. Our findings show that\nthe peer sessions are characterized by empathy, small talk, therapeutic\nalliance, and shared experiences but often exhibit therapist drift. Conversely,\nHELPERT reconstructed sessions exhibit minimal therapist drift and higher\nadherence to CBT methods but display a lack of collaboration, empathy, and\ncultural understanding. 
Through CTRS ratings and psychologists' feedback, we\nhighlight the importance of human-AI collaboration for scalable mental health.\nOur work outlines the ethical implication of imparting human-like subjective\nqualities to LLMs in therapeutic settings, particularly the risk of deceptive\nempathy, which may lead to unrealistic patient expectations and potential harm.\n","authors":["Zainab Iftikhar","Sean Ransom","Amy Xiao","Jeff Huang"],"pdf_url":"https://arxiv.org/pdf/2409.02244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02239v1","updated":"2024-09-03T19:11:15Z","published":"2024-09-03T19:11:15Z","title":"Temporal Order Preserved Optimal Transport-based Cross-modal Knowledge\n Transfer Learning for ASR","summary":" Transferring linguistic knowledge from a pretrained language model (PLM) to\nan acoustic model has been shown to greatly improve the performance of\nautomatic speech recognition (ASR). However, due to the heterogeneous feature\ndistributions in cross-modalities, designing an effective model for feature\nalignment and knowledge transfer between linguistic and acoustic sequences\nremains a challenging task. Optimal transport (OT), which efficiently measures\nprobability distribution discrepancies, holds great potential for aligning and\ntransferring knowledge between acoustic and linguistic modalities. Nonetheless,\nthe original OT treats acoustic and linguistic feature sequences as two\nunordered sets in alignment and neglects temporal order information during OT\ncoupling estimation. Consequently, a time-consuming pretraining stage is\nrequired to learn a good alignment between the acoustic and linguistic\nrepresentations. In this paper, we propose a Temporal Order Preserved OT\n(TOT)-based Cross-modal Alignment and Knowledge Transfer (CAKT) (TOT-CAKT) for\nASR. In the TOT-CAKT, local neighboring frames of acoustic sequences are\nsmoothly mapped to neighboring regions of linguistic sequences, preserving\ntheir temporal order relationship in feature alignment and matching. With the\nTOT-CAKT model framework, we conduct Mandarin ASR experiments with a pretrained\nChinese PLM for linguistic knowledge transfer. Our results demonstrate that the\nproposed TOT-CAKT significantly improves ASR performance compared to several\nstate-of-the-art models employing linguistic knowledge transfer, and addresses\nthe weaknesses of the original OT-based method in sequential feature alignment\nfor ASR.\n","authors":["Xugang Lu","Peng Shen","Yu Tsao","Hisashi Kawai"],"pdf_url":"https://arxiv.org/pdf/2409.02239v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2406.01981v2","updated":"2024-09-03T19:11:11Z","published":"2024-06-04T05:47:17Z","title":"Zyda: A 1.3T Dataset for Open Language Modeling","summary":" The size of large language models (LLMs) has scaled dramatically in recent\nyears and their computational and data requirements have surged\ncorrespondingly. State-of-the-art language models, even at relatively smaller\nsizes, typically require training on at least a trillion tokens. This rapid\nadvancement has eclipsed the growth of open-source datasets available for\nlarge-scale LLM pretraining. In this paper, we introduce Zyda (Zyphra Dataset),\na dataset under a permissive license comprising 1.3 trillion tokens, assembled\nby integrating several major respected open-source datasets into a single,\nhigh-quality corpus. 
We apply rigorous filtering and deduplication processes,\nboth within and across datasets, to maintain and enhance the quality derived\nfrom the original datasets. Our evaluations show that Zyda not only competes\nfavorably with other open datasets like Dolma, FineWeb, and RefinedWeb, but\nalso substantially improves the performance of comparable models from the\nPythia suite. Our rigorous data processing methods significantly enhance Zyda's\neffectiveness, outperforming even the best of its constituent datasets when\nused independently.\n","authors":["Yury Tokpanov","Beren Millidge","Paolo Glorioso","Jonathan Pilault","Adam Ibrahim","James Whittington","Quentin Anthony"],"pdf_url":"https://arxiv.org/pdf/2406.01981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16176v3","updated":"2024-09-03T19:08:18Z","published":"2023-10-24T20:48:11Z","title":"Correction with Backtracking Reduces Hallucination in Summarization","summary":" Abstractive summarization aims at generating natural language summaries of a\nsource document that are succinct while preserving the important elements.\nDespite recent advances, neural text summarization models are known to be\nsusceptible to hallucinating (or more correctly confabulating), that is to\nproduce summaries with details that are not grounded in the source document. In\nthis paper, we introduce a simple yet efficient technique, CoBa, to reduce\nhallucination in abstractive summarization. The approach is based on two steps:\nhallucination detection and mitigation. We show that the former can be achieved\nthrough measuring simple statistics about conditional word probabilities and\ndistance to context words. Further, we demonstrate that straight-forward\nbacktracking is surprisingly effective at mitigation. We thoroughly evaluate\nthe proposed method with prior art on three benchmark datasets for text\nsummarization. The results show that CoBa is effective and efficient in\nreducing hallucination, and offers great adaptability and flexibility. Code can\nbe found at https://github.com/zhenzhel/CoBa.\n","authors":["Zhenzhen Liu","Chao Wan","Varsha Kishore","Jin Peng Zhou","Minmin Chen","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2310.16176v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02228v1","updated":"2024-09-03T18:55:54Z","published":"2024-09-03T18:55:54Z","title":"Unforgettable Generalization in Language Models","summary":" When language models (LMs) are trained to forget (or \"unlearn'') a skill, how\nprecisely does their behavior change? We study the behavior of transformer LMs\nin which tasks have been forgotten via fine-tuning on randomized labels. Such\nLMs learn to generate near-random predictions for individual examples in the\n\"training'' set used for forgetting. Across tasks, however, LMs exhibit extreme\nvariability in whether LM predictions change on examples outside the training\nset. In some tasks (like entailment classification), forgetting generalizes\nrobustly, and causes models to produce uninformative predictions on new task\ninstances; in other tasks (like physical commonsense reasoning and scientific\nquestion answering) forgetting affects only the training examples, and models\ncontinue to perform the \"forgotten'' task accurately even for examples very\nsimilar to those that appeared in the training set. 
Dataset difficulty is not\npredictive of whether a behavior can be forgotten; instead, generalization in\nforgetting is (weakly) predicted by the confidence of LMs' initial task\npredictions and the variability of LM representations of training data, with\nlow confidence and low variability both associated with greater generalization.\nPerhaps most surprisingly, random-label forgetting appears to be somewhat\ninsensitive to the contents of the training set: for example, models trained on\nscience questions with random labels continue to answer other science questions\naccurately, but begin to produce random labels on entailment classification\ntasks. Finally, we show that even generalizable forgetting is shallow: linear\nprobes trained on LMs' representations can still perform tasks reliably after\nforgetting. Our results highlight the difficulty and unpredictability of\nperforming targeted skill removal from models via fine-tuning.\n","authors":["Eric Zhang","Leshem Chosen","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2409.02228v1.pdf","comment":"18 pages, 9 figures, published in First Conference on Language\n Modeling 2024"},{"id":"http://arxiv.org/abs/2409.02865v1","updated":"2024-09-03T17:59:50Z","published":"2024-09-03T17:59:50Z","title":"Visually Grounded Speech Models for Low-resource Languages and Cognitive\n Modelling","summary":" This dissertation examines visually grounded speech (VGS) models that learn\nfrom unlabelled speech paired with images. It focuses on applications for\nlow-resource languages and understanding human language acquisition. We\nintroduce a task called visually prompted keyword localisation to detect and\nlocalise keywords in speech using images. We demonstrate the effectiveness of\nVGS models in few-shot learning scenarios for low-resource languages like\nYoruba. Additionally, we examine the mutual exclusivity bias in VGS models. Our\nmonolingual VGS model exhibits this bias, but we found that multilingualism\ndoes not affect the bias in this VGS model similarly to what is observed in\nchildren.\n","authors":["Leanne Nortje"],"pdf_url":"https://arxiv.org/pdf/2409.02865v1.pdf","comment":"PhD Dissertation"},{"id":"http://arxiv.org/abs/2409.02098v1","updated":"2024-09-03T17:54:40Z","published":"2024-09-03T17:54:40Z","title":"CRAFT Your Dataset: Task-Specific Synthetic Dataset Generation Through\n Corpus Retrieval and Augmentation","summary":" Building high-quality datasets for specialized tasks is a time-consuming and\nresource-intensive process that often requires specialized domain knowledge. We\npropose Corpus Retrieval and Augmentation for Fine-Tuning (CRAFT), a method for\ngenerating synthetic datasets, given a small number of user-written few-shots\nthat demonstrate the task to be performed. Given the few-shot examples, we use\nlarge-scale public web-crawled corpora and similarity-based document retrieval\nto find other relevant human-written documents. Lastly, instruction-tuned large\nlanguage models (LLMs) augment the retrieved documents into custom-formatted\ntask samples, which then can be used for fine-tuning. We demonstrate that CRAFT\ncan efficiently generate large-scale task-specific training datasets for four\ndiverse tasks: biology question-answering (QA), medicine QA and commonsense QA\nas well as summarization. 
Our experiments show that CRAFT-based models\noutperform or achieve comparable performance to general LLMs for QA tasks,\nwhile CRAFT-based summarization models outperform models trained on\nhuman-curated data by 46 preference points.\n","authors":["Ingo Ziegler","Abdullatif Köksal","Desmond Elliott","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2409.02098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15444v2","updated":"2024-09-03T17:48:55Z","published":"2024-05-30T18:07:13Z","title":"Investigating the Robustness of LLMs on Math Word Problems","summary":" Large Language Models (LLMs) excel at various tasks, including solving math\nword problems (MWPs), but struggle with real-world problems containing\nirrelevant information. To address this, we propose a prompting framework that\ngenerates adversarial variants of MWPs by adding irrelevant variables. We\nintroduce a dataset, ProbleMATHIC, containing both adversarial and\nnon-adversarial MWPs. Our experiments reveal that LLMs are susceptible to\ndistraction by numerical noise, resulting in an average relative performance\ndrop of ~26% on adversarial MWPs. To mitigate this, we fine-tune LLMs (Llama-2,\nMistral) on the adversarial samples from our dataset. Fine-tuning on\nadversarial training instances improves performance on adversarial MWPs by ~8%,\nindicating increased robustness to noise and better ability to identify\nrelevant data for reasoning. Finally, to assess the generalizability of our\nprompting framework, we introduce GSM-8K-Adv, an adversarial variant of the\nGSM-8K benchmark. LLMs continue to struggle when faced with adversarial\ninformation, reducing performance by up to ~6%.\n","authors":["Ujjwala Anantheswaran","Himanshu Gupta","Kevin Scaria","Shreyas Verma","Chitta Baral","Swaroop Mishra"],"pdf_url":"https://arxiv.org/pdf/2406.15444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02078v1","updated":"2024-09-03T17:26:17Z","published":"2024-09-03T17:26:17Z","title":"Political DEBATE: Efficient Zero-shot and Few-shot Classifiers for\n Political Text","summary":" Social scientists quickly adopted large language models due to their ability\nto annotate documents without supervised training, an ability known as\nzero-shot learning. However, due to their compute demands, cost, and often\nproprietary nature, these models are often at odds with replication and open\nscience standards. This paper introduces the Political DEBATE (DeBERTa\nAlgorithm for Textual Entailment) language models for zero-shot and few-shot\nclassification of political documents. These models are not only as good, or\nbetter than, state-of-the art large language models at zero and few-shot\nclassification, but are orders of magnitude more efficient and completely open\nsource. By training the models on a simple random sample of 10-25 documents,\nthey can outperform supervised classifiers trained on hundreds or thousands of\ndocuments and state-of-the-art generative models with complex, engineered\nprompts. Additionally, we release the PolNLI dataset used to train these models\n-- a corpus of over 200,000 political documents with highly accurate labels\nacross over 800 classification tasks.\n","authors":["Michael Burnham","Kayla Kahn","Ryan Yank Wang","Rachel X. 
Peng"],"pdf_url":"https://arxiv.org/pdf/2409.02078v1.pdf","comment":"26 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02076v1","updated":"2024-09-03T17:25:54Z","published":"2024-09-03T17:25:54Z","title":"Spinning the Golden Thread: Benchmarking Long-Form Generation in\n Language Models","summary":" The abilities of long-context language models (LMs) are often evaluated using\nthe \"Needle-in-a-Haystack\" (NIAH) test, which comprises tasks designed to\nassess a model's ability to identify specific information (\"needle\") within\nlarge text sequences (\"haystack\"). While these benchmarks measure how well\nmodels understand long-context input sequences, they do not effectively gauge\nthe quality of long-form text generation--a critical aspect for applications\nsuch as design proposals and creative writing. To address this gap, we have\nintroduced a new long-form text evaluation benchmark, Spinning the Golden\nThread (SGT), which tests models' ability to identify specific events within\ngenerated long text sequences. In this benchmark, we prompt long-context LMs to\ncreate long-form text that must include particular events or constraints and\nevaluate their ability to incorporate these elements. We evaluated ten\nlong-context LMs across four distinct scenarios, three types of prompt\ninstructions, and two different generation-length settings (16K and 32K).\nAlthough these models perform well on NIAH benchmarks, none demonstrated\nsatisfactory performance on the Spinning the Golden Thread, raising concerns\nabout their ability to generate coherent long-form text that follows\ninstructions. Additionally, as the length of the generated text increases, all\nmodels exhibit a significant drop in performance.\n","authors":["Yuhao Wu","Ming Shan Hee","Zhiqing Hu","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2409.02076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02060v1","updated":"2024-09-03T17:08:20Z","published":"2024-09-03T17:08:20Z","title":"OLMoE: Open Mixture-of-Experts Language Models","summary":" We introduce OLMoE, a fully open, state-of-the-art language model leveraging\nsparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but\nuses only 1B per input token. We pretrain it on 5 trillion tokens and further\nadapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available\nmodels with similar active parameters, even surpassing larger ones like\nLlama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE\ntraining, analyze routing in our model showing high specialization, and\nopen-source all aspects of our work: model weights, training data, code, and\nlogs.\n","authors":["Niklas Muennighoff","Luca Soldaini","Dirk Groeneveld","Kyle Lo","Jacob Morrison","Sewon Min","Weijia Shi","Pete Walsh","Oyvind Tafjord","Nathan Lambert","Yuling Gu","Shane Arora","Akshita Bhagia","Dustin Schwenk","David Wadden","Alexander Wettig","Binyuan Hui","Tim Dettmers","Douwe Kiela","Ali Farhadi","Noah A. 
Smith","Pang Wei Koh","Amanpreet Singh","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2409.02060v1.pdf","comment":"61 pages (24 main), 36 figures, 14 tables"},{"id":"http://arxiv.org/abs/2409.02050v1","updated":"2024-09-03T16:53:38Z","published":"2024-09-03T16:53:38Z","title":"Enhancing Code-Switching Speech Recognition with LID-Based Collaborative\n Mixture of Experts Model","summary":" Due to the inherent difficulty in modeling phonetic similarities across\ndifferent languages, code-switching speech recognition presents a formidable\nchallenge. This study proposes a Collaborative-MoE, a Mixture of Experts (MoE)\nmodel that leverages a collaborative mechanism among expert groups. Initially,\na preceding routing network explicitly learns Language Identification (LID)\ntasks and selects experts based on acquired LID weights. This process ensures\nrobust routing information to the MoE layer, mitigating interference from\ndiverse language domains on expert network parameter updates. The LID weights\nare also employed to facilitate inter-group collaboration, enabling the\nintegration of language-specific representations. Furthermore, within each\nlanguage expert group, a gating network operates unsupervised to foster\ncollaboration on attributes beyond language. Extensive experiments demonstrate\nthe efficacy of our approach, achieving significant performance enhancements\ncompared to alternative methods. Importantly, our method preserves the\nefficient inference capabilities characteristic of MoE models without\nnecessitating additional pre-training.\n","authors":["Hukai Huang","Jiayan Lin","Kaidi Wang","Yishuang Li","Wenhao Guan","Qingyang Hong","Lin Li"],"pdf_url":"https://arxiv.org/pdf/2409.02050v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.02038v1","updated":"2024-09-03T16:37:45Z","published":"2024-09-03T16:37:45Z","title":"BEAVER: An Enterprise Benchmark for Text-to-SQL","summary":" Existing text-to-SQL benchmarks have largely been constructed using publicly\navailable tables from the web with human-generated tests containing question\nand SQL statement pairs. They typically show very good results and lead people\nto think that LLMs are effective at text-to-SQL tasks. In this paper, we apply\noff-the-shelf LLMs to a benchmark containing enterprise data warehouse data. In\nthis environment, LLMs perform poorly, even when standard prompt engineering\nand RAG techniques are utilized. As we will show, the reasons for poor\nperformance are largely due to three characteristics: (1) public LLMs cannot\ntrain on enterprise data warehouses because they are largely in the \"dark web\",\n(2) schemas of enterprise tables are more complex than the schemas in public\ndata, which leads the SQL-generation task innately harder, and (3)\nbusiness-oriented questions are often more complex, requiring joins over\nmultiple tables and aggregations. As a result, we propose a new dataset BEAVER,\nsourced from real enterprise data warehouses together with natural language\nqueries and their correct SQL statements which we collected from actual user\nhistory. We evaluated this dataset using recent LLMs and demonstrated their\npoor performance on this task. 
We hope this dataset will facilitate future\nresearchers building more sophisticated text-to-SQL systems which can do better\non this important class of data.\n","authors":["Peter Baile Chen","Fabian Wenz","Yi Zhang","Moe Kayali","Nesime Tatbul","Michael Cafarella","Çağatay Demiralp","Michael Stonebraker"],"pdf_url":"https://arxiv.org/pdf/2409.02038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05783v2","updated":"2024-09-03T16:23:55Z","published":"2024-04-08T17:53:21Z","title":"A Survey on Responsible Generative AI: What to Generate and What Not","summary":" In recent years, generative AI (GenAI), like large language models and\ntext-to-image models, has received significant attention across various\ndomains. However, ensuring the responsible generation of content by these\nmodels is crucial for their real-world applicability. This raises an\ninteresting question: What should responsible GenAI generate, and what should\nit not? To answer the question, this paper investigates the practical\nresponsible requirements of both textual and visual generative models,\noutlining five key considerations: generating truthful content, avoiding toxic\ncontent, refusing harmful instruction, leaking no training data-related\ncontent, and ensuring generated content identifiable. Specifically, we review\nrecent advancements and challenges in addressing these requirements. Besides,\nwe discuss and emphasize the importance of responsible GenAI across healthcare,\neducation, finance, and artificial general intelligence domains. Through a\nunified perspective on both textual and visual generative models, this paper\naims to provide insights into practical safety-related issues and further\nbenefit the community in building responsible GenAI.\n","authors":["Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.05783v2.pdf","comment":"77 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.02026v1","updated":"2024-09-03T16:20:22Z","published":"2024-09-03T16:20:22Z","title":"Foundations of Large Language Model Compression -- Part 1: Weight\n Quantization","summary":" In recent years, compression of large language models (LLMs) has emerged as\nan important problem to allow language model deployment on resource-constrained\ndevices, reduce computational costs, and mitigate the environmental footprint\nof large-scale AI infrastructure. In this paper, we present the foundations of\nLLM quantization from a convex optimization perspective and propose a\nquantization method that builds on these foundations and outperforms previous\nmethods. Our quantization framework, CVXQ, scales to models containing hundreds\nof billions of weight parameters and provides users with the flexibility to\ncompress models to any specified model size, post-training. A reference\nimplementation of CVXQ can be obtained from https://github.com/seannz/cvxq.\n","authors":["Sean I. Young"],"pdf_url":"https://arxiv.org/pdf/2409.02026v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2309.00649v2","updated":"2024-09-03T15:23:26Z","published":"2023-08-31T12:53:52Z","title":"GPT has become financially literate: Insights from financial literacy\n tests of GPT and a preliminary test of how people use it as a source of\n advice","summary":" We assess the ability of GPT -- a large language model -- to serve as a\nfinancial robo-advisor for the masses, by using a financial literacy test.\nDavinci and ChatGPT based on GPT-3.5 score 66% and 65% on the financial\nliteracy test, respectively, compared to a baseline of 33%. 
However, ChatGPT\nbased on GPT-4 achieves a near-perfect 99% score, pointing to financial\nliteracy becoming an emergent ability of state-of-the-art models. We use the\nJudge-Advisor System and a savings dilemma to illustrate how researchers might\nassess advice-utilization from large language models. We also present a number\nof directions for future research.\n","authors":["Paweł Niszczota","Sami Abbas"],"pdf_url":"https://arxiv.org/pdf/2309.00649v2.pdf","comment":"43 pages, 2 figures and 2 tables in main text; in V2 added\n information that this is the Author Accepted Manuscript version"},{"id":"http://arxiv.org/abs/2402.01115v4","updated":"2024-09-03T15:14:11Z","published":"2024-02-02T03:15:13Z","title":"Interpretation of Intracardiac Electrograms Through Textual\n Representations","summary":" Understanding the irregular electrical activity of atrial fibrillation (AFib)\nhas been a key challenge in electrocardiography. For serious cases of AFib,\ncatheter ablations are performed to collect intracardiac electrograms (EGMs).\nEGMs offer intricately detailed and localized electrical activity of the heart\nand are an ideal modality for interpretable cardiac studies. Recent\nadvancements in artificial intelligence (AI) has allowed some works to utilize\ndeep learning frameworks to interpret EGMs during AFib. Additionally, language\nmodels (LMs) have shown exceptional performance in being able to generalize to\nunseen domains, especially in healthcare. In this study, we are the first to\nleverage pretrained LMs for finetuning of EGM interpolation and AFib\nclassification via masked language modeling. We formulate the EGM as a textual\nsequence and present competitive performances on AFib classification compared\nagainst other representations. Lastly, we provide a comprehensive\ninterpretability study to provide a multi-perspective intuition of the model's\nbehavior, which could greatly benefit the clinical use.\n","authors":["William Jongwon Han","Diana Gomez","Avi Alok","Chaojing Duan","Michael A. Rosenberg","Douglas Weber","Emerson Liu","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01115v4.pdf","comment":"17 pages, 7 figures; Accepted to CHIL 2024"},{"id":"http://arxiv.org/abs/2409.01944v1","updated":"2024-09-03T14:40:31Z","published":"2024-09-03T14:40:31Z","title":"FuzzCoder: Byte-level Fuzzing Test via Large Language Model","summary":" Fuzzing is an important dynamic program analysis technique designed for\nfinding vulnerabilities in complex software. Fuzzing involves presenting a\ntarget program with crafted malicious input to cause crashes, buffer overflows,\nmemory errors, and exceptions. Crafting malicious inputs in an efficient manner\nis a difficult open problem and the best approaches often apply uniform random\nmutations to pre-existing valid inputs. In this work, we propose to adopt\nfine-tuned large language models (FuzzCoder) to learn patterns in the input\nfiles from successful attacks to guide future fuzzing explorations.\nSpecifically, we develop a framework to leverage the code LLMs to guide the\nmutation process of inputs in fuzzing. The mutation process is formulated as\nthe sequence-to-sequence modeling, where LLM receives a sequence of bytes and\nthen outputs the mutated byte sequence. FuzzCoder is fine-tuned on the created\ninstruction dataset (Fuzz-Instruct), where the successful fuzzing history is\ncollected from the heuristic fuzzing tool. 
FuzzCoder can predict mutation\nlocations and strategies in input files to trigger abnormal behaviors\nof the program. Experimental results show that FuzzCoder based on AFL (American\nFuzzy Lop) gains significant improvements in terms of effective proportion of\nmutation (EPM) and number of crashes (NC) for various input formats including\nELF, JPG, MP3, and XML.\n","authors":["Liqun Yang","Jian Yang","Chaoren Wei","Guanglin Niu","Ge Zhang","Yunli Wang","Linzheng ChaI","Wanxu Xia","Hongcheng Guo","Shun Zhang","Jiaheng Liu","Yuwei Yin","Junran Peng","Jiaxin Ma","Liang Sun","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2409.01944v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.01941v1","updated":"2024-09-03T14:38:29Z","published":"2024-09-03T14:38:29Z","title":"Towards Leveraging Large Language Models for Automated Medical Q&A\n Evaluation","summary":" This paper explores the potential of using Large Language Models (LLMs) to\nautomate the evaluation of responses in medical Question and Answer (Q\\&A)\nsystems, a crucial form of Natural Language Processing. Traditionally, human\nevaluation has been indispensable for assessing the quality of these responses.\nHowever, manual evaluation by medical professionals is time-consuming and\ncostly. Our study examines whether LLMs can reliably replicate human\nevaluations by using questions derived from patient data, thereby saving\nvaluable time for medical experts. While the findings suggest promising\nresults, further research is needed to address more specific or complex\nquestions that were beyond the scope of this initial investigation.\n","authors":["Jack Krolik","Herprit Mahal","Feroz Ahmad","Gaurav Trivedi","Bahador Saket"],"pdf_url":"https://arxiv.org/pdf/2409.01941v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.00267v3","updated":"2024-09-03T14:01:54Z","published":"2023-09-01T05:53:33Z","title":"RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with\n AI Feedback","summary":" Reinforcement learning from human feedback (RLHF) has proven effective in\naligning large language models (LLMs) with human preferences, but gathering\nhigh-quality preference labels is expensive. RL from AI Feedback (RLAIF),\nintroduced in Bai et al., offers a promising alternative that trains the reward\nmodel (RM) on preferences generated by an off-the-shelf LLM. Across the tasks\nof summarization, helpful dialogue generation, and harmless dialogue\ngeneration, we show that RLAIF achieves comparable performance to RLHF.\nFurthermore, we take a step towards \"self-improvement\" by demonstrating that\nRLAIF can outperform a supervised fine-tuned baseline even when the AI labeler\nis the same size as the policy, or even the exact same checkpoint as the\ninitial policy. Finally, we introduce direct-RLAIF (d-RLAIF) - a technique that\ncircumvents RM training by obtaining rewards directly from an off-the-shelf LLM\nduring RL, which achieves superior performance to canonical RLAIF.
Our results\nsuggest that RLAIF can achieve performance on-par with using human feedback,\noffering a potential solution to the scalability limitations of RLHF.\n","authors":["Harrison Lee","Samrat Phatale","Hassan Mansoor","Thomas Mesnard","Johan Ferret","Kellie Lu","Colton Bishop","Ethan Hall","Victor Carbune","Abhinav Rastogi","Sushant Prakash"],"pdf_url":"https://arxiv.org/pdf/2309.00267v3.pdf","comment":"Presented at ICML 2024"},{"id":"http://arxiv.org/abs/2409.01901v1","updated":"2024-09-03T13:44:56Z","published":"2024-09-03T13:44:56Z","title":"3D-LEX v1.0: 3D Lexicons for American Sign Language and Sign Language of\n the Netherlands","summary":" In this work, we present an efficient approach for capturing sign language in\n3D, introduce the 3D-LEX v1.0 dataset, and detail a method for semi-automatic\nannotation of phonetic properties. Our procedure integrates three motion\ncapture techniques encompassing high-resolution 3D poses, 3D handshapes, and\ndepth-aware facial features, and attains an average sampling rate of one sign\nevery 10 seconds. This includes the time for presenting a sign example,\nperforming and recording the sign, and archiving the capture. The 3D-LEX\ndataset includes 1,000 signs from American Sign Language and an additional\n1,000 signs from the Sign Language of the Netherlands. We showcase the dataset\nutility by presenting a simple method for generating handshape annotations\ndirectly from 3D-LEX. We produce handshape labels for 1,000 signs from American\nSign Language and evaluate the labels in a sign recognition task. The labels\nenhance gloss recognition accuracy by 5% over using no handshape annotations,\nand by 1% over expert annotations. Our motion capture data supports in-depth\nanalysis of sign features and facilitates the generation of 2D projections from\nany viewpoint. The 3D-LEX collection has been aligned with existing sign\nlanguage benchmarks and linguistic resources, to support studies in 3D-aware\nsign language processing.\n","authors":["Oline Ranum","Gomer Otterspeer","Jari I. Andersen","Robert G. Belleman","Floris Roelofsen"],"pdf_url":"https://arxiv.org/pdf/2409.01901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01893v1","updated":"2024-09-03T13:30:00Z","published":"2024-09-03T13:30:00Z","title":"What are the Essential Factors in Crafting Effective Long Context\n Multi-Hop Instruction Datasets? Insights and Best Practices","summary":" Recent advancements in large language models (LLMs) with extended context\nwindows have significantly improved tasks such as information extraction,\nquestion answering, and complex planning scenarios. In order to achieve success\nin long context tasks, a large amount of work has been done to enhance the long\ncontext capabilities of the model through synthetic data. Existing methods\ntypically utilize the Self-Instruct framework to generate instruction tuning\ndata for better long context capability improvement. However, our preliminary\nexperiments indicate that less than 35% of generated samples are multi-hop, and\nmore than 40% exhibit poor quality, limiting comprehensive understanding and\nfurther research. To improve the quality of synthetic data, we propose the\nMulti-agent Interactive Multi-hop Generation (MIMG) framework, incorporating a\nQuality Verification Agent, a Single-hop Question Generation Agent, a Multiple\nQuestion Sampling Strategy, and a Multi-hop Question Merger Agent. 
This\nframework improves the data quality, with the proportion of high-quality,\nmulti-hop, and diverse data exceeding 85%. Furthermore, we systematically\ninvestigate strategies for document selection, question merging, and validation\ntechniques through extensive experiments across various models. Our findings\nshow that our synthetic high-quality long-context instruction data\nsignificantly enhances model performance, even surpassing models trained on\nlarger amounts of human-annotated data. Our code is available at:\nhttps://github.com/WowCZ/LongMIT.\n","authors":["Zhi Chen","Qiguang Chen","Libo Qin","Qipeng Guo","Haijun Lv","Yicheng Zou","Wanxiang Che","Hang Yan","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2409.01893v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2409.01882v1","updated":"2024-09-03T13:23:11Z","published":"2024-09-03T13:23:11Z","title":"Investigating Expert-in-the-Loop LLM Discourse Patterns for Ancient\n Intertextual Analysis","summary":" This study explores the potential of large language models (LLMs) for\nidentifying and examining intertextual relationships within biblical, Koine\nGreek texts. By evaluating the performance of LLMs on various intertextuality\nscenarios the study demonstrates that these models can detect direct\nquotations, allusions, and echoes between texts. The LLM's ability to generate\nnovel intertextual observations and connections highlights its potential to\nuncover new insights. However, the model also struggles with long query\npassages and the inclusion of false intertextual dependences, emphasizing the\nimportance of expert evaluation. The expert-in-the-loop methodology presented\noffers a scalable approach for intertextual research into the complex web of\nintertextuality within and beyond the biblical corpus.\n","authors":["Ray Umphrey","Jesse Roberts","Lindsey Roberts"],"pdf_url":"https://arxiv.org/pdf/2409.01882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01864v1","updated":"2024-09-03T13:05:38Z","published":"2024-09-03T13:05:38Z","title":"The Role of Large Language Models in Musicology: Are We Ready to Trust\n the Machines?","summary":" In this work, we explore the use and reliability of Large Language Models\n(LLMs) in musicology. From a discussion with experts and students, we assess\nthe current acceptance and concerns regarding this, nowadays ubiquitous,\ntechnology. We aim to go one step further, proposing a semi-automatic method to\ncreate an initial benchmark using retrieval-augmented generation models and\nmultiple-choice question generation, validated by human experts. Our evaluation\non 400 human-validated questions shows that current vanilla LLMs are less\nreliable than retrieval augmented generation from music dictionaries. 
This\npaper suggests that the potential of LLMs in musicology requires musicology-driven\nresearch that can specialize LLMs by including accurate and reliable\ndomain knowledge.\n","authors":["Pedro Ramoneda","Emilia Parada-Cabaleiro","Benno Weck","Xavier Serra"],"pdf_url":"https://arxiv.org/pdf/2409.01864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01854v1","updated":"2024-09-03T12:53:05Z","published":"2024-09-03T12:53:05Z","title":"AgentRE: An Agent-Based Framework for Navigating Complex Information\n Landscapes in Relation Extraction","summary":" Relation extraction (RE) in complex scenarios faces challenges such as\ndiverse relation types and ambiguous relations between entities within a single\nsentence, leading to the poor performance of pure \"text-in, text-out\" language\nmodels (LMs). To address these challenges, in this paper, we propose an\nagent-based RE framework, namely AgentRE, which fully leverages the potential\nof large language models (LLMs) including memory, retrieval and reflection, to\nachieve RE in complex scenarios. Specifically, three major modules are built in\nAgentRE serving as the tools to help the agent acquire and process various\nuseful information, thereby obtaining improved RE performance. Our extensive\nexperimental results on two datasets in English and Chinese demonstrate our\nAgentRE's superior performance, especially in low-resource scenarios.\nAdditionally, the trajectories generated by AgentRE can be refined to construct\na high-quality training dataset incorporating different reasoning methods,\nwhich can be used to fine-tune smaller models. Code is available at\nhttps://github.com/Lightblues/AgentRE.\n","authors":["Yuchen Shi","Guochao Jiang","Tian Qiu","Deqing Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01854v1.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2409.01835v1","updated":"2024-09-03T12:34:21Z","published":"2024-09-03T12:34:21Z","title":"Towards Generative Class Prompt Learning for Few-shot Visual Recognition","summary":" Although foundational vision-language models (VLMs) have proven to be very\nsuccessful for various semantic discrimination tasks, they still struggle to\nperform faithfully for fine-grained categorization. Moreover, foundational\nmodels trained on one domain do not generalize well on a different domain\nwithout fine-tuning. We attribute these to the limitations of the VLM's\nsemantic representations and attempt to improve their fine-grained visual\nawareness using generative modeling. Specifically, we propose two novel\nmethods: Generative Class Prompt Learning (GCPL) and Contrastive Multi-class\nPrompt Learning (CoMPLe). Utilizing text-to-image diffusion models, GCPL\nsignificantly improves the visio-linguistic synergy in class embeddings by\nconditioning on few-shot exemplars with learnable class prompts. CoMPLe builds\non this foundation by introducing a contrastive learning component that\nencourages inter-class separation during the generative optimization process.\nOur empirical results demonstrate that such a generative class prompt learning\napproach substantially outperforms existing methods, offering a better\nalternative to few-shot image recognition challenges. 
The source code will be\nmade available at: https://github.com/soumitri2001/GCPL.\n","authors":["Soumitri Chattopadhyay","Sanket Biswas","Emanuele Vivoli","Josep Lladós"],"pdf_url":"https://arxiv.org/pdf/2409.01835v1.pdf","comment":"Accepted at BMVC 2024"},{"id":"http://arxiv.org/abs/2409.01808v1","updated":"2024-09-03T11:40:38Z","published":"2024-09-03T11:40:38Z","title":"Dialogue You Can Trust: Human and AI Perspectives on Generated\n Conversations","summary":" As dialogue systems and chatbots increasingly integrate into everyday\ninteractions, the need for efficient and accurate evaluation methods becomes\nparamount. This study explores the comparative performance of human and AI\nassessments across a range of dialogue scenarios, focusing on seven key\nperformance indicators (KPIs): Coherence, Innovation, Concreteness, Goal\nContribution, Commonsense Contradiction, Incorrect Fact, and Redundancy.\nUtilizing the GPT-4o API, we generated a diverse dataset of conversations and\nconducted a two-part experimental analysis. In Experiment 1, we evaluated\nmulti-party conversations on Coherence, Innovation, Concreteness, and Goal\nContribution, revealing that GPT models align closely with human judgments.\nNotably, both human and AI evaluators exhibited a tendency towards binary\njudgment rather than linear scaling, highlighting a shared challenge in these\nassessments. Experiment 2 extended the work of Finch et al. (2023) by focusing\non dyadic dialogues and assessing Commonsense Contradiction, Incorrect Fact,\nand Redundancy. The results indicate that while GPT-4o demonstrates strong\nperformance in maintaining factual accuracy and commonsense reasoning, it still\nstruggles with reducing redundancy and self-contradiction. Our findings\nunderscore the potential of GPT models to closely replicate human evaluation in\ndialogue systems, while also pointing to areas for improvement. This research\noffers valuable insights for advancing the development and implementation of\nmore refined dialogue evaluation methodologies, contributing to the evolution\nof more effective and human-like AI communication tools.\n","authors":["Ike Ebubechukwu","Johane Takeuchi","Antonello Ceravola","Frank Joublin"],"pdf_url":"https://arxiv.org/pdf/2409.01808v1.pdf","comment":"17 pages, 15 figures, shorter version submitted to 22nd Annual\n Workshop of the Australasian Language Technology Association (ALTA'24)"},{"id":"http://arxiv.org/abs/2409.01806v1","updated":"2024-09-03T11:39:52Z","published":"2024-09-03T11:39:52Z","title":"LASP: Surveying the State-of-the-Art in Large Language Model-Assisted AI\n Planning","summary":" Effective planning is essential for the success of any task, from organizing\na vacation to routing autonomous vehicles and developing corporate strategies.\nIt involves setting goals, formulating plans, and allocating resources to\nachieve them. LLMs are particularly well-suited for automated planning due to\ntheir strong capabilities in commonsense reasoning. They can deduce a sequence\nof actions needed to achieve a goal from a given state and identify an\neffective course of action. However, it is frequently observed that plans\ngenerated through direct prompting often fail upon execution. Our survey aims\nto highlight the existing challenges in planning with language models, focusing\non key areas such as embodied environments, optimal scheduling, competitive and\ncooperative games, task decomposition, reasoning, and planning. 
Through this\nstudy, we explore how LLMs transform AI planning and provide unique insights\ninto the future of LM-assisted planning.\n","authors":["Haoming Li","Zhaoliang Chen","Jonathan Zhang","Fei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01790v1","updated":"2024-09-03T11:09:44Z","published":"2024-09-03T11:09:44Z","title":"Training on the Benchmark Is Not All You Need","summary":" The success of Large Language Models (LLMs) relies heavily on the huge amount\nof pre-training data learned in the pre-training phase. The opacity of the\npre-training process and the training data causes the results of many benchmark\ntests to become unreliable. If any model has been trained on a benchmark test\nset, it can seriously hinder the health of the field. In order to automate and\nefficiently test the capabilities of large language models, numerous mainstream\nbenchmarks adopt a multiple-choice format. As the swapping of the contents of\nmultiple-choice options does not affect the meaning of the question itself, we\npropose a simple and effective data leakage detection method based on this\nproperty. Specifically, we shuffle the contents of the options in the data to\ngenerate the corresponding derived data sets, and then detect data leakage\nbased on the model's log probability distribution over the derived data sets.\nIf the maximum of the set of log probabilities is an outlier, it indicates\nthat the data is leaked. Our method is able to work under black-box conditions\nwithout access to model training data or weights, effectively identifying data\nleakage from benchmark test sets in model pre-training data, including both\nnormal scenarios and complex scenarios where options may have been shuffled\nintentionally or unintentionally. Through experiments based on two LLMs and\nbenchmark designs, we demonstrate the effectiveness of our method. In addition,\nwe evaluate the degree of data leakage of 31 mainstream open-source LLMs on\nfour benchmark datasets and give a ranking of the leaked LLMs for each\nbenchmark, and we find that the Qwen family of LLMs has the highest degree of\ndata leakage.\n","authors":["Shiwen Ni","Xiangtao Kong","Chengming Li","Xiping Hu","Ruifeng Xu","Jia Zhu","Min Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01787v1","updated":"2024-09-03T11:06:45Z","published":"2024-09-03T11:06:45Z","title":"LLM-GAN: Construct Generative Adversarial Network Through Large Language\n Models For Explainable Fake News Detection","summary":" Explainable fake news detection predicts the authenticity of news items with\nannotated explanations. Today, Large Language Models (LLMs) are known for their\npowerful natural language understanding and explanation generation abilities.\nHowever, applying LLMs to explainable fake news detection presents two main\nchallenges. Firstly, fake news appears reasonable and could easily mislead\nLLMs, leaving them unable to understand the complex news-faking process.\nSecondly, utilizing LLMs for this task would generate both correct and\nincorrect explanations, which necessitates abundant labor in the loop. In this\npaper, we propose LLM-GAN, a novel framework that utilizes prompting mechanisms\nto enable an LLM to act as both Generator and Detector for realistic fake news\ngeneration and detection. Our results demonstrate LLM-GAN's effectiveness in\nboth prediction performance and explanation quality. 
We further showcase the\nintegration of LLM-GAN to a cloud-native AI platform to provide better fake\nnews detection service in the cloud.\n","authors":["Yifeng Wang","Zhouhong Gu","Siwei Zhang","Suhang Zheng","Tao Wang","Tianyu Li","Hongwei Feng","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.01787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01780v1","updated":"2024-09-03T10:49:42Z","published":"2024-09-03T10:49:42Z","title":"State-of-the-art Advances of Deep-learning Linguistic Steganalysis\n Research","summary":" With the evolution of generative linguistic steganography techniques,\nconventional steganalysis falls short in robustly quantifying the alterations\ninduced by steganography, thereby complicating detection. Consequently, the\nresearch paradigm has pivoted towards deep-learning-based linguistic\nsteganalysis. This study offers a comprehensive review of existing\ncontributions and evaluates prevailing developmental trajectories.\nSpecifically, we first provided a formalized exposition of the general formulas\nfor linguistic steganalysis, while comparing the differences between this field\nand the domain of text classification. Subsequently, we classified the existing\nwork into two levels based on vector space mapping and feature extraction\nmodels, thereby comparing the research motivations, model advantages, and other\ndetails. A comparative analysis of the experiments is conducted to assess the\nperformances. Finally, the challenges faced by this field are discussed, and\nseveral directions for future development and key issues that urgently need to\nbe addressed are proposed.\n","authors":["Yihao Wang","Ru Zhang","Yifan Tang","Jianyi Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01780v1.pdf","comment":"Accepted by 2023 International Conference on Data, Information and\n Computing Science"},{"id":"http://arxiv.org/abs/2409.01763v1","updated":"2024-09-03T10:16:43Z","published":"2024-09-03T10:16:43Z","title":"FC-KAN: Function Combinations in Kolmogorov-Arnold Networks","summary":" In this paper, we introduce FC-KAN, a Kolmogorov-Arnold Network (KAN) that\nleverages combinations of popular mathematical functions such as B-splines,\nwavelets, and radial basis functions on low-dimensional data through\nelement-wise operations. We explore several methods for combining the outputs\nof these functions, including sum, element-wise product, the addition of sum\nand element-wise product, quadratic function representation, and concatenation.\nIn our experiments, we compare FC-KAN with multi-layer perceptron network (MLP)\nand other existing KANs, such as BSRBF-KAN, EfficientKAN, FastKAN, and\nFasterKAN, on the MNIST and Fashion-MNIST datasets. A variant of FC-KAN, which\nuses a combination of outputs from B-splines and Difference of Gaussians (DoG)\nin the form of a quadratic function, outperformed all other models on the\naverage of 5 independent training runs. We expect that FC-KAN can leverage\nfunction combinations to design future KANs. 
Our repository is publicly\navailable at: https://github.com/hoangthangta/FC_KAN.\n","authors":["Hoang-Thang Ta","Duy-Quy Thai","Abu Bakar Siddiqur Rahman","Grigori Sidorov","Alexander Gelbukh"],"pdf_url":"https://arxiv.org/pdf/2409.01763v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.01754v1","updated":"2024-09-03T10:01:51Z","published":"2024-09-03T10:01:51Z","title":"Empirical evidence of Large Language Model's influence on human spoken\n communication","summary":" Artificial Intelligence (AI) agents now interact with billions of humans in\nnatural language, thanks to advances in Large Language Models (LLMs) like\nChatGPT. This raises the question of whether AI has the potential to shape a\nfundamental aspect of human culture: the way we speak. Recent analyses revealed\nthat scientific publications already exhibit evidence of AI-specific language.\nBut this evidence is inconclusive, since scientists may simply be using AI to\ncopy-edit their writing. To explore whether AI has influenced human spoken\ncommunication, we transcribed and analyzed about 280,000 English-language\nvideos of presentations, talks, and speeches from more than 20,000 YouTube\nchannels of academic institutions. We find a significant shift in the trend of\nword usage specific to words distinctively associated with ChatGPT following\nits release. These findings provide the first empirical evidence that humans\nincreasingly imitate LLMs in their spoken language. Our results raise societal\nand policy-relevant concerns about the potential of AI to unintentionally\nreduce linguistic diversity, or to be deliberately misused for mass\nmanipulation. They also highlight the need for further investigation into the\nfeedback loops between machine behavior and human culture.\n","authors":["Hiromu Yakura","Ezequiel Lopez-Lopez","Levin Brinkmann","Ignacio Serna","Prateek Gupta","Iyad Rahwan"],"pdf_url":"https://arxiv.org/pdf/2409.01754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01690v1","updated":"2024-09-03T08:13:06Z","published":"2024-09-03T08:13:06Z","title":"Taming CLIP for Fine-grained and Structured Visual Understanding of\n Museum Exhibits","summary":" CLIP is a powerful and widely used tool for understanding images in the\ncontext of natural language descriptions to perform nuanced tasks. However, it\ndoes not offer application-specific fine-grained and structured understanding,\ndue to its generic nature. In this work, we aim to adapt CLIP for fine-grained\nand structured -- in the form of tabular data -- visual understanding of museum\nexhibits. To facilitate such understanding we (a) collect, curate, and\nbenchmark a dataset of 200K+ image-table pairs, and (b) develop a method that\nallows predicting tabular outputs for input images. Our dataset is the first of\nits kind in the public domain. At the same time, the proposed method is novel\nin leveraging CLIP's powerful representations for fine-grained and tabular\nunderstanding. The proposed method (MUZE) learns to map CLIP's image embeddings\nto the tabular structure by means of a proposed transformer-based parsing\nnetwork (parseNet). More specifically, parseNet enables prediction of missing\nattribute values while integrating context from known attribute-value pairs for\nan input image. 
We show that this leads to significant improvement in accuracy.\nThrough exhaustive experiments, we show the effectiveness of the proposed\nmethod on fine-grained and structured understanding of museum exhibits, by\nachieving encouraging results in a newly established benchmark. Our dataset and\nsource-code can be found at: https://github.com/insait-institute/MUZE\n","authors":["Ada-Astrid Balauca","Danda Pani Paudel","Kristina Toutanova","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2409.01690v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2409.01666v1","updated":"2024-09-03T07:17:41Z","published":"2024-09-03T07:17:41Z","title":"In Defense of RAG in the Era of Long-Context Language Models","summary":" Overcoming the context-length limitations of early-generation LLMs,\nretrieval-augmented generation (RAG) has been a reliable solution for\ncontext-based answer generation in the past. Recently, the emergence of\nlong-context LLMs allows the models to incorporate much longer text sequences,\nmaking RAG less attractive. Recent studies show that long-context LLMs\nsignificantly outperform RAG in long-context applications. Unlike the existing\nworks favoring the long-context LLM over RAG, we argue that the extremely long\ncontext in LLMs suffers from a diminished focus on relevant information and\nleads to potential degradation in answer quality. This paper revisits RAG\nin long-context answer generation. We propose an order-preserve\nretrieval-augmented generation (OP-RAG) mechanism, which significantly improves\nthe performance of RAG for long-context question-answer applications. With\nOP-RAG, as the number of retrieved chunks increases, the answer quality\ninitially rises, and then declines, forming an inverted U-shaped curve. There\nexist sweet points where OP-RAG could achieve higher answer quality with far\nfewer tokens than a long-context LLM taking the whole context as input. Extensive\nexperiments on public benchmarks demonstrate the superiority of our OP-RAG.\n","authors":["Tan Yu","Anbang Xu","Rama Akkiraju"],"pdf_url":"https://arxiv.org/pdf/2409.01666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01659v1","updated":"2024-09-03T07:01:46Z","published":"2024-09-03T07:01:46Z","title":"Interpreting and Improving Large Language Models in Arithmetic\n Calculation","summary":" Large language models (LLMs) have demonstrated remarkable potential across\nnumerous applications and have shown an emergent ability to tackle complex\nreasoning tasks, such as mathematical computations. However, even for the\nsimplest arithmetic calculations, the intrinsic mechanisms behind LLMs remain\nmysterious, making it challenging to ensure reliability. In this work, we delve\ninto uncovering a specific mechanism by which LLMs execute calculations.\nThrough comprehensive experiments, we find that LLMs frequently involve a small\nfraction (< 5%) of attention heads, which play a pivotal role in focusing on\noperands and operators during calculation processes. Subsequently, the\ninformation from these operands is processed through multi-layer perceptrons\n(MLPs), progressively leading to the final solution. These pivotal heads/MLPs,\nthough identified on a specific dataset, exhibit transferability across\ndifferent datasets and even distinct tasks. This insight prompted us to\ninvestigate the potential benefits of selectively fine-tuning these essential\nheads/MLPs to boost the LLMs' computational performance. 
We empirically find\nthat such precise tuning can yield notable enhancements in mathematical\nprowess, without compromising the performance on non-mathematical tasks. Our\nwork serves as a preliminary exploration into the arithmetic calculation\nabilities inherent in LLMs, laying a solid foundation for revealing more intricate\nmathematical tasks.\n","authors":["Wei Zhang","Chaoqun Wan","Yonggang Zhang","Yiu-ming Cheung","Xinmei Tian","Xu Shen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2409.01659v1.pdf","comment":"Accepted by ICML 2024 (oral)"},{"id":"http://arxiv.org/abs/2409.01658v1","updated":"2024-09-03T07:01:37Z","published":"2024-09-03T07:01:37Z","title":"From Yes-Men to Truth-Tellers: Addressing Sycophancy in Large Language\n Models with Pinpoint Tuning","summary":" Large Language Models (LLMs) tend to prioritize adherence to user prompts\nover providing veracious responses, leading to the sycophancy issue. When\nchallenged by users, LLMs tend to admit mistakes and provide inaccurate\nresponses even if they initially provided the correct answer. Recent works\npropose to employ supervised fine-tuning (SFT) to mitigate the sycophancy\nissue, but it typically leads to the degeneration of LLMs' general\ncapability. To address the challenge, we propose a novel supervised pinpoint\ntuning (SPT), where the region-of-interest modules are tuned for a given\nobjective. Specifically, SPT first reveals and verifies a small percentage\n(<5%) of the basic modules, which significantly affect a particular behavior of\nLLMs, i.e., sycophancy. Subsequently, SPT merely fine-tunes these identified\nmodules while freezing the rest. To verify the effectiveness of the proposed\nSPT, we conduct comprehensive experiments, demonstrating that SPT significantly\nmitigates the sycophancy issue of LLMs (even better than SFT). Moreover, SPT\nintroduces limited or even no side effects on the general capability of LLMs.\nOur results shed light on how to precisely, effectively, and efficiently\nexplain and improve the targeted ability of LLMs.\n","authors":["Wei Chen","Zhen Huang","Liang Xie","Binbin Lin","Houqiang Li","Le Lu","Xinmei Tian","Deng Cai","Yonggang Zhang","Wenxiao Wan","Xu Shen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2409.01658v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2409.01628v1","updated":"2024-09-03T05:53:57Z","published":"2024-09-03T05:53:57Z","title":"CTG-KrEW: Generating Synthetic Structured Contextually Correlated\n Content by Conditional Tabular GAN with K-Means Clustering and Efficient Word\n Embedding","summary":" Conditional Tabular Generative Adversarial Networks (CTGAN) and their various\nderivatives are attractive for their ability to efficiently and flexibly create\nsynthetic tabular data, showcasing strong performance and adaptability.\nHowever, there are certain critical limitations to such models. The first is\ntheir inability to preserve the semantic integrity of contextually correlated\nwords or phrases. For instance, skillset in freelancer profiles is one such\nattribute where individual skills are semantically interconnected and\nindicative of specific domain interests or qualifications. The second challenge\nof traditional approaches is that, when applied to generate contextually\ncorrelated tabular content, besides generating semantically shallow content,\nthey consume huge memory resources and CPU time during the training stage. 
To\naddress these problems, we introduce a novel framework, CTGKrEW (Conditional\nTabular GAN with KMeans Clustering and Word Embedding), which is adept at\ngenerating realistic synthetic tabular data where attributes are collections of\nsemantically and contextually coherent words. CTGKrEW is trained and evaluated\nusing a dataset from Upwork, a real-world freelancing platform. Comprehensive\nexperiments were conducted to analyze the variability, contextual similarity,\nfrequency distribution, and associativity of the generated data, along with\ntesting the framework's system feasibility. CTGKrEW also takes around 99\% less\nCPU time and has a 33\% smaller memory footprint than the conventional approach.\nFurthermore, we developed KrEW, a web application to facilitate the generation\nof realistic data containing skill-related information. This application,\navailable at https://riyasamanta.github.io/krew.html, is freely accessible to\nboth the general public and the research community.\n","authors":["Riya Samanta","Bidyut Saha","Soumya K. Ghosh","Sajal K. Das"],"pdf_url":"https://arxiv.org/pdf/2409.01628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01586v1","updated":"2024-09-03T03:59:22Z","published":"2024-09-03T03:59:22Z","title":"Booster: Tackling Harmful Fine-tuning for Large Language Models via\n Attenuating Harmful Perturbation","summary":" The harmful fine-tuning issue \citep{qi2023fine} poses serious safety concerns\nfor Large Language Models' fine-tuning-as-a-service. While existing defenses\n\citep{huang2024vaccine,rosati2024representation} have been proposed to\nmitigate the issue, their performance is still far from satisfactory,\nand the root cause of the problem has not been fully uncovered. For the first\ntime in the literature, we in this paper show that \textit{harmful\nperturbation} over the model weights should be the root cause of\nthe broken alignment caused by harmful fine-tuning. In order to attenuate the negative\nimpact of harmful perturbation, we propose an alignment-stage solution, dubbed\nBooster. Technically, along with the original alignment loss, we append a loss\nregularizer in the alignment stage's optimization. The regularizer ensures that\nthe model's harmful loss reduction before/after simulated harmful perturbation\nis attenuated, thereby mitigating the subsequent fine-tuning risk. Empirical\nresults show that Booster can effectively reduce the harmful score of the\nfine-tuned models while maintaining the performance of downstream tasks. Our\ncode is available at \url{https://github.com/git-disl/Booster}.\n","authors":["Tiansheng Huang","Sihao Hu","Fatih Ilhan","Selim Furkan Tekin","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01584v1","updated":"2024-09-03T03:42:56Z","published":"2024-09-03T03:42:56Z","title":"Towards Cross-Lingual Explanation of Artwork in Large-scale Vision\n Language Models","summary":" As the performance of Large-scale Vision Language Models (LVLMs) improves,\nthey are increasingly capable of responding in multiple languages, and there is\nan expectation that the demand for explanations generated by LVLMs will grow.\nHowever, pre-training of Vision Encoder and the integrated training of LLMs\nwith Vision Encoder are mainly conducted using English training data, leaving\nit uncertain whether LVLMs can fully realize their potential when\ngenerating explanations in languages other than English. 
In addition,\nmultilingual QA benchmarks that create datasets using machine translation have\ncultural differences and biases, which remain issues for their use as evaluation tasks.\nTo address these challenges, this study created an extended dataset in multiple\nlanguages without relying on machine translation. This dataset, which takes into\naccount nuances and country-specific phrases, was then used to evaluate the\nexplanation generation abilities of LVLMs. Furthermore, this study examined\nwhether Instruction-Tuning in resource-rich English improves performance in\nother languages. Our findings indicate that LVLMs perform worse in languages\nother than English compared to English. In addition, it was observed that LVLMs\nstruggle to effectively manage the knowledge learned from English data.\n","authors":["Shintaro Ozaki","Kazuki Hayashi","Yusuke Sakai","Hidetaka Kamigaito","Katsuhiko Hayashi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2409.01584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01579v1","updated":"2024-09-03T03:25:59Z","published":"2024-09-03T03:25:59Z","title":"AdaComp: Extractive Context Compression with Adaptive Predictor for\n Retrieval-Augmented Large Language Models","summary":" Retrieved documents containing noise will hinder RAG from detecting answer\nclues and make the inference process slow and expensive. Therefore, context\ncompression is necessary to enhance its accuracy and efficiency. Existing\ncontext compression methods use extractive or generative models to retain the\nmost query-relevant sentences or apply the information bottleneck theory to\npreserve sufficient information. However, these methods may face issues such as\nover-compression or high computational costs. We observe that the retriever\noften ranks relevant documents at the top, but the exact number of documents\nneeded to answer the query is uncertain due to the impact of query complexity\nand retrieval quality: complex queries like multi-hop questions may require\nretaining more documents than simpler queries, and a low-quality retrieval may\nneed to rely on more documents to generate accurate outputs. Therefore,\ndetermining the minimum number of required documents (compression rate) is\nstill a challenge for RAG. 
In this paper, we introduce AdaComp, a low-cost\nextractive context compression method that adaptively determines the\ncompression rate based on both query complexity and retrieval quality.\nSpecifically, we first annotate the minimum top-k documents necessary for the\nRAG system to answer the current query as the compression rate and then\nconstruct triplets of the query, retrieved documents, and its compression rate.\nThen, we use this triplet dataset to train a compression-rate predictor.\nExperiments on three QA datasets and one conversational Multi-doc QA dataset\nshow that AdaComp significantly reduces inference costs while maintaining\nperformance nearly identical to uncompressed models, achieving a balance\nbetween efficiency and performance.\n","authors":["Qianchi Zhang","Hainan Zhang","Liang Pang","Hongwei Zheng","Zhiming Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.01579v1.pdf","comment":"8 pages, 5 figures, code available at\n https://anonymous.4open.science/r/AdaComp-8C0C/"},{"id":"http://arxiv.org/abs/2409.01575v1","updated":"2024-09-03T03:16:03Z","published":"2024-09-03T03:16:03Z","title":"An Implementation of Werewolf Agent That does not Truly Trust LLMs","summary":" Werewolf is an incomplete-information game, which poses several challenges when\ncreating a computer agent as a player, given the lack of understanding of the\nsituation and the individuality of utterance (e.g., computer agents are not capable\nof characterful utterance or situational lying). We propose a werewolf agent\nthat solves some of those difficulties by combining a Large Language Model\n(LLM) and a rule-based algorithm. In particular, our agent uses a rule-based\nalgorithm to select an output either from an LLM or a template prepared\nbeforehand, based on the results of analyzing conversation history using an LLM.\nThis allows the agent to refute in specific situations, identify when to end the\nconversation, and behave with a persona. This approach mitigated conversational\ninconsistencies and facilitated logical utterance as a result. We also\nconducted a qualitative evaluation, which resulted in our agent being perceived\nas more human-like compared to an unmodified LLM. The agent is freely available\nto contribute to advancing research in the field of the Werewolf game.\n","authors":["Takehiro Sato","Shintaro Ozaki","Daisaku Yokoyama"],"pdf_url":"https://arxiv.org/pdf/2409.01575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01556v1","updated":"2024-09-03T02:50:04Z","published":"2024-09-03T02:50:04Z","title":"Benchmarking Cognitive Domains for LLMs: Insights from Taiwanese Hakka\n Culture","summary":" This study introduces a comprehensive benchmark designed to evaluate the\nperformance of large language models (LLMs) in understanding and processing\ncultural knowledge, with a specific focus on Hakka culture as a case study.\nLeveraging Bloom's Taxonomy, the study develops a multi-dimensional framework\nthat systematically assesses LLMs across six cognitive domains: Remembering,\nUnderstanding, Applying, Analyzing, Evaluating, and Creating. This benchmark\nextends beyond traditional single-dimensional evaluations by providing a deeper\nanalysis of LLMs' abilities to handle culturally specific content, ranging from\nbasic recall of facts to higher-order cognitive tasks such as creative\nsynthesis. 
Additionally, the study integrates Retrieval-Augmented Generation\n(RAG) technology to address the challenges of minority cultural knowledge\nrepresentation in LLMs, demonstrating how RAG enhances the models' performance\nby dynamically incorporating relevant external information. The results\nhighlight the effectiveness of RAG in improving accuracy across all cognitive\ndomains, particularly in tasks requiring precise retrieval and application of\ncultural knowledge. However, the findings also reveal the limitations of RAG in\ncreative tasks, underscoring the need for further optimization. This benchmark\nprovides a robust tool for evaluating and comparing LLMs in culturally diverse\ncontexts, offering valuable insights for future research and development in\nAI-driven cultural knowledge preservation and dissemination.\n","authors":["Chen-Chi Chang","Ching-Yuan Chen","Hung-Shin Lee","Chih-Cheng Lee"],"pdf_url":"https://arxiv.org/pdf/2409.01556v1.pdf","comment":"Submitted to O-COCOSDA 2024"},{"id":"http://arxiv.org/abs/2409.01552v1","updated":"2024-09-03T02:42:39Z","published":"2024-09-03T02:42:39Z","title":"Self-Instructed Derived Prompt Generation Meets In-Context Learning:\n Unlocking New Potential of Black-Box LLMs","summary":" Large language models (LLMs) have shown success in generating high-quality\nresponses. In order to achieve better alignment of LLMs with human\npreferences, various works have been proposed based on specific optimization processes,\nwhich, however, are not suitable for Black-Box LLMs like GPT-4, due to\ninaccessible parameters. In the case of Black-Box LLMs, their performance is highly\ndependent on the quality of the provided prompts. Existing methods to enhance\nresponse quality often involve a prompt refinement model, yet these approaches\npotentially suffer from semantic inconsistencies between the refined and\noriginal prompts, and typically overlook the relationship between them. To\naddress these challenges, we introduce a self-instructed in-context learning\nframework that empowers LLMs to deliver more effective responses by generating\nreliable derived prompts to construct informative contextual environments. Our\napproach incorporates a self-instructed reinforcement learning mechanism,\nenabling direct interaction with the response model during derived prompt\ngeneration for better alignment. We then formulate querying as an in-context\nlearning task, using responses from LLMs combined with the derived prompts to\nestablish a contextual demonstration for the original prompt. This strategy\nensures alignment with the original query, reduces discrepancies from refined\nprompts, and maximizes the LLMs' in-context learning capability. 
Extensive\nexperiments demonstrate that the proposed method not only generates more\nreliable derived prompts but also significantly enhances LLMs' ability to\ndeliver more effective responses, including Black-Box models such as GPT-4.\n","authors":["Zhuo Li","Yuhao Du","Jinpeng Hu","Xiang Wan","Anningzhe Gao"],"pdf_url":"https://arxiv.org/pdf/2409.01552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01548v1","updated":"2024-09-03T02:37:34Z","published":"2024-09-03T02:37:34Z","title":"VoxHakka: A Dialectally Diverse Multi-speaker Text-to-Speech System for\n Taiwanese Hakka","summary":" This paper introduces VoxHakka, a text-to-speech (TTS) system designed for\nTaiwanese Hakka, a critically under-resourced language spoken in Taiwan.\nLeveraging the YourTTS framework, VoxHakka achieves high naturalness and\naccuracy and low real-time factor in speech synthesis while supporting six\ndistinct Hakka dialects. This is achieved by training the model with\ndialect-specific data, allowing for the generation of speaker-aware Hakka\nspeech. To address the scarcity of publicly available Hakka speech corpora, we\nemployed a cost-effective approach utilizing a web scraping pipeline coupled\nwith automatic speech recognition (ASR)-based data cleaning techniques. This\nprocess ensured the acquisition of a high-quality, multi-speaker, multi-dialect\ndataset suitable for TTS training. Subjective listening tests conducted using\ncomparative mean opinion scores (CMOS) demonstrate that VoxHakka significantly\noutperforms existing publicly available Hakka TTS systems in terms of\npronunciation accuracy, tone correctness, and overall naturalness. This work\nrepresents a significant advancement in Hakka language technology and provides\na valuable resource for language preservation and revitalization efforts.\n","authors":["Li-Wei Chen","Hung-Shin Lee","Chen-Chi Chang"],"pdf_url":"https://arxiv.org/pdf/2409.01548v1.pdf","comment":"Submitted to O-COCOSDA 2024"},{"id":"http://arxiv.org/abs/2409.01545v1","updated":"2024-09-03T02:29:01Z","published":"2024-09-03T02:29:01Z","title":"Effective Noise-aware Data Simulation for Domain-adaptive Speech\n Enhancement Leveraging Dynamic Stochastic Perturbation","summary":" Cross-domain speech enhancement (SE) is often faced with severe challenges\ndue to the scarcity of noise and background information in an unseen target\ndomain, leading to a mismatch between training and test conditions. This study\nputs forward a novel data simulation method to address this issue, leveraging\nnoise-extractive techniques and generative adversarial networks (GANs) with\nonly limited target noisy speech data. Notably, our method employs a noise\nencoder to extract noise embeddings from target-domain data. These embeddings\naptly guide the generator to synthesize utterances acoustically fitted to the\ntarget domain while authentically preserving the phonetic content of the input\nclean speech. Furthermore, we introduce the notion of dynamic stochastic\nperturbation, which can inject controlled perturbations into the noise\nembeddings during inference, thereby enabling the model to generalize well to\nunseen noise conditions. 
Experiments on the VoiceBank-DEMAND benchmark dataset\ndemonstrate that our domain-adaptive SE method outperforms an existing strong\nbaseline based on data simulation.\n","authors":["Chien-Chun Wang","Li-Wei Chen","Hung-Shin Lee","Berlin Chen","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2409.01545v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.01539v1","updated":"2024-09-03T02:15:34Z","published":"2024-09-03T02:15:34Z","title":"It is Time to Develop an Auditing Framework to Promote Value Aware\n Chatbots","summary":" The launch of ChatGPT in November 2022 marked the beginning of a new era in\nAI, the availability of generative AI tools for everyone to use. ChatGPT and\nother similar chatbots boast a wide range of capabilities, from answering\nstudent homework questions to creating music and art. Given the large amounts\nof human data chatbots are built on, it is inevitable that they will inherit\nhuman errors and biases. These biases have the potential to inflict significant\nharm or increase inequity across different subpopulations. Because chatbots do not\nhave an inherent understanding of societal values, they may create new content\nthat is contrary to established norms. Examples of concerning generated content\ninclude child pornography, inaccurate facts, and discriminatory posts. In this\nposition paper, we argue that the speed of advancement of this technology\nrequires us, as computer and data scientists, to mobilize and develop a\nvalues-based auditing framework containing a community-established standard set\nof measurements to monitor the health of different chatbots and LLMs. To\nsupport our argument, we use a simple audit template to share the results of\nbasic audits we conduct that are focused on measuring potential bias in search\nengine style tasks, code generation, and story generation. We identify\nresponses from GPT 3.5 and GPT 4 that are both consistent and not consistent\nwith values derived from existing law. While the findings come as no surprise,\nthey do underscore the urgency of developing a robust auditing framework for\nopenly sharing results in a consistent way so that mitigation strategies can be\ndeveloped by the academic community, government agencies, and companies when\nour values are not being adhered to. We conclude this paper with\nrecommendations for value-based strategies for improving the technologies.\n","authors":["Yanchen Wang","Lisa Singh"],"pdf_url":"https://arxiv.org/pdf/2409.01539v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.07500"},{"id":"http://arxiv.org/abs/2409.01524v1","updated":"2024-09-03T01:40:21Z","published":"2024-09-03T01:40:21Z","title":"S$^3$c-Math: Spontaneous Step-level Self-correction Makes Large Language\n Models Better Mathematical Reasoners","summary":" Self-correction is a novel method that can stimulate the potential reasoning\nabilities of large language models (LLMs). It involves detecting and correcting\nerrors during the inference process when LLMs solve reasoning problems.\nHowever, recent works do not regard self-correction as a spontaneous and\nintrinsic capability of LLMs. Instead, such correction is achieved through\npost-hoc generation, external knowledge introduction, multi-model\ncollaboration, and similar techniques. In this paper, we propose a series of\nmathematical LLMs called S$^3$c-Math, which are able to perform Spontaneous\nStep-level Self-correction for Mathematical reasoning. 
This capability helps\nLLMs to recognize whether their ongoing inference tends to contain errors and\nsimultaneously correct these errors to produce a more reliable response. We\npropose a method that employs a step-level sampling approach to construct\nstep-wise self-correction data for achieving this ability. Additionally, we\nimplement a training strategy that uses the above constructed data to equip LLMs\nwith spontaneous step-level self-correction capacities. Our data and methods\nhave been demonstrated to be effective across various foundation LLMs,\nconsistently showing significant progress in evaluations on GSM8K, MATH, and\nother mathematical benchmarks. To the best of our knowledge, we are the first\nto introduce the spontaneous step-level self-correction ability of LLMs in\nmathematical reasoning.\n","authors":["Yuchen Yan","Jin Jiang","Yang Liu","Yixin Cao","Xin Xu","Mengdi zhang","Xunliang Cai","Jian Shao"],"pdf_url":"https://arxiv.org/pdf/2409.01524v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.17422v2","updated":"2024-09-03T16:00:58Z","published":"2024-08-30T17:12:14Z","title":"Open-vocabulary Temporal Action Localization using VLMs","summary":" Video action localization aims to find timings of a specific action from a\nlong video. Although existing learning-based approaches have been successful,\nthey require annotating videos, which comes with a considerable labor cost. This\npaper proposes a learning-free, open-vocabulary approach based on emerging\noff-the-shelf vision-language models (VLM). The challenge stems from the fact\nthat VLMs are neither designed to process long videos nor tailored for finding\nactions. We overcome these problems by extending an iterative visual prompting\ntechnique. Specifically, we sample video frames into a concatenated image with\nframe index labels, making a VLM guess a frame that is considered to be closest\nto the start/end of the action. Iterating this process by narrowing a sampling\ntime window results in finding a specific frame of start and end of an action.\nWe demonstrate that this sampling technique yields reasonable results,\nillustrating a practical extension of VLMs for understanding videos. A sample\ncode is available at\nhttps://microsoft.github.io/VLM-Video-Action-Localization/.\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2408.17422v2.pdf","comment":"7 pages, 5 figures, 4 tables. Last updated on September 3rd, 2024"},{"id":"http://arxiv.org/abs/2408.14013v2","updated":"2024-09-03T15:34:09Z","published":"2024-08-26T04:36:10Z","title":"A Multiscale Gradient Fusion Method for Edge Detection in Color Images\n Utilizing the CBM3D Filter","summary":" In this paper, a color edge detection strategy based on collaborative\nfiltering combined with multiscale gradient fusion is proposed. The\nblock-matching and 3D (BM3D) filter is used to enhance the sparse\nrepresentation in the transform domain and achieve the effect of denoising,\nwhereas the multiscale gradient fusion makes up for the loss of\ndetails in single-scale edge detection and improves the edge detection\nresolution and quality. First, the RGB images in the dataset are converted to\nXYZ color space images through mathematical operations. Second, the colored\nblock-matching and 3D (CBM3D) filter is applied to the sparse images to\nremove noise interference. 
Then, the vector gradients of the color image and\nthe anisotropic Gaussian directional derivative of the two scale parameters are\ncalculated and averaged pixel-by-pixel to obtain a new edge strength map.\nFinally, the edge features are enhanced by image normalization and non-maximum\nsuppression technology, and on that basis, the edge contour is obtained by\ndouble threshold selection and a new morphological refinement method. Through\nan experimental analysis of the edge detection dataset, the method proposed has\ngood noise robustness and high edge quality, which is better than the Color\nSobel, Color Canny, SE and Color AGDD as shown by the PR curve, AUC, PSNR, MSE,\nand FOM indicators.\n","authors":["Zhuoyue Wang","Yiyi Tao","Danqing Ma","Jiajing Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14013v2.pdf","comment":"1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2408.16395v2","updated":"2024-09-03T14:57:18Z","published":"2024-08-29T09:57:55Z","title":"IBO: Inpainting-Based Occlusion to Enhance Explainable Artificial\n Intelligence Evaluation in Histopathology","summary":" Histopathological image analysis is crucial for accurate cancer diagnosis and\ntreatment planning. While deep learning models, especially convolutional neural\nnetworks, have advanced this field, their \"black-box\" nature raises concerns\nabout interpretability and trustworthiness. Explainable Artificial Intelligence\n(XAI) techniques aim to address these concerns, but evaluating their\neffectiveness remains challenging. A significant issue with current\nocclusion-based XAI methods is that they often generate Out-of-Distribution\n(OoD) samples, leading to inaccurate evaluations. In this paper, we introduce\nInpainting-Based Occlusion (IBO), a novel occlusion strategy that utilizes a\nDenoising Diffusion Probabilistic Model to inpaint occluded regions in\nhistopathological images. By replacing cancerous areas with realistic,\nnon-cancerous tissue, IBO minimizes OoD artifacts and preserves data integrity.\nWe evaluate our method on the CAMELYON16 dataset through two phases: first, by\nassessing perceptual similarity using the Learned Perceptual Image Patch\nSimilarity (LPIPS) metric, and second, by quantifying the impact on model\npredictions through Area Under the Curve (AUC) analysis. Our results\ndemonstrate that IBO significantly improves perceptual fidelity, achieving\nnearly twice the improvement in LPIPS scores compared to the best existing\nocclusion strategy. Additionally, IBO increased the precision of XAI\nperformance prediction from 42% to 71% compared to traditional methods. These\nresults demonstrate IBO's potential to provide more reliable evaluations of XAI\ntechniques, benefiting histopathology and other applications. The source code\nfor this study is available at https://github.com/a-fsh-r/IBO.\n","authors":["Pardis Afshar","Sajjad Hashembeiki","Pouya Khani","Emad Fatemizadeh","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.16395v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.19477v2","updated":"2024-09-03T14:24:49Z","published":"2024-02-29T18:59:31Z","title":"Learning a Generalized Physical Face Model From Data","summary":" Physically-based simulation is a powerful approach for 3D facial animation as\nthe resulting deformations are governed by physical constraints, allowing to\neasily resolve self-collisions, respond to external forces and perform\nrealistic anatomy edits. 
Today's methods are data-driven, where the actuations\nfor finite elements are inferred from captured skin geometry. Unfortunately,\nthese approaches have not been widely adopted due to the complexity of\ninitializing the material space and learning the deformation model for each\ncharacter separately, which often requires a skilled artist followed by lengthy\nnetwork training. In this work, we aim to make physics-based facial animation\nmore accessible by proposing a generalized physical face model that we learn\nfrom a large 3D face dataset. Once trained, our model can be quickly fit to any\nunseen identity and produce a ready-to-animate physical face model\nautomatically. Fitting is as easy as providing a single 3D face scan, or even a\nsingle face image. After fitting, we offer intuitive animation controls, as\nwell as the ability to retarget animations across characters. All the while,\nthe resulting animations allow for physical effects like collision avoidance,\ngravity, paralysis, bone reshaping and more.\n","authors":["Lingchen Yang","Gaspard Zoss","Prashanth Chandran","Markus Gross","Barbara Solenthaler","Eftychios Sifakis","Derek Bradley"],"pdf_url":"https://arxiv.org/pdf/2402.19477v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08557v3","updated":"2024-09-03T13:40:28Z","published":"2024-03-13T14:08:45Z","title":"$OC^4-ReID$: Occluded Cloth-Changing Person Re-Identification","summary":" The study of Cloth-Changing Person Re-identification (CC-ReID) focuses on\nretrieving specific pedestrians when their clothing has changed, typically\nunder the assumption that the entire pedestrian images are visible. Pedestrian\nimages in real-world scenarios, however, are often partially obscured by\nobstacles, presenting a significant challenge to existing CC-ReID systems. In\nthis paper, we introduce a more challenging task termed Occluded Cloth-Changing\nPerson Re-Identification ($OC^4-ReID$), which simultaneously addresses two\nchallenges of clothing changes and occlusion. Concretely, we construct two new\ndatasets, Occ-LTCC and Occ-PRCC, based on original CC-ReID datasets to include\nrandom occlusions of key pedestrians components (e.g., head, torso). Moreover,\na novel benchmark is proposed for $OC^4-ReID$ incorporating a Train-Test Micro\nGranularity Screening ($T^2MGS$) module to mitigate the influence of occlusion\nand proposing a Part-Robust Triplet (PRT) loss for partial features learning.\nComprehensive experiments on the proposed datasets, as well as on two CC-ReID\nbenchmark datasets demonstrate the superior performance of proposed method\nagainst other state-of-the-art methods. 
The codes and datasets are available\nat: https://github.com/1024AILab/OC4-ReID.\n","authors":["Zhihao Chen","Yiyuan Ge","Ziyang Wang","Jiaju Kang","Mingya Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08557v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12587v2","updated":"2024-09-03T13:36:22Z","published":"2024-06-18T13:18:32Z","title":"Restorer: Removing Multi-Degradation with All-Axis Attention and Prompt\n Guidance","summary":" There are many excellent solutions in image restoration. However, most methods\nrequire training separate models to restore images with different types of\ndegradation. Although existing all-in-one models effectively address multiple\ntypes of degradation simultaneously, their performance in real-world scenarios\nis still constrained by the task confusion problem. In this work, we attempt to\naddress this issue by introducing \textbf{Restorer}, a novel Transformer-based\nall-in-one image restoration model. To effectively address the complex\ndegradation present in real-world images, we propose All-Axis Attention (AAA),\na mechanism that simultaneously models long-range dependencies across both\nspatial and channel dimensions, capturing potential correlations along all\naxes. Additionally, we introduce textual prompts in Restorer to incorporate\nexplicit task priors, enabling the removal of specific degradation types based\non user instructions. By iterating over these prompts, Restorer can handle\ncomposite degradation in real-world scenarios without requiring additional\ntraining. Based on these designs, Restorer with one set of parameters\ndemonstrates state-of-the-art performance in multiple image restoration tasks\ncompared to existing all-in-one and even single-task models. Additionally,\nRestorer is efficient during inference, suggesting its potential in real-world\napplications.\n","authors":["Jiawei Mao","Juncheng Wu","Yuyin Zhou","Xuesong Yin","Yuanqi Chang"],"pdf_url":"https://arxiv.org/pdf/2406.12587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17147v4","updated":"2024-09-03T12:55:47Z","published":"2024-04-26T04:34:45Z","title":"On the Federated Learning Framework for Cooperative Perception","summary":" Cooperative perception is essential to enhance the efficiency and safety of\nfuture transportation systems, requiring extensive data sharing among vehicles\non the road, which raises significant privacy concerns. Federated learning\noffers a promising solution by enabling data privacy-preserving collaborative\nenhancements in perception, decision-making, and planning among connected and\nautonomous vehicles (CAVs). However, federated learning is impeded by\nsignificant challenges arising from data heterogeneity across diverse clients,\npotentially diminishing model accuracy and prolonging convergence periods. This\nstudy introduces a specialized federated learning framework for CP, termed the\nfederated dynamic weighted aggregation (FedDWA) algorithm, facilitated by a\ndynamic adjusting loss (DALoss) function. This framework employs dynamic client\nweighting to direct model convergence and integrates a novel loss function that\nutilizes Kullback-Leibler divergence (KLD) to counteract the detrimental\neffects of non-independently and identically distributed (Non-IID) and\nunbalanced data. Utilizing the BEV transformer as the primary model, our\nrigorous testing on the OpenV2V dataset, augmented with FedBEVT data,\ndemonstrates significant improvements in the average intersection over union\n(IoU). 
These results highlight the substantial potential of our federated\nlearning framework to address data heterogeneity challenges in CP, thereby\nenhancing the accuracy of environmental perception models and facilitating more\nrobust and efficient collaborative learning solutions in the transportation\nsector.\n","authors":["Zhenrong Zhang","Jianan Liu","Xi Zhou","Tao Huang","Qing-Long Han","Jingxin Liu","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17147v4.pdf","comment":"accepted by IEEE RA-L"},{"id":"http://arxiv.org/abs/2407.09510v3","updated":"2024-09-03T11:54:52Z","published":"2024-06-17T11:43:38Z","title":"3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods","summary":" We present a work-in-progress survey on 3D Gaussian Splatting compression\nmethods, focusing on their statistical performance across various benchmarks.\nThis survey aims to facilitate comparability by summarizing key statistics of\ndifferent compression approaches in a tabulated format. The datasets evaluated\ninclude TanksAndTemples, MipNeRF360, DeepBlending, and SyntheticNeRF. For each\nmethod, we report the Peak Signal-to-Noise Ratio (PSNR), Structural Similarity\nIndex (SSIM), Learned Perceptual Image Patch Similarity (LPIPS), and the\nresultant size in megabytes (MB), as provided by the respective authors. This\nis an ongoing, open project, and we invite contributions from the research\ncommunity as GitHub issues or pull requests. Please visit\nhttp://w-m.github.io/3dgs-compression-survey/ for more information and a\nsortable version of the table.\n","authors":["Milena T. Bagdasarian","Paul Knoll","Florian Barthel","Anna Hilsmann","Peter Eisert","Wieland Morgenstern"],"pdf_url":"https://arxiv.org/pdf/2407.09510v3.pdf","comment":"3D Gaussian Splatting compression survey; 3DGS compression; new\n approaches added"},{"id":"http://arxiv.org/abs/2407.07805v3","updated":"2024-09-03T10:46:10Z","published":"2024-07-10T16:25:26Z","title":"SUMix: Mixup with Semantic and Uncertain Information","summary":" Mixup data augmentation approaches have been applied for various tasks of\ndeep learning to improve the generalization ability of deep neural networks.\nSome existing approaches, e.g., CutMix and SaliencyMix, randomly replace a patch in\none image with patches from another to generate the mixed image. Similarly, the\ncorresponding labels are linearly combined by a fixed ratio $\lambda$. The\nobjects in two images may be overlapped during the mixing process, so some\nsemantic information is corrupted in the mixed samples. In this case, the mixed\nimage does not match the mixed label information. Besides, such a label may\nmislead the deep learning model training, which results in poor performance. To\nsolve this problem, we propose a novel approach named SUMix to learn the\nmixing ratio as well as the uncertainty for the mixed samples during the\ntraining process. First, we design a learnable similarity function to compute\nan accurate mix ratio. Second, an approach is investigated as a regularization\nterm to model the uncertainty of the mixed samples. We conduct experiments on\nfive image benchmarks, and extensive experimental results imply that our method\nis capable of improving the performance of classifiers with different\ncutting-based mixup approaches. The source code is available at\nhttps://github.com/JinXins/SUMix.\n","authors":["Huafeng Qin","Xin Jin","Hongyu Zhu","Hongchao Liao","Mounîm A. 
El-Yacoubi","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2407.07805v3.pdf","comment":"Accepted by ECCV2024 [Camera Ready] (19 pages, 7 figures) with the\n source code at https://github.com/JinXins/SUMix"},{"id":"http://arxiv.org/abs/2404.12501v3","updated":"2024-09-03T10:12:34Z","published":"2024-04-18T20:43:33Z","title":"SPIdepth: Strengthened Pose Information for Self-supervised Monocular\n Depth Estimation","summary":" Self-supervised monocular depth estimation has garnered considerable\nattention for its applications in autonomous driving and robotics. While recent\nmethods have made strides in leveraging techniques like the Self Query Layer\n(SQL) to infer depth from motion, they often overlook the potential of\nstrengthening pose information. In this paper, we introduce SPIdepth, a novel\napproach that prioritizes enhancing the pose network for improved depth\nestimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the\nimportance of pose information in capturing fine-grained scene structures. By\nenhancing the pose network's capabilities, SPIdepth achieves remarkable\nadvancements in scene understanding and depth estimation. Experimental results\non benchmark datasets such as KITTI, Cityscapes, and Make3D showcase SPIdepth's\nstate-of-the-art performance, surpassing previous methods by significant\nmargins. Specifically, SPIdepth tops the self-supervised KITTI benchmark.\nAdditionally, SPIdepth achieves the lowest AbsRel (0.029), SqRel (0.069), and\nRMSE (1.394) on KITTI, establishing new state-of-the-art results. On\nCityscapes, SPIdepth shows improvements over SQLdepth of 21.7% in AbsRel, 36.8%\nin SqRel, and 16.5% in RMSE, even without using motion masks. On Make3D,\nSPIdepth in zero-shot outperforms all other models. Remarkably, SPIdepth\nachieves these results using only a single image for inference, surpassing even\nmethods that utilize video sequences for inference, thus demonstrating its\nefficacy and efficiency in real-world applications. Our approach represents a\nsignificant leap forward in self-supervised monocular depth estimation,\nunderscoring the importance of strengthening pose information for advancing\nscene understanding in real-world applications. The code and pre-trained models\nare publicly available at https://github.com/Lavreniuk/SPIdepth.\n","authors":["Mykola Lavreniuk"],"pdf_url":"https://arxiv.org/pdf/2404.12501v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15656v2","updated":"2024-09-03T09:45:59Z","published":"2024-08-28T09:17:25Z","title":"Realigned Softmax Warping for Deep Metric Learning","summary":" Deep Metric Learning (DML) loss functions traditionally aim to control the\nforces of separability and compactness within an embedding space so that the\nsame class data points are pulled together and different class ones are pushed\napart. Within the context of DML, a softmax operation will typically normalize\ndistances into a probability for optimization, thus coupling all the push/pull\nforces together. This paper proposes a potential new class of loss functions\nthat operate within a euclidean domain and aim to take full advantage of the\ncoupled forces governing embedding space formation under a softmax. These\nforces of compactness and separability can be boosted or mitigated within\ncontrolled locations at will by using a warping function. 
In this work, we\nprovide a simple example of a warping function and use it to achieve\ncompetitive, state-of-the-art results on various metric learning benchmarks.\n","authors":["Michael G. DeMoor","John J. Prevost"],"pdf_url":"https://arxiv.org/pdf/2408.15656v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.19430v2","updated":"2024-09-03T09:42:24Z","published":"2024-07-28T08:43:16Z","title":"Progressive Domain Adaptation for Thermal Infrared Object Tracking","summary":" Due to the lack of large-scale labeled Thermal InfraRed (TIR) training\ndatasets, most existing TIR trackers are trained directly on RGB datasets.\nHowever, tracking methods trained on RGB datasets suffer a significant drop-off\nin TIR data due to the domain shift issue. To this end, in this work, we\npropose a Progressive Domain Adaptation framework for TIR Tracking (PDAT),\nwhich transfers useful knowledge learned from RGB tracking to TIR tracking. The\nframework makes full use of large-scale labeled RGB datasets without requiring\ntime-consuming and labor-intensive labeling of large-scale TIR data.\nSpecifically, we first propose an adversarial-based global domain adaptation\nmodule to coarsely reduce the domain gap at the feature level. Second, we design a\nclustering-based subdomain adaptation method to further align the feature\ndistributions of the RGB and TIR datasets finely. These two domain adaptation\nmodules gradually eliminate the discrepancy between the two domains, and thus\nlearn domain-invariant fine-grained features through progressive training.\nAdditionally, we collect a large-scale TIR dataset with over 1.48 million\nunlabeled TIR images for training the proposed domain adaptation framework.\nExperimental results on five TIR tracking benchmarks show that the proposed\nmethod gains nearly 6% in success rate, demonstrating its effectiveness.\n","authors":["Qiao Li","Kanlun Tan","Qiao Liu","Di Yuan","Xin Li","Yunpeng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19430v2.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.08174v4","updated":"2024-09-03T09:16:03Z","published":"2024-01-16T07:33:22Z","title":"An Efficient Instance Segmentation Framework Using Segmentation\n Foundation Models with Oriented Bounding Box Prompts","summary":" Instance segmentation in unmanned aerial vehicle measurement is a\nlong-standing challenge. Since horizontal bounding boxes introduce many\ninterference objects, oriented bounding boxes (OBBs) are usually used for\ninstance identification. However, based on the ``segmentation within bounding box''\nparadigm, current instance segmentation methods using OBBs are overly dependent\non bounding box detection performance. To tackle this, this paper proposes\nOBSeg, an efficient instance segmentation framework using OBBs. OBSeg is based\non box prompt-based segmentation foundation models (BSMs), e.g., Segment\nAnything Model. Specifically, OBSeg first detects OBBs to distinguish instances\nand provide coarse localization information. Then, it predicts OBB\nprompt-related masks for fine segmentation. Since OBBs only serve as prompts,\nOBSeg alleviates the over-dependence on bounding box detection performance of\ncurrent instance segmentation methods using OBBs. In addition, to enable BSMs\nto handle OBB prompts, we propose a novel OBB prompt encoder. 
To make OBSeg\nmore lightweight and further improve the performance of lightweight distilled\nBSMs, a Gaussian smoothing-based knowledge distillation method is introduced.\nExperiments demonstrate that OBSeg outperforms current instance segmentation\nmethods on multiple public datasets. The code is available at\nhttps://github.com/zhen6618/OBBInstanceSegmentation.\n","authors":["Zhen Zhou","Junfeng Fan","Yunkai Ma","Sihan Zhao","Fengshui Jing","Min Tan"],"pdf_url":"https://arxiv.org/pdf/2401.08174v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02092v2","updated":"2024-09-03T08:25:11Z","published":"2023-06-03T11:50:44Z","title":"Collaborative Group: Composed Image Retrieval via Consensus Learning\n from Noisy Annotations","summary":" Composed image retrieval extends content-based image retrieval systems by\nenabling users to search using reference images and captions that describe\ntheir intention. Despite great progress in developing image-text compositors to\nextract discriminative visual-linguistic features, we identify a hitherto\noverlooked issue, triplet ambiguity, which impedes robust feature extraction.\nTriplet ambiguity refers to a type of semantic ambiguity that arises between\nthe reference image, the relative caption, and the target image. It is mainly\ndue to the limited representation of the annotated text, resulting in many\nnoisy triplets where multiple visually dissimilar candidate images can be\nmatched to an identical reference pair (i.e., a reference image + a relative\ncaption). To address this challenge, we propose the Consensus Network\n(Css-Net), inspired by the psychological concept that groups outperform\nindividuals. Css-Net comprises two core components: (1) a consensus module with\nfour diverse compositors, each generating distinct image-text embeddings,\nfostering complementary feature extraction and mitigating dependence on any\nsingle, potentially biased compositor; (2) a Kullback-Leibler divergence loss\nthat encourages learning of inter-compositor interactions to promote consensual\noutputs. During evaluation, the decisions of the four compositors are combined\nthrough a weighting scheme, enhancing overall agreement. On benchmark datasets,\nparticularly FashionIQ, Css-Net demonstrates marked improvements. Notably, it\nachieves significant recall gains, with a 2.77% increase in R@10 and 6.67%\nboost in R@50, underscoring its competitiveness in addressing the fundamental\nlimitations of existing methods.\n","authors":["Xu Zhang","Zhedong Zheng","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2306.02092v2.pdf","comment":"Accepted by Knowledge-Based Systems (KBS)"},{"id":"http://arxiv.org/abs/2311.18825v2","updated":"2024-09-03T08:16:32Z","published":"2023-11-30T18:58:51Z","title":"CAST: Cross-Attention in Space and Time for Video Action Recognition","summary":" Recognizing human actions in videos requires spatial and temporal\nunderstanding. Most existing action recognition models lack a balanced\nspatio-temporal understanding of videos. In this work, we propose a novel\ntwo-stream architecture, called Cross-Attention in Space and Time (CAST), that\nachieves a balanced spatio-temporal understanding of videos using only RGB\ninput. Our proposed bottleneck cross-attention mechanism enables the spatial\nand temporal expert models to exchange information and make synergistic\npredictions, leading to improved performance. 
We validate the proposed method\nwith extensive experiments on public benchmarks with different characteristics:\nEPIC-KITCHENS-100, Something-Something-V2, and Kinetics-400. Our method\nconsistently shows favorable performance across these datasets, while the\nperformance of existing methods fluctuates depending on the dataset\ncharacteristics.\n","authors":["Dongho Lee","Jongseo Lee","Jinwoo Choi"],"pdf_url":"https://arxiv.org/pdf/2311.18825v2.pdf","comment":"This is an accepted NeurIPS 2023. Project webpage is available at\n https://jong980812.github.io/CAST.github.io/ Code is available at\n https://github.com/KHU-VLL/CAST"},{"id":"http://arxiv.org/abs/2312.05449v2","updated":"2024-09-03T08:01:47Z","published":"2023-12-09T03:33:14Z","title":"TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot\n Image Classification","summary":" Few-shot image classification aims to classify images from unseen novel\nclasses with few samples. Recent works demonstrate that deep local descriptors\nexhibit enhanced representational capabilities compared to image-level\nfeatures. However, most existing methods solely rely on either employing all\nlocal descriptors or directly utilizing partial descriptors, potentially\nresulting in the loss of crucial information. Moreover, these methods primarily\nemphasize the selection of query descriptors while overlooking support\ndescriptors. In this paper, we propose a novel Task-Aware Adaptive Local\nDescriptors Selection Network (TALDS-Net), which exhibits the capacity for\nadaptive selection of task-aware support descriptors and query descriptors.\nSpecifically, we compare the similarity of each local support descriptor with\nother local support descriptors to obtain the optimal support descriptor subset\nand then compare the query descriptors with the optimal support subset to\nobtain discriminative query descriptors. Extensive experiments demonstrate that\nour TALDS-Net outperforms state-of-the-art methods on both general and\nfine-grained datasets.\n","authors":["Qian Qiao","Yu Xie","Ziyin Zeng","Fanzhang Li"],"pdf_url":"https://arxiv.org/pdf/2312.05449v2.pdf","comment":"4 pages, 1 figures, is accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2307.10593v2","updated":"2024-09-03T07:50:05Z","published":"2023-07-20T05:15:03Z","title":"Asynchronous Blob Tracker for Event Cameras","summary":" Event-based cameras are popular for tracking fast-moving objects due to their\nhigh temporal resolution, low latency, and high dynamic range. In this paper,\nwe propose a novel algorithm for tracking event blobs using raw events\nasynchronously in real time. We introduce the concept of an event blob as a\nspatio-temporal likelihood of event occurrence where the conditional spatial\nlikelihood is blob-like. Many real-world objects such as car headlights or any\nquickly moving foreground objects generate event blob data. The proposed\nalgorithm uses a nearest neighbour classifier with a dynamic threshold criteria\nfor data association coupled with an extended Kalman filter to track the event\nblob state. Our algorithm achieves highly accurate blob tracking, velocity\nestimation, and shape estimation even under challenging lighting conditions and\nhigh-speed motions (> 11000 pixels/s). 
The microsecond time resolution achieved\nmeans that the filter output can be used to derive secondary information such\nas time-to-contact or range estimation, that will enable applications to\nreal-world problems such as collision avoidance in autonomous driving.\n","authors":["Ziwei Wang","Timothy Molloy","Pieter van Goor","Robert Mahony"],"pdf_url":"https://arxiv.org/pdf/2307.10593v2.pdf","comment":"18 pages, 16 figures, Manuscript was accepted on August 7, 2024, by\n IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2405.09777v2","updated":"2024-09-03T07:46:17Z","published":"2024-05-16T02:46:19Z","title":"Rethinking Barely-Supervised Volumetric Medical Image Segmentation from\n an Unsupervised Domain Adaptation Perspective","summary":" This paper investigates an extremely challenging problem: barely-supervised\nvolumetric medical image segmentation (BSS). A BSS training dataset consists of\ntwo parts: 1) a barely-annotated labeled set, where each labeled image contains\nonly a single-slice annotation, and 2) an unlabeled set comprising numerous\nunlabeled volumetric images. State-of-the-art BSS methods employ a\nregistration-based paradigm, which uses inter-slice image registration to\npropagate single-slice annotations into volumetric pseudo labels, constructing\na completely annotated labeled set, to which a semi-supervised segmentation\nscheme can be applied. However, the paradigm has a critical limitation: the\npseudo-labels generated by image registration are unreliable and noisy.\nMotivated by this, we propose a new perspective: instead of solving BSS within\na semi-supervised learning scheme, this work formulates BSS as an unsupervised\ndomain adaptation problem. To this end, we propose a novel BSS framework,\n\\textbf{B}arely-supervised learning \\textbf{via} unsupervised domain\n\\textbf{A}daptation (BvA), as an alternative to the dominant registration\nparadigm. Specifically, we first design a novel noise-free labeled data\nconstruction algorithm (NFC) for slice-to-volume labeled data synthesis. Then,\nwe introduce a frequency and spatial Mix-Up strategy (FSX) to mitigate the\ndomain shifts. Extensive experiments demonstrate that our method provides a\npromising alternative for BSS. Remarkably, the proposed method, trained on the\nleft atrial segmentation dataset with \\textbf{only one} barely-labeled image,\nachieves a Dice score of 81.20%, outperforming the state-of-the-art by 61.71%.\nThe code is available at\n\\href{https://github.com/Senyh/BvA}{\\textit{\\texttt{https://github.com/Senyh/BvA}}}.\n","authors":["Zhiqiang Shen","Peng Cao","Junming Su","Jinzhu Yang","Osmar R. Zaiane"],"pdf_url":"https://arxiv.org/pdf/2405.09777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08822v2","updated":"2024-09-03T07:42:44Z","published":"2023-12-14T11:11:50Z","title":"Planning and Rendering: Towards Product Poster Generation with Diffusion\n Models","summary":" Product poster generation significantly optimizes design efficiency and\nreduces production costs. Prevailing methods predominantly rely on\nimage-inpainting methods to generate clean background images for given\nproducts. Subsequently, poster layout generation methods are employed to\nproduce corresponding layout results. However, the background images may not be\nsuitable for accommodating textual content due to their complexity, and the\nfixed location of products limits the diversity of layout results. 
To alleviate\nthese issues, we propose a novel product poster generation framework based on\ndiffusion models named P\\&R. The P\\&R draws inspiration from the workflow of\ndesigners in creating posters, which consists of two stages: Planning and\nRendering. At the planning stage, we propose a PlanNet to generate the layout\nof the product and other visual components considering both the appearance\nfeatures of the product and semantic features of the text, which improves the\ndiversity and rationality of the layouts. At the rendering stage, we propose a\nRenderNet to generate the background for the product while considering the\ngenerated layout, where a spatial fusion module is introduced to fuse the\nlayout of different visual components. To foster the advancement of this field,\nwe propose the first product poster generation dataset PPG30k, comprising 30k\nexquisite product poster images along with comprehensive image and text\nannotations. Our method outperforms the state-of-the-art product poster\ngeneration methods on PPG30k. The PPG30k will be released soon.\n","authors":["Zhaochen Li","Fengheng Li","Wei Feng","Honghe Zhu","Yaoyu Li","Zheng Zhang","Jingjing Lv","Junjie Shen","Zhangang Lin","Jingping Shao","Zhenglu Yang"],"pdf_url":"https://arxiv.org/pdf/2312.08822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11835v3","updated":"2024-09-03T07:39:39Z","published":"2024-01-22T10:52:02Z","title":"Unveiling the Human-like Similarities of Automatic Facial Expression\n Recognition: An Empirical Exploration through Explainable AI","summary":" Facial expression recognition is vital for human behavior analysis, and deep\nlearning has enabled models that can outperform humans. However, it is unclear\nhow closely they mimic human processing. This study aims to explore the\nsimilarity between deep neural networks and human perception by comparing\ntwelve different networks, including both general object classifiers and\nFER-specific models. We employ an innovative global explainable AI method to\ngenerate heatmaps, revealing crucial facial regions for the twelve networks\ntrained on six facial expressions. We assess these results both quantitatively\nand qualitatively, comparing them to ground truth masks based on Friesen and\nEkman's description and among them. We use Intersection over Union (IoU) and\nnormalized correlation coefficients for comparisons. We generate 72 heatmaps to\nhighlight critical regions for each expression and architecture. Qualitatively,\nmodels with pre-trained weights show more similarity in heatmaps compared to\nthose without pre-training. Specifically, eye and nose areas influence certain\nfacial expressions, while the mouth is consistently important across all models\nand expressions. Quantitatively, we find low average IoU values (avg. 0.2702)\nacross all expressions and architectures. The best-performing architecture\naverages 0.3269, while the worst-performing one averages 0.2066. Dendrograms,\nbuilt with the normalized correlation coefficient, reveal two main clusters for\nmost expressions: models with pre-training and models without pre-training.\nFindings suggest limited alignment between human and AI facial expression\nrecognition, with network architectures influencing the similarity, as similar\narchitectures prioritize similar facial regions.\n","authors":["F. Xavier Gaya-Morey","Silvia Ramis-Guarinos","Cristina Manresa-Yee","Jose M. 
Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11835v3.pdf","comment":"Multimed Tools Appl (2024)"},{"id":"http://arxiv.org/abs/2402.17296v3","updated":"2024-09-03T07:38:14Z","published":"2024-02-27T08:19:51Z","title":"Learning Exposure Correction in Dynamic Scenes","summary":" Exposure correction aims to enhance visual data suffering from improper\nexposures, which can greatly improve satisfactory visual effects. However,\nprevious methods mainly focus on the image modality, and the video counterpart\nis less explored in the literature. Directly applying prior image-based methods\nto videos results in temporal incoherence with low visual quality. Through\nthorough investigation, we find that the development of relevant communities is\nlimited by the absence of a benchmark dataset. Therefore, in this paper, we\nconstruct the first real-world paired video dataset, including both\nunderexposure and overexposure dynamic scenes. To achieve spatial alignment, we\nutilize two DSLR cameras and a beam splitter to simultaneously capture improper\nand normal exposure videos. Additionally, we propose an end-to-end video\nexposure correction network, in which a dual-stream module is designed to deal\nwith both underexposure and overexposure factors, enhancing the illumination\nbased on Retinex theory. The extensive experiments based on various metrics and\nuser studies demonstrate the significance of our dataset and the effectiveness\nof our method. The code and dataset are available at\nhttps://github.com/kravrolens/VECNet.\n","authors":["Jin Liu","Bo Wang","Chuanming Wang","Huiyuan Fu","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2402.17296v3.pdf","comment":"To be published at ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2401.11790v3","updated":"2024-09-03T07:34:44Z","published":"2024-01-22T09:40:52Z","title":"Deep Learning for Computer Vision based Activity Recognition and Fall\n Detection of the Elderly: a Systematic Review","summary":" As the percentage of elderly people in developed countries increases\nworldwide, the healthcare of this collective is a worrying matter, especially\nif it includes the preservation of their autonomy. In this direction, many\nstudies are being published on Ambient Assisted Living (AAL) systems, which\nhelp to reduce the preoccupations raised by the independent living of the\nelderly. In this study, a systematic review of the literature is presented on\nfall detection and Human Activity Recognition (HAR) for the elderly, as the two\nmain tasks to solve to guarantee the safety of elderly people living alone. To\naddress the current tendency to perform these two tasks, the review focuses on\nthe use of Deep Learning (DL) based approaches on computer vision data. In\naddition, different collections of data like DL models, datasets or hardware\n(e.g. depth or thermal cameras) are gathered from the reviewed studies and\nprovided for reference in future studies. Strengths and weaknesses of existing\napproaches are also discussed and, based on them, our recommendations for\nfuture works are provided.\n","authors":["F. Xavier Gaya-Morey","Cristina Manresa-Yee","Jose M. 
Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11790v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00997v3","updated":"2024-09-03T07:25:51Z","published":"2023-07-03T13:21:58Z","title":"RefSAM: Efficiently Adapting Segmenting Anything Model for Referring\n Video Object Segmentation","summary":" The Segment Anything Model (SAM) has gained significant attention for its\nimpressive performance in image segmentation. However, it lacks proficiency in\nreferring video object segmentation (RVOS) due to the need for precise\nuser-interactive prompts and a limited understanding of different modalities,\nsuch as language and vision. This paper presents the RefSAM model, which\nexplores the potential of SAM for RVOS by incorporating multi-view information\nfrom diverse modalities and successive frames at different timestamps in an\nonline manner. Our proposed approach adapts the original SAM model to enhance\ncross-modality learning by employing a lightweight Cross-Modal MLP that\nprojects the text embedding of the referring expression into sparse and dense\nembeddings, serving as user-interactive prompts. Additionally, we have\nintroduced the hierarchical dense attention module to fuse hierarchical visual\nsemantic information with sparse embeddings to obtain fine-grained dense\nembeddings, and an implicit tracking module to generate a tracking token and\nprovide historical information for the mask decoder. Furthermore, we employ a\nparameter-efficient tuning strategy to align and fuse the language and vision\nfeatures effectively. Through comprehensive ablation studies, we demonstrate\nour model's practical and effective design choices. Extensive experiments\nconducted on Refer-Youtube-VOS, Ref-DAVIS17, and three referring image\nsegmentation datasets validate the superiority and effectiveness of our RefSAM\nmodel over existing methods.\n","authors":["Yonglin Li","Jing Zhang","Xiao Teng","Long Lan","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2307.00997v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15214v2","updated":"2024-09-03T06:40:37Z","published":"2024-05-24T05:02:51Z","title":"PointRWKV: Efficient RWKV-Like Model for Hierarchical Point Cloud\n Learning","summary":" Transformers have revolutionized the point cloud learning task, but the\nquadratic complexity hinders its extension to long sequence and makes a burden\non limited computational resources. The recent advent of RWKV, a fresh breed of\ndeep sequence models, has shown immense potential for sequence modeling in NLP\ntasks. In this paper, we present PointRWKV, a model of linear complexity\nderived from the RWKV model in the NLP field with necessary modifications for\npoint cloud learning tasks. Specifically, taking the embedded point patches as\ninput, we first propose to explore the global processing capabilities within\nPointRWKV blocks using modified multi-headed matrix-valued states and a dynamic\nattention recurrence mechanism. 
To extract local geometric features\nsimultaneously, we design a parallel branch to encode the point cloud\nefficiently in a fixed radius near-neighbors graph with a graph stabilizer.\nFurthermore, we design PointRWKV as a multi-scale framework for hierarchical\nfeature learning of 3D point clouds, facilitating various downstream tasks.\nExtensive experiments on different point cloud learning tasks show our proposed\nPointRWKV outperforms the transformer- and mamba-based counterparts, while\nsignificantly saving about 42\\% FLOPs, demonstrating the potential option for\nconstructing foundational 3D models.\n","authors":["Qingdong He","Jiangning Zhang","Jinlong Peng","Haoyang He","Xiangtai Li","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2405.15214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13110v3","updated":"2024-09-03T06:31:48Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v3.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes and formatting improvements. 
V3: improvements from\n journal revisions"},{"id":"http://arxiv.org/abs/2401.12743v2","updated":"2024-09-03T06:13:37Z","published":"2024-01-23T13:20:57Z","title":"Correlation-Embedded Transformer Tracking: A Single-Branch Framework","summary":" Developing robust and discriminative appearance models has been a\nlong-standing research challenge in visual object tracking. In the prevalent\nSiamese-based paradigm, the features extracted by the Siamese-like networks are\noften insufficient to model the tracked targets and distractor objects, thereby\nhindering them from being robust and discriminative simultaneously. While most\nSiamese trackers focus on designing robust correlation operations, we propose a\nnovel single-branch tracking framework inspired by the transformer. Unlike the\nSiamese-like feature extraction, our tracker deeply embeds cross-image feature\ncorrelation in multiple layers of the feature network. By extensively matching\nthe features of the two images through multiple layers, it can suppress\nnon-target features, resulting in target-aware feature extraction. The output\nfeatures can be directly used for predicting target locations without\nadditional correlation steps. Thus, we reformulate the two-branch Siamese\ntracking as a conceptually simple, fully transformer-based Single-Branch\nTracking pipeline, dubbed SBT. After conducting an in-depth analysis of the SBT\nbaseline, we summarize many effective design principles and propose an improved\ntracker dubbed SuperSBT. SuperSBT adopts a hierarchical architecture with a\nlocal modeling layer to enhance shallow-level features. A unified relation\nmodeling is proposed to remove complex handcrafted layer pattern designs.\nSuperSBT is further improved by masked image modeling pre-training, integrating\ntemporal modeling, and equipping it with dedicated prediction heads. Thus,\nSuperSBT outperforms the SBT baseline by 4.7%, 3.0%, and 4.5% AUC scores in\nLaSOT, TrackingNet, and GOT-10K. Notably, SuperSBT greatly raises the speed of\nSBT from 37 FPS to 81 FPS. Extensive experiments show that our method achieves\nsuperior results on eight VOT benchmarks.\n","authors":["Fei Xie","Wankou Yang","Chunyu Wang","Lei Chu","Yue Cao","Chao Ma","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.12743v2.pdf","comment":"Extension of SBT paper, accepted by TPAMI"},{"id":"http://arxiv.org/abs/2407.13363v2","updated":"2024-09-03T05:10:50Z","published":"2024-07-18T10:14:49Z","title":"Learning from the Web: Language Drives Weakly-Supervised Incremental\n Learning for Semantic Segmentation","summary":" Current weakly-supervised incremental learning for semantic segmentation\n(WILSS) approaches only consider replacing pixel-level annotations with\nimage-level labels, while the training images are still from well-designed\ndatasets. In this work, we argue that widely available web images can also be\nconsidered for the learning of new classes. To achieve this, we first\nintroduce a strategy to select web images which are similar to previously seen\nexamples in the latent space using a Fourier-based domain discriminator. Then,\nan effective caption-driven rehearsal strategy is proposed to preserve\npreviously learnt classes. To our knowledge, this is the first work to rely\nsolely on web images for both the learning of new concepts and the preservation\nof the already learned ones in WILSS. 
Experimental results show that the\nproposed approach can reach state-of-the-art performance without using\nmanually selected and annotated data in the incremental steps.\n","authors":["Chang Liu","Giulia Rizzoli","Pietro Zanuttigh","Fu Li","Yi Niu"],"pdf_url":"https://arxiv.org/pdf/2407.13363v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2311.04811v4","updated":"2024-09-03T03:54:59Z","published":"2023-11-08T16:34:18Z","title":"Image-Based Virtual Try-On: A Survey","summary":" Image-based virtual try-on aims to synthesize a naturally dressed person\nimage with a clothing image, which revolutionizes online shopping and inspires\nrelated topics within image generation, showing both research significance and\ncommercial potential. However, there is a gap between current research progress\nand commercial applications and an absence of a comprehensive overview of this\nfield to accelerate its development. In this survey, we provide a comprehensive\nanalysis of the state-of-the-art techniques and methodologies in aspects of\npipeline architecture, person representation and key modules such as try-on\nindication, clothing warping and try-on stage. We additionally apply CLIP to\nassess the semantic alignment of try-on results, and evaluate representative\nmethods with uniformly implemented evaluation metrics on the same dataset. In\naddition to quantitative and qualitative evaluation of current open-source\nmethods, unresolved issues are highlighted and future research directions are\nprospected to identify key trends and inspire further exploration. The\nuniformly implemented evaluation metrics, dataset and collected methods will be\nmade publicly available at\nhttps://github.com/little-misfit/Survey-Of-Virtual-Try-On.\n","authors":["Dan Song","Xuanpu Zhang","Juan Zhou","Weizhi Nie","Ruofeng Tong","Mohan Kankanhalli","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2311.04811v4.pdf","comment":"30 pages, 20 figures"},{"id":"http://arxiv.org/abs/2406.19101v2","updated":"2024-09-03T03:51:37Z","published":"2024-06-27T11:28:36Z","title":"DocKylin: A Large Multimodal Model for Visual Document Understanding\n with Efficient Visual Slimming","summary":" Current multimodal large language models (MLLMs) face significant challenges\nin visual document understanding (VDU) tasks due to the high resolution, dense\ntext, and complex layouts typical of document images. These characteristics\ndemand a high level of detail perception ability from MLLMs. While increasing\ninput resolution improves detail perception capability, it also leads to longer\nsequences of visual tokens, increasing computational costs and straining the\nmodels' ability to handle long contexts. To address these challenges, we\nintroduce DocKylin, a document-centric MLLM that performs visual content\nslimming at both the pixel and token levels, thereby reducing token sequence\nlength in VDU scenarios. We introduce an Adaptive Pixel Slimming (APS)\npreprocessing module to perform pixel-level slimming, increasing the proportion\nof informative pixels. 
Moreover, we propose a novel Dynamic Token Slimming\n(DTS) module to conduct token-level slimming, filtering essential tokens and\nremoving others to adaptively create a more compact visual sequence.\nExperiments demonstrate DocKylin's promising performance across various VDU\nbenchmarks and the effectiveness of each component.\n","authors":["Jiaxin Zhang","Wentao Yang","Songxuan Lai","Zecheng Xie","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2406.19101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15667v2","updated":"2024-09-03T03:22:18Z","published":"2024-08-28T09:40:40Z","title":"Towards reliable respiratory disease diagnosis based on cough sounds and\n vision transformers","summary":" Recent advancements in deep learning techniques have sparked performance\nboosts in various real-world applications including disease diagnosis based on\nmulti-modal medical data. Cough sound data-based respiratory disease (e.g.,\nCOVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also\nattracted much attention. However, existing works usually utilise traditional\nmachine learning or deep models of moderate scales. On the other hand, the\ndeveloped approaches are trained and evaluated on small-scale data due to the\ndifficulty of curating and annotating clinical data on scale. To address these\nissues in prior works, we create a unified framework to evaluate various deep\nmodels from lightweight Convolutional Neural Networks (e.g., ResNet18) to\nmodern vision transformers and compare their performance in respiratory disease\nclassification. Based on the observations from such an extensive empirical\nstudy, we propose a novel approach to cough-based disease classification based\non both self-supervised and supervised learning on a large-scale cough data\nset. Experimental results demonstrate our proposed approach outperforms prior\narts consistently on two benchmark datasets for COVID-19 diagnosis and a\nproprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%.\n","authors":["Qian Wang","Zhaoyang Bu","Jiaxuan Mao","Wenyu Zhu","Jingya Zhao","Wei Du","Guochao Shi","Min Zhou","Si Chen","Jieming Qu"],"pdf_url":"https://arxiv.org/pdf/2408.15667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01169v2","updated":"2024-09-03T03:21:30Z","published":"2024-03-02T10:42:47Z","title":"Learn Suspected Anomalies from Event Prompts for Video Anomaly Detection","summary":" Most models for weakly supervised video anomaly detection (WS-VAD) rely on\nmultiple instance learning, aiming to distinguish normal and abnormal snippets\nwithout specifying the type of anomaly. However, the ambiguous nature of\nanomaly definitions across contexts may introduce inaccuracy in discriminating\nabnormal and normal events. To show the model what is anomalous, a novel\nframework is proposed to guide the learning of suspected anomalies from event\nprompts. Given a textual prompt dictionary of potential anomaly events and the\ncaptions generated from anomaly videos, the semantic anomaly similarity between\nthem could be calculated to identify the suspected events for each video\nsnippet. It enables a new multi-prompt learning process to constrain the\nvisual-semantic features across all videos, as well as provides a new way to\nlabel pseudo anomalies for self-training. To demonstrate its effectiveness,\ncomprehensive experiments and detailed ablation studies are conducted on four\ndatasets, namely XD-Violence, UCF-Crime, TAD, and ShanghaiTech. 
Our proposed\nmodel outperforms most state-of-the-art methods in terms of AP or AUC (86.5\%,\n90.4\%, 94.4\%, and 97.4\%). Furthermore, it shows promising performance\nin open-set and cross-dataset cases. The data, code, and models can be found\nat: \url{https://github.com/shiwoaz/lap}.\n","authors":["Chenchen Tao","Xiaohao Peng","Chong Wang","Jiafei Wu","Puning Zhao","Jun Wang","Jiangbo Qian"],"pdf_url":"https://arxiv.org/pdf/2403.01169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07547v2","updated":"2024-09-03T03:20:54Z","published":"2023-04-15T12:52:23Z","title":"TagCLIP: Improving Discrimination Ability of Open-Vocabulary Semantic\n Segmentation","summary":" Contrastive Language-Image Pre-training (CLIP) has recently shown great\npromise in pixel-level zero-shot learning tasks. However, existing approaches\nutilizing CLIP's text and patch embeddings to generate semantic masks often\nmisidentify input pixels from unseen classes, leading to confusion between\nnovel classes and semantically similar ones. In this work, we propose a novel\napproach, TagCLIP (Trusty-aware guided CLIP), to address this issue. We\ndisentangle the ill-posed optimization problem into two parallel processes:\nsemantic matching performed individually and reliability judgment for improving\ndiscrimination ability. Building on the idea of special tokens in language\nmodeling representing sentence-level embeddings, we introduce a trusty token\nthat enables distinguishing novel classes from known ones in prediction. To\nevaluate our approach, we conduct experiments on three benchmark datasets: PASCAL\nVOC 2012, COCO-Stuff 164K, and PASCAL Context. Our results show that TagCLIP\nimproves the Intersection over Union (IoU) of unseen classes by 7.4%, 1.7% and\n2.1%, respectively, with negligible overheads. The code is available at\nhttps://github.com/dvlab-research/TagCLIP.\n","authors":["Jingyao Li","Pengguang Chen","Shengju Qian","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2304.07547v2.pdf","comment":"TPAMI2024"},{"id":"http://arxiv.org/abs/2408.07500v2","updated":"2024-09-03T02:50:56Z","published":"2024-08-14T12:29:49Z","title":"Cross-Platform Video Person ReID: A New Benchmark Dataset and Adaptation\n Approach","summary":" In this paper, we construct a large-scale benchmark dataset for\nGround-to-Aerial Video-based person Re-Identification, named G2A-VReID, which\ncomprises 185,907 images and 5,576 tracklets, featuring 2,788 distinct\nidentities. To our knowledge, this is the first dataset for video ReID under\nGround-to-Aerial scenarios. The G2A-VReID dataset has the following\ncharacteristics: 1) Drastic view changes; 2) Large number of annotated\nidentities; 3) Rich outdoor scenarios; 4) Huge difference in resolution.\nAdditionally, we propose a new benchmark approach for cross-platform ReID by\ntransforming the cross-platform visual alignment problem into visual-semantic\nalignment through a vision-language model (i.e., CLIP) and applying a\nparameter-efficient Video Set-Level-Adapter module to adapt an image-based\nfoundation model to video ReID tasks, termed VSLA-CLIP. Besides, to further\nreduce the great discrepancy across the platforms, we also devise the\nplatform-bridge prompts for efficient visual feature alignment. 
Extensive\nexperiments demonstrate the superiority of the proposed method on all existing\nvideo ReID datasets and our proposed G2A-VReID dataset.\n","authors":["Shizhou Zhang","Wenlong Luo","De Cheng","Qingchun Yang","Lingyan Ran","Yinghui Xing","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.07500v2.pdf","comment":"Published at ECCV 2024"},{"id":"http://arxiv.org/abs/2404.06559v2","updated":"2024-09-03T01:57:04Z","published":"2024-04-09T18:23:34Z","title":"The Impact of Print-Scanning in Heterogeneous Morph Evaluation Scenarios","summary":" Face morphing attacks pose an increasing threat to face recognition (FR)\nsystems. A morphed photo contains biometric information from two different\nsubjects to take advantage of vulnerabilities in FR systems. These systems are\nparticularly susceptible to attacks when the morphs are subjected to\nprint-scanning to mask the artifacts generated during the morphing process. We\ninvestigate the impact of print-scanning on morphing attack detection through a\nseries of evaluations on heterogeneous morphing attack scenarios. Our\nexperiments show that we can increase the Mated Morph Presentation Match Rate\n(MMPMR) by up to 8.48%. Furthermore, when a Single-image Morphing Attack\nDetection (S-MAD) algorithm is not trained to detect print-scanned morphs, the\nMorphing Attack Classification Error Rate (MACER) can increase by up to 96.12%,\nindicating significant vulnerability.\n","authors":["Richard E. Neddo","Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06559v2.pdf","comment":"Accepted as a special session paper at IJCB 2024"},{"id":"http://arxiv.org/abs/2403.08542v2","updated":"2024-09-03T01:53:19Z","published":"2024-03-13T13:56:34Z","title":"AIGCs Confuse AI Too: Investigating and Explaining Synthetic\n Image-induced Hallucinations in Large Vision-Language Models","summary":" The evolution of Artificial Intelligence Generated Contents (AIGCs) is\nadvancing towards higher quality. The growing interactions with AIGCs present a\nnew challenge to the data-driven AI community: While AI-generated contents have\nplayed a crucial role in a wide range of AI models, the potential hidden risks\nthey introduce have not been thoroughly examined. Beyond human-oriented forgery\ndetection, AI-generated content poses potential issues for AI models originally\ndesigned to process natural data. In this study, we underscore the exacerbated\nhallucination phenomena in Large Vision-Language Models (LVLMs) caused by\nAI-synthetic images. Remarkably, our findings shed light on a consistent AIGC\n\textbf{hallucination bias}: the object hallucinations induced by synthetic\nimages are characterized by a greater quantity and a more uniform position\ndistribution, even when these synthetic images do not manifest unrealistic or\nadditional relevant visual features compared to natural images. 
Moreover, our\ninvestigations on Q-former and Linear projector reveal that synthetic images\nmay present token deviations after visual projection, thereby amplifying the\nhallucination bias.\n","authors":["Yifei Gao","Jiaqi Wang","Zhiyu Lin","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2403.08542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05904v3","updated":"2024-09-03T01:40:52Z","published":"2023-09-12T01:29:37Z","title":"Enhancing Representation in Radiography-Reports Foundation Model: A\n Granular Alignment Algorithm Using Masked Contrastive Learning","summary":" Recently, multi-modal vision-language foundation models have gained\nsignificant attention in the medical field. While these models offer great\nopportunities, they still face crucial challenges, such as the requirement for\nfine-grained knowledge understanding in computer-aided diagnosis and the\ncapability of utilizing very limited or even no task-specific labeled data in\nreal-world clinical applications. In this study, we present MaCo, a masked\ncontrastive chest X-ray foundation model that tackles these challenges. MaCo\nexplores masked contrastive learning to simultaneously achieve fine-grained\nimage understanding and zero-shot learning for a variety of medical imaging\ntasks. It designs a correlation weighting mechanism to adjust the correlation\nbetween masked chest X-ray image patches and their corresponding reports,\nthereby enhancing the model's representation learning capabilities. To evaluate\nthe performance of MaCo, we conducted extensive experiments using 6 well-known\nopen-source X-ray datasets. The experimental results demonstrate the\nsuperiority of MaCo over 10 state-of-the-art approaches across tasks such as\nclassification, segmentation, detection, and phrase grounding. These findings\nhighlight the significant potential of MaCo in advancing a wide range of\nmedical image analysis tasks.\n","authors":["Weijian Huang","Cheng Li","Hong-Yu Zhou","Hao Yang","Jiarun Liu","Yong Liang","Hairong Zheng","Shaoting Zhang","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05904v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02337v1","updated":"2024-09-03T23:52:33Z","published":"2024-09-03T23:52:33Z","title":"Coaching a Robotic Sonographer: Learning Robotic Ultrasound with Sparse\n Expert's Feedback","summary":" Ultrasound is widely employed for clinical intervention and diagnosis, due to\nits advantages of offering non-invasive, radiation-free, and real-time imaging.\nHowever, the accessibility of this dexterous procedure is limited due to the\nsubstantial training and expertise required of operators. The robotic\nultrasound (RUS) offers a viable solution to address this limitation;\nnonetheless, achieving human-level proficiency remains challenging. Learning\nfrom demonstrations (LfD) methods have been explored in RUS, which learns the\npolicy prior from a dataset of offline demonstrations to encode the mental\nmodel of the expert sonographer. However, active engagement of experts, i.e.\nCoaching, during the training of RUS has not been explored thus far. Coaching\nis known for enhancing efficiency and performance in human training. This paper\nproposes a coaching framework for RUS to amplify its performance. The framework\ncombines DRL (self-supervised practice) with sparse expert's feedback through\ncoaching. The DRL employs an off-policy Soft Actor-Critic (SAC) network, with a\nreward based on image quality rating. 
The coaching by experts is modeled as a\nPartially Observable Markov Decision Process (POMDP), which updates the policy\nparameters based on the correction by the expert. The validation study on\nphantoms showed that coaching increases the learning rate by $25\\%$ and the\nnumber of high-quality image acquisition by $74.5\\%$.\n","authors":["Deepak Raina","Mythra V. Balakuntala","Byung Wook Kim","Juan Wachs","Richard Voyles"],"pdf_url":"https://arxiv.org/pdf/2409.02337v1.pdf","comment":"Accepted in IEEE Transactions on Medical Robotics and Bionics (TMRB)\n 2024"},{"id":"http://arxiv.org/abs/2409.02335v1","updated":"2024-09-03T23:49:45Z","published":"2024-09-03T23:49:45Z","title":"What Do You See in Common? Learning Hierarchical Prototypes over\n Tree-of-Life to Discover Evolutionary Traits","summary":" A grand challenge in biology is to discover evolutionary traits - features of\norganisms common to a group of species with a shared ancestor in the tree of\nlife (also referred to as phylogenetic tree). With the growing availability of\nimage repositories in biology, there is a tremendous opportunity to discover\nevolutionary traits directly from images in the form of a hierarchy of\nprototypes. However, current prototype-based methods are mostly designed to\noperate over a flat structure of classes and face several challenges in\ndiscovering hierarchical prototypes, including the issue of learning\nover-specific features at internal nodes. To overcome these challenges, we\nintroduce the framework of Hierarchy aligned Commonality through Prototypical\nNetworks (HComP-Net). We empirically show that HComP-Net learns prototypes that\nare accurate, semantically consistent, and generalizable to unseen species in\ncomparison to baselines on birds, butterflies, and fishes datasets. The code\nand datasets are available at https://github.com/Imageomics/HComPNet.\n","authors":["Harish Babu Manogaran","M. Maruf","Arka Daw","Kazi Sajeed Mehrab","Caleb Patrick Charpentier","Josef C. Uyeda","Wasila Dahdul","Matthew J Thompson","Elizabeth G Campolongo","Kaiya L Provost","Paula M. Mabee","Hilmar Lapp","Anuj Karpatne"],"pdf_url":"https://arxiv.org/pdf/2409.02335v1.pdf","comment":"34 pages, 27 figures"},{"id":"http://arxiv.org/abs/2409.02334v1","updated":"2024-09-03T23:42:19Z","published":"2024-09-03T23:42:19Z","title":"YoloTag: Vision-based Robust UAV Navigation with Fiducial Markers","summary":" By harnessing fiducial markers as visual landmarks in the environment,\nUnmanned Aerial Vehicles (UAVs) can rapidly build precise maps and navigate\nspaces safely and efficiently, unlocking their potential for fluent\ncollaboration and coexistence with humans. Existing fiducial marker methods\nrely on handcrafted feature extraction, which sacrifices accuracy. On the other\nhand, deep learning pipelines for marker detection fail to meet real-time\nruntime constraints crucial for navigation applications. In this work, we\npropose YoloTag \\textemdash a real-time fiducial marker-based localization\nsystem. YoloTag uses a lightweight YOLO v8 object detector to accurately detect\nfiducial markers in images while meeting the runtime constraints needed for\nnavigation. The detected markers are then used by an efficient\nperspective-n-point algorithm to estimate UAV states. However, this\nlocalization system introduces noise, causing instability in trajectory\ntracking. To suppress noise, we design a higher-order Butterworth filter that\neffectively eliminates noise through frequency domain analysis. 
We evaluate our\nalgorithm through real-robot experiments in an indoor environment, comparing\nthe trajectory tracking performance of our method against other approaches in\nterms of several distance metrics.\n","authors":["Sourav Raxit","Simant Bahadur Singh","Abdullah Al Redwan Newaz"],"pdf_url":"https://arxiv.org/pdf/2409.02334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14871v3","updated":"2024-09-03T23:13:33Z","published":"2023-12-22T17:49:11Z","title":"BrainVis: Exploring the Bridge between Brain and Visual Signals via\n Image Reconstruction","summary":" Analyzing and reconstructing visual stimuli from brain signals effectively\nadvances the understanding of human visual system. However, the EEG signals are\ncomplex and contain significant noise. This leads to substantial limitations in\nexisting works of visual stimuli reconstruction from EEG, such as difficulties\nin aligning EEG embeddings with the fine-grained semantic information and a\nheavy reliance on additional large self-collected dataset for training. To\naddress these challenges, we propose a novel approach called BrainVis. Firstly,\nwe divide the EEG signals into various units and apply a self-supervised\napproach on them to obtain EEG time-domain features, in an attempt to ease the\ntraining difficulty. Additionally, we also propose to utilize the\nfrequency-domain features to enhance the EEG representations. Then, we\nsimultaneously align EEG time-frequency embeddings with the interpolation of\nthe coarse and fine-grained semantics in the CLIP space, to highlight the\nprimary visual components and reduce the cross-modal alignment difficulty.\nFinally, we adopt the cascaded diffusion models to reconstruct images. Using\nonly 10\\% training data of the previous work, our proposed BrainVis outperforms\nstate of the arts in both semantic fidelity reconstruction and generation\nquality. The code is available at https://github.com/RomGai/BrainVis.\n","authors":["Honghao Fu","Zhiqi Shen","Jing Jih Chin","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14871v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04890v2","updated":"2024-09-03T22:56:42Z","published":"2024-05-08T08:39:25Z","title":"GISR: Geometric Initialization and Silhouette-based Refinement for\n Single-View Robot Pose and Configuration Estimation","summary":" In autonomous robotics, measurement of the robot's internal state and\nperception of its environment, including interaction with other agents such as\ncollaborative robots, are essential. Estimating the pose of the robot arm from\na single view has the potential to replace classical eye-to-hand calibration\napproaches and is particularly attractive for online estimation and dynamic\nenvironments. In addition to its pose, recovering the robot configuration\nprovides a complete spatial understanding of the observed robot that can be\nused to anticipate the actions of other agents in advanced robotics use cases.\nFurthermore, this additional redundancy enables the planning and execution of\nrecovery protocols in case of sensor failures or external disturbances. We\nintroduce GISR - a deep configuration and robot-to-camera pose estimation\nmethod that prioritizes execution in real-time. GISR consists of two modules:\n(i) a geometric initialization module that efficiently computes an approximate\nrobot pose and configuration, and (ii) a deep iterative silhouette-based\nrefinement module that arrives at a final solution in just a few iterations. 
We\nevaluate GISR on publicly available data and show that it outperforms existing\nmethods of the same class in terms of both speed and accuracy, and can compete\nwith approaches that rely on ground-truth proprioception and recover only the\npose.\n","authors":["Ivan Bilić","Filip Marić","Fabio Bonsignorio","Ivan Petrović"],"pdf_url":"https://arxiv.org/pdf/2405.04890v2.pdf","comment":"IEEE Robotics and Automation Letters (under revision), code available\n at http://github.com/iwhitey/GISR-robot"},{"id":"http://arxiv.org/abs/2409.02324v1","updated":"2024-09-03T22:36:11Z","published":"2024-09-03T22:36:11Z","title":"Visual Servoing for Robotic On-Orbit Servicing: A Survey","summary":" On-orbit servicing (OOS) activities will power the next big step for\nsustainable exploration and commercialization of space. Developing robotic\ncapabilities for autonomous OOS operations is a priority for the space\nindustry. Visual Servoing (VS) enables robots to achieve the precise manoeuvres\nneeded for critical OOS missions by utilizing visual information for motion\ncontrol. This article presents an overview of existing VS approaches for\nautonomous OOS operations with space manipulator systems (SMS). We divide the\napproaches according to their contribution to the typical phases of a robotic\nOOS mission: a) Recognition, b) Approach, and c) Contact. We also present a\ndiscussion on the reviewed VS approaches, identifying current trends. Finally,\nwe highlight the challenges and areas for future research on VS techniques for\nrobotic OOS.\n","authors":["Lina María Amaya-Mejía","Mohamed Ghita","Jan Dentler","Miguel Olivares-Mendez","Carol Martinez"],"pdf_url":"https://arxiv.org/pdf/2409.02324v1.pdf","comment":"Accepted for publication at the 2024 International Conference on\n Space Robotics (iSpaRo)"},{"id":"http://arxiv.org/abs/2408.01690v2","updated":"2024-09-03T22:30:34Z","published":"2024-08-03T07:05:40Z","title":"IDNet: A Novel Dataset for Identity Document Analysis and Fraud\n Detection","summary":" Effective fraud detection and analysis of government-issued identity\ndocuments, such as passports, driver's licenses, and identity cards, are\nessential in thwarting identity theft and bolstering security on online\nplatforms. The training of accurate fraud detection and analysis tools depends\non the availability of extensive identity document datasets. However, current\npublicly available benchmark datasets for identity document analysis, including\nMIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a\nlimited number of samples, cover insufficient varieties of fraud patterns, and\nseldom include alterations in critical personal identifying fields like\nportrait images, limiting their utility in training models capable of detecting\nrealistic frauds while preserving privacy.\n In response to these shortcomings, our research introduces a new benchmark\ndataset, IDNet, designed to advance privacy-preserving fraud detection efforts.\nThe IDNet dataset comprises 837,060 images of synthetically generated identity\ndocuments, totaling approximately 490 gigabytes, categorized into 20 types from\n$10$ U.S. states and 10 European countries. 
We evaluate the utility and present\nuse cases of the dataset, illustrating how it can aid in training\nprivacy-preserving fraud detection methods, facilitating the generation of\ncamera and video capturing of identity documents, and testing schema\nunification and other identity document management functionalities.\n","authors":["Hong Guan","Yancheng Wang","Lulu Xie","Soham Nag","Rajeev Goel","Niranjan Erappa Narayana Swamy","Yingzhen Yang","Chaowei Xiao","Jonathan Prisby","Ross Maciejewski","Jia Zou"],"pdf_url":"https://arxiv.org/pdf/2408.01690v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2409.02310v1","updated":"2024-09-03T21:41:35Z","published":"2024-09-03T21:41:35Z","title":"Geometry-aware Feature Matching for Large-Scale Structure from Motion","summary":" Establishing consistent and dense correspondences across multiple images is\ncrucial for Structure from Motion (SfM) systems. Significant view changes, such\nas air-to-ground with very sparse view overlap, pose an even greater challenge\nto the correspondence solvers. We present a novel optimization-based approach\nthat significantly enhances existing feature matching methods by introducing\ngeometry cues in addition to color cues. This helps fill gaps when there is\nless overlap in large-scale scenarios. Our method formulates geometric\nverification as an optimization problem, guiding feature matching within\ndetector-free methods and using sparse correspondences from detector-based\nmethods as anchor points. By enforcing geometric constraints via the Sampson\nDistance, our approach ensures that the denser correspondences from\ndetector-free methods are geometrically consistent and more accurate. This\nhybrid strategy significantly improves correspondence density and accuracy,\nmitigates multi-view inconsistencies, and leads to notable advancements in\ncamera pose accuracy and point cloud density. It outperforms state-of-the-art\nfeature matching methods on benchmark datasets and enables feature matching in\nchallenging extreme large-scale settings.\n","authors":["Gonglin Chen","Jinsen Wu","Haiwei Chen","Wenbin Teng","Zhiyuan Gao","Andrew Feng","Rongjun Qin","Yajie Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.02310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02309v1","updated":"2024-09-03T21:39:58Z","published":"2024-09-03T21:39:58Z","title":"QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of\n DWI Data","summary":" We propose an image-conditioned diffusion model to estimate high angular\nresolution diffusion weighted imaging (DWI) from a low angular resolution\nacquisition. Our model, which we call QID$^2$, takes as input a set of low\nangular resolution DWI data and uses this information to estimate the DWI data\nassociated with a target gradient direction. We leverage a U-Net architecture\nwith cross-attention to preserve the positional information of the reference\nimages, further guiding the target image generation. We train and evaluate\nQID$^2$ on single-shell DWI samples curated from the Human Connectome Project\n(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to\nproduce low angular resolution DWI data and train QID$^2$ to reconstruct the\nmissing high angular resolution samples. We compare QID$^2$ with two\nstate-of-the-art GAN models. Our results demonstrate that QID$^2$ not only\nachieves higher-quality generated images, but it consistently outperforms the\nGAN models in downstream tensor estimation across multiple metrics. 
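To make the Sampson-distance verification described in the "Geometry-aware Feature Matching for Large-Scale Structure from Motion" entry above more concrete, here is a minimal NumPy sketch of the standard Sampson distance and a simple thresholding filter. The function names and the pixel threshold are illustrative assumptions, not taken from the paper.

```python
import numpy as np

def sampson_distance(F, x1, x2):
    """First-order geometric error of correspondences (x1, x2) under a
    fundamental matrix F.  x1, x2: (N, 2) arrays of pixel coordinates."""
    N = x1.shape[0]
    x1h = np.hstack([x1, np.ones((N, 1))])   # homogeneous coordinates
    x2h = np.hstack([x2, np.ones((N, 1))])
    Fx1  = x1h @ F.T       # each row is F  @ x1_i
    Ftx2 = x2h @ F         # each row is F^T @ x2_i
    num = np.sum(x2h * Fx1, axis=1) ** 2     # (x2^T F x1)^2
    den = Fx1[:, 0]**2 + Fx1[:, 1]**2 + Ftx2[:, 0]**2 + Ftx2[:, 1]**2
    return num / den

def filter_matches(F, x1, x2, thresh=1.0):
    """Keep only matches whose Sampson distance is below a pixel threshold."""
    d = sampson_distance(F, x1, x2)
    keep = d < thresh
    return x1[keep], x2[keep]
```

A check of this kind is what makes dense, detector-free correspondences geometrically consistent with the epipolar geometry implied by the sparse anchor matches.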
Taken\ntogether, this study highlights the potential of diffusion models, and QID$^2$\nin particular, for q-space up-sampling, thus offering a promising toolkit for\nclinical and research applications.\n","authors":["Zijian Chen","Jueqi Wang","Archana Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2409.02309v1.pdf","comment":"Accepted at MICCAI 2024 International Workshop on Computational\n Diffusion MRI. Zijian Chen and Jueqi Wang contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02290v1","updated":"2024-09-03T20:58:56Z","published":"2024-09-03T20:58:56Z","title":"Unsupervised Welding Defect Detection Using Audio And Video","summary":" In this work we explore the application of AI to robotic welding. Robotic\nwelding is a widely used technology in many industries, but robots currently do\nnot have the capability to detect welding defects which get introduced due to\nvarious reasons in the welding process. We describe how deep-learning methods\ncan be applied to detect weld defects in real-time by recording the welding\nprocess with microphones and a camera. Our findings are based on a large\ndatabase with more than 4000 welding samples we collected which covers\ndifferent weld types, materials and various defect categories. All deep\nlearning models are trained in an unsupervised fashion because the space of\npossible defects is large and the defects in our data may contain biases. We\ndemonstrate that a reliable real-time detection of most categories of weld\ndefects is feasible both from audio and video, with improvements achieved by\ncombining both modalities. Specifically, the multi-modal approach achieves an\naverage Area-under-ROC-Curve (AUC) of 0.92 over all eleven defect types in our\ndata. We conclude the paper with an analysis of the results by defect type and\na discussion of future work.\n","authors":["Georg Stemmer","Jose A. Lopez","Juan A. Del Hoyo Ontiveros","Arvind Raju","Tara Thimmanaik","Sovan Biswas"],"pdf_url":"https://arxiv.org/pdf/2409.02290v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2409.02284v1","updated":"2024-09-03T20:37:43Z","published":"2024-09-03T20:37:43Z","title":"Biochemical Prostate Cancer Recurrence Prediction: Thinking Fast & Slow","summary":" Time to biochemical recurrence in prostate cancer is essential for prognostic\nmonitoring of the progression of patients after prostatectomy, which assesses\nthe efficacy of the surgery. In this work, we proposed to leverage multiple\ninstance learning through a two-stage ``thinking fast \\& slow'' strategy for\nthe time to recurrence (TTR) prediction. 
The first (``thinking fast'') stage\nfinds the most relevant WSI area for biochemical recurrence and the second\n(``thinking slow'') stage leverages higher resolution patches to predict TTR.\nOur approach reveals a mean C-index ($Ci$) of 0.733 ($\\theta=0.059$) on our\ninternal validation and $Ci=0.603$ on the LEOPARD challenge validation set.\nPost hoc attention visualization shows that the most attentive area contributes\nto the TTR prediction.\n","authors":["Suhang You","Sanyukta Adap","Siddhesh Thakur","Bhakti Baheti","Spyridon Bakas"],"pdf_url":"https://arxiv.org/pdf/2409.02284v1.pdf","comment":"8 pages, 3 figures, methodology paper for LEOPRARD Challenge"},{"id":"http://arxiv.org/abs/2409.02281v1","updated":"2024-09-03T20:28:30Z","published":"2024-09-03T20:28:30Z","title":"K-Origins: Better Colour Quantification for Neural Networks","summary":" K-Origins is a neural network layer designed to improve image-based network\nperformances when learning colour, or intensities, is beneficial. Over 250\nencoder-decoder convolutional networks are trained and tested on 16-bit\nsynthetic data, demonstrating that K-Origins improves semantic segmentation\naccuracy in two scenarios: object detection with low signal-to-noise ratios,\nand segmenting multiple objects that are identical in shape but vary in colour.\nK-Origins generates output features from the input features, $\\textbf{X}$, by\nthe equation $\\textbf{Y}_k = \\textbf{X}-\\textbf{J}\\cdot w_k$ for each trainable\nparameter $w_k$, where $\\textbf{J}$ is a matrix of ones. Additionally, networks\nwith varying receptive fields were trained to determine optimal network depths\nbased on the dimensions of target classes, suggesting that receptive field\nlengths should exceed object sizes. By ensuring a sufficient receptive field\nlength and incorporating K-Origins, we can achieve better semantic network\nperformance.\n","authors":["Lewis Mason","Mark Martinez"],"pdf_url":"https://arxiv.org/pdf/2409.02281v1.pdf","comment":"16 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.02278v1","updated":"2024-09-03T20:24:37Z","published":"2024-09-03T20:24:37Z","title":"Evaluation and Comparison of Visual Language Models for Transportation\n Engineering Problems","summary":" Recent developments in vision language models (VLM) have shown great\npotential for diverse applications related to image understanding. In this\nstudy, we have explored state-of-the-art VLM models for vision-based\ntransportation engineering tasks such as image classification and object\ndetection. The image classification task involves congestion detection and\ncrack identification, whereas, for object detection, helmet violations were\nidentified. We have applied open-source models such as CLIP, BLIP, OWL-ViT,\nLlava-Next, and closed-source GPT-4o to evaluate the performance of these\nstate-of-the-art VLM models to harness the capabilities of language\nunderstanding for vision-based transportation tasks. These tasks were performed\nby applying zero-shot prompting to the VLM models, as zero-shot prompting\ninvolves performing tasks without any training on those tasks. It eliminates\nthe need for annotated datasets or fine-tuning for specific tasks. Though these\nmodels gave comparative results with benchmark Convolutional Neural Networks\n(CNN) models in the image classification tasks, for object localization tasks,\nit still needs improvement. 
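The K-Origins layer described above is given explicitly by $\textbf{Y}_k = \textbf{X} - \textbf{J}\cdot w_k$, so a tiny PyTorch sketch follows directly from that formula. How the shifted copies are consumed downstream (here simply concatenated along the channel axis) is an assumption for illustration, not a detail stated in the abstract.

```python
import torch
import torch.nn as nn

class KOrigins(nn.Module):
    """Sketch of the K-Origins idea: for each trainable scalar w_k,
    emit a shifted copy of the input, Y_k = X - J * w_k (J = all-ones)."""
    def __init__(self, num_origins: int):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(num_origins))

    def forward(self, x):                     # x: (B, C, H, W)
        # Broadcast each scalar w_k over the whole input and stack the
        # K shifted copies along the channel dimension.
        shifted = [x - wk for wk in self.w]
        return torch.cat(shifted, dim=1)      # (B, C*K, H, W)
```

Because each w_k acts as a learned intensity "origin", the layer gives the following convolutions several re-centred views of the raw intensities, which is what helps when colour alone distinguishes otherwise identical shapes.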
Therefore, this study provides a comprehensive\nevaluation of the state-of-the-art VLM models highlighting the advantages and\nlimitations of the models, which can be taken as the baseline for future\nimprovement and wide-scale implementation.\n","authors":["Sanjita Prajapati","Tanu Singh","Chinmay Hegde","Pranamesh Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2409.02278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02274v1","updated":"2024-09-03T20:16:56Z","published":"2024-09-03T20:16:56Z","title":"ADHD diagnosis based on action characteristics recorded in videos using\n machine learning","summary":" Demand for ADHD diagnosis and treatment is increasing significantly and the\nexisting services are unable to meet the demand in a timely manner. In this\nwork, we introduce a novel action recognition method for ADHD diagnosis by\nidentifying and analysing raw video recordings. Our main contributions include\n1) designing and implementing a test focusing on the attention and\nhyperactivity/impulsivity of participants, recorded through three cameras; 2)\nimplementing a novel machine learning ADHD diagnosis system based on action\nrecognition neural networks for the first time; 3) proposing classification\ncriteria to provide diagnosis results and analysis of ADHD action\ncharacteristics.\n","authors":["Yichun Li","Syes Mohsen Naqvi","Rajesh Nair"],"pdf_url":"https://arxiv.org/pdf/2409.02274v1.pdf","comment":"Neuroscience Applied"},{"id":"http://arxiv.org/abs/2310.15128v2","updated":"2024-09-03T19:55:22Z","published":"2023-10-23T17:32:38Z","title":"Projected Stochastic Gradient Descent with Quantum Annealed Binary\n Gradients","summary":" We present, QP-SBGD, a novel layer-wise stochastic optimiser tailored towards\ntraining neural networks with binary weights, known as binary neural networks\n(BNNs), on quantum hardware. BNNs reduce the computational requirements and\nenergy consumption of deep learning models with minimal loss in accuracy.\nHowever, training them in practice remains to be an open challenge. Most known\nBNN-optimisers either rely on projected updates or binarise weights\npost-training. Instead, QP-SBGD approximately maps the gradient onto binary\nvariables, by solving a quadratic constrained binary optimisation. Under\npractically reasonable assumptions, we show that this update rule converges\nwith a rate of $\\mathcal{O}(1 / \\sqrt{T})$. Moreover, we show how the\n$\\mathcal{NP}$-hard projection can be effectively executed on an adiabatic\nquantum annealer, harnessing recent advancements in quantum computation. We\nalso introduce a projected version of this update rule and prove that if a\nfixed point exists in the binary variable space, the modified updates will\nconverge to it. Last but not least, our algorithm is implemented layer-wise,\nmaking it suitable to train larger networks on resource-limited quantum\nhardware. 
Through extensive evaluations, we show that QP-SBGD outperforms or is\non par with competitive and well-established baselines such as BinaryConnect,\nsignSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as\nwell as binary graph neural networks.\n","authors":["Maximilian Krahn","Michele Sasdelli","Fengyi Yang","Vladislav Golyanik","Juho Kannala","Tat-Jun Chin","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2310.15128v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02261v1","updated":"2024-09-03T19:38:23Z","published":"2024-09-03T19:38:23Z","title":"Action-Based ADHD Diagnosis in Video","summary":" Attention Deficit Hyperactivity Disorder (ADHD) causes significant impairment\nin various domains. Early diagnosis of ADHD and treatment could significantly\nimprove the quality of life and functioning. Recently, machine learning methods\nhave improved the accuracy and efficiency of the ADHD diagnosis process.\nHowever, the cost of the equipment and trained staff required by the existing\nmethods are generally huge. Therefore, we introduce the video-based frame-level\naction recognition network to ADHD diagnosis for the first time. We also record\na real multi-modal ADHD dataset and extract three action classes from the video\nmodality for ADHD diagnosis. The whole process data have been reported to\nCNTW-NHS Foundation Trust, which would be reviewed by medical\nconsultants/professionals and will be made public in due course.\n","authors":["Yichun Li","Yuxing Yang","Syed Nohsen Naqvi"],"pdf_url":"https://arxiv.org/pdf/2409.02261v1.pdf","comment":"31st European Symposium on Artificial Neural Networks"},{"id":"http://arxiv.org/abs/2409.02259v1","updated":"2024-09-03T19:34:25Z","published":"2024-09-03T19:34:25Z","title":"Optimal L-Systems for Stochastic L-system Inference Problems","summary":" This paper presents two novel theorems that address two open problems in\nstochastic Lindenmayer-system (L-system) inference, specifically focusing on\nthe construction of an optimal stochastic L-system capable of generating a\ngiven sequence of strings. The first theorem delineates a method for crafting a\nstochastic L-system that maximizes the likelihood of producing a given sequence\nof words through a singular derivation. Furthermore, the second theorem\ndetermines the stochastic L-systems with the highest probability of producing a\ngiven sequence of words with multiple possible derivations. From these, we\nintroduce an algorithm to infer an optimal stochastic L-system from a given\nsequence. This algorithm incorporates sophisticated optimization techniques,\nsuch as interior point methods, ensuring production of a stochastically optimal\nstochastic L-system suitable for generating the given sequence. This allows for\nthe use of using stochastic L-systems as model for machine learning using only\npositive data for training.\n","authors":["Ali Lotfi","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2409.02259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02253v1","updated":"2024-09-03T19:26:13Z","published":"2024-09-03T19:26:13Z","title":"How to Determine the Preferred Image Distribution of a Black-Box\n Vision-Language Model?","summary":" Large foundation models have revolutionized the field, yet challenges remain\nin optimizing multi-modal models for specialized visual tasks. 
We propose a\nnovel, generalizable methodology to identify preferred image distributions for\nblack-box Vision-Language Models (VLMs) by measuring output consistency across\nvaried input prompts. Applying this to different rendering types of 3D objects,\nwe demonstrate its efficacy across various domains requiring precise\ninterpretation of complex structures, with a focus on Computer-Aided Design\n(CAD) as an exemplar field. We further refine VLM outputs using in-context\nlearning with human feedback, significantly enhancing explanation quality. To\naddress the lack of benchmarks in specialized domains, we introduce CAD-VQA, a\nnew dataset for evaluating VLMs on CAD-related visual question answering tasks.\nOur evaluation of state-of-the-art VLMs on CAD-VQA establishes baseline\nperformance levels, providing a framework for advancing VLM capabilities in\ncomplex visual reasoning tasks across various fields requiring expert-level\nvisual interpretation. We release the dataset and evaluation codes at\n\\url{https://github.com/asgsaeid/cad_vqa}.\n","authors":["Saeid Asgari Taghanaki","Joseph Lambourne","Alana Mongkhounsavath"],"pdf_url":"https://arxiv.org/pdf/2409.02253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02251v1","updated":"2024-09-03T19:24:46Z","published":"2024-09-03T19:24:46Z","title":"NoiseAttack: An Evasive Sample-Specific Multi-Targeted Backdoor Attack\n Through White Gaussian Noise","summary":" Backdoor attacks pose a significant threat when using third-party data for\ndeep learning development. In these attacks, data can be manipulated to cause a\ntrained model to behave improperly when a specific trigger pattern is applied,\nproviding the adversary with unauthorized advantages. While most existing works\nfocus on designing trigger patterns in both visible and invisible to poison the\nvictim class, they typically result in a single targeted class upon the success\nof the backdoor attack, meaning that the victim class can only be converted to\nanother class based on the adversary predefined value. In this paper, we\naddress this issue by introducing a novel sample-specific multi-targeted\nbackdoor attack, namely NoiseAttack. Specifically, we adopt White Gaussian\nNoise (WGN) with various Power Spectral Densities (PSD) as our underlying\ntriggers, coupled with a unique training strategy to execute the backdoor\nattack. This work is the first of its kind to launch a vision backdoor attack\nwith the intent to generate multiple targeted classes with minimal input\nconfiguration. Furthermore, our extensive experimental results demonstrate that\nNoiseAttack can achieve a high attack success rate against popular network\narchitectures and datasets, as well as bypass state-of-the-art backdoor\ndetection methods. Our source code and experiments are available at\nhttps://github.com/SiSL-URI/NoiseAttack/tree/main.\n","authors":["Abdullah Arafat Miah","Kaan Icer","Resit Sendag","Yu Bi"],"pdf_url":"https://arxiv.org/pdf/2409.02251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02243v1","updated":"2024-09-03T19:16:36Z","published":"2024-09-03T19:16:36Z","title":"A Novel Audio-Visual Information Fusion System for Mental Disorders\n Detection","summary":" Mental disorders are among the foremost contributors to the global healthcare\nchallenge. Research indicates that timely diagnosis and intervention are vital\nin treating various mental disorders. 
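Regarding the NoiseAttack entry above, which uses white Gaussian noise with different power spectral densities as sample-specific triggers: for white noise the flat PSD is set by the variance, so a toy trigger generator reduces to choosing sigma. The sketch below illustrates only that trigger construction; the class-to-PSD mapping and the training strategy are the paper's, and the values here are made up.

```python
import numpy as np

def wgn_trigger(image, psd, seed=None):
    """Blend a white-Gaussian-noise trigger into an image with values in [0, 1].
    For white noise the (flat) power spectral density is governed by the
    variance, so `psd` here simply controls sigma**2."""
    rng = np.random.default_rng(seed)
    noise = rng.normal(loc=0.0, scale=np.sqrt(psd), size=image.shape)
    return np.clip(image + noise, 0.0, 1.0)

# Illustrative only: different noise powers could key different target
# classes, e.g. psd=0.01 -> target class A, psd=0.05 -> target class B.
poisoned = wgn_trigger(np.random.rand(32, 32, 3), psd=0.01, seed=0)
```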
However, the early somatization symptoms\nof certain mental disorders may not be immediately evident, often resulting in\ntheir oversight and misdiagnosis. Additionally, the traditional diagnosis\nmethods incur high time and cost. Deep learning methods based on fMRI and EEG\nhave improved the efficiency of the mental disorder detection process. However,\nthe cost of the equipment and trained staff are generally huge. Moreover, most\nsystems are only trained for a specific mental disorder and are not\ngeneral-purpose. Recently, physiological studies have shown that there are some\nspeech and facial-related symptoms in a few mental disorders (e.g., depression\nand ADHD). In this paper, we focus on the emotional expression features of\nmental disorders and introduce a multimodal mental disorder diagnosis system\nbased on audio-visual information input. Our proposed system is based on\nspatial-temporal attention networks and innovative uses a less computationally\nintensive pre-train audio recognition network to fine-tune the video\nrecognition module for better results. We also apply the unified system for\nmultiple mental disorders (ADHD and depression) for the first time. The\nproposed system achieves over 80\\% accuracy on the real multimodal ADHD dataset\nand achieves state-of-the-art results on the depression dataset AVEC 2014.\n","authors":["Yichun Li","Shuanglin Li","Syed Mohsen Naqvi"],"pdf_url":"https://arxiv.org/pdf/2409.02243v1.pdf","comment":"27th International Conference on Information (FUSION)"},{"id":"http://arxiv.org/abs/2409.02241v1","updated":"2024-09-03T19:14:01Z","published":"2024-09-03T19:14:01Z","title":"What makes a face looks like a hat: Decoupling low-level and high-level\n Visual Properties with Image Triplets","summary":" In visual decision making, high-level features, such as object categories,\nhave a strong influence on choice. However, the impact of low-level features on\nbehavior is less understood partly due to the high correlation between high-\nand low-level features in the stimuli presented (e.g., objects of the same\ncategory are more likely to share low-level features). To disentangle these\neffects, we propose a method that de-correlates low- and high-level visual\nproperties in a novel set of stimuli. Our method uses two Convolutional Neural\nNetworks (CNNs) as candidate models of the ventral visual stream: the CORnet-S\nthat has high neural predictivity in high-level, IT-like responses and the\nVGG-16 that has high neural predictivity in low-level responses. Triplets\n(root, image1, image2) of stimuli are parametrized by the level of low- and\nhigh-level similarity of images extracted from the different layers. These\nstimuli are then used in a decision-making task where participants are tasked\nto choose the most similar-to-the-root image. We found that different networks\nshow differing abilities to predict the effects of low-versus-high-level\nsimilarity: while CORnet-S outperforms VGG-16 in explaining human choices based\non high-level similarity, VGG-16 outperforms CORnet-S in explaining human\nchoices based on low-level similarity. Using Brain-Score, we observed that the\nbehavioral prediction abilities of different layers of these networks\nqualitatively corresponded to their ability to explain neural activity at\ndifferent levels of the visual hierarchy. 
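For the image-triplet study above, which contrasts low-level (VGG-16-like) and high-level (CORnet-S-like) similarity, a rough sketch of how such layer-wise similarities can be computed is shown below using torchvision's VGG-16. The specific layer cut-offs and the use of cosine similarity are my assumptions for illustration; the paper's actual stimulus parametrization also relies on CORnet-S, which would be handled analogously.

```python
import torch
import torch.nn.functional as F
from torchvision import models

vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).eval()
low_net  = vgg.features[:5]    # early conv block ~ low-level features
high_net = vgg.features        # full conv stack  ~ higher-level features

def cosine(a, b):
    return F.cosine_similarity(a.flatten(1), b.flatten(1)).item()

@torch.no_grad()
def similarity_profile(root, img):
    """root, img: (1, 3, 224, 224) ImageNet-normalized tensors."""
    return {
        "low":  cosine(low_net(root),  low_net(img)),
        "high": cosine(high_net(root), high_net(img)),
    }
```

Triplets (root, image1, image2) can then be selected so that low- and high-level similarity vary independently, which is what de-correlates the two factors in the behavioral task.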
In summary, our algorithm for\nstimulus set generation enables the study of how different representations in\nthe visual stream affect high-level cognitive behaviors.\n","authors":["Maytus Piriyajitakonkij","Sirawaj Itthipuripat","Ian Ballard","Ioannis Pappas"],"pdf_url":"https://arxiv.org/pdf/2409.02241v1.pdf","comment":"Accepted at Workshop on Human-inspired Computer Vision @ ECCV2024"},{"id":"http://arxiv.org/abs/2407.14434v2","updated":"2024-09-03T19:05:17Z","published":"2024-07-19T16:06:11Z","title":"Co-synthesis of Histopathology Nuclei Image-Label Pairs using a\n Context-Conditioned Joint Diffusion Model","summary":" In multi-class histopathology nuclei analysis tasks, the lack of training\ndata becomes a main bottleneck for the performance of learning-based methods.\nTo tackle this challenge, previous methods have utilized generative models to\nincrease data by generating synthetic samples. However, existing methods often\noverlook the importance of considering the context of biological tissues (e.g.,\nshape, spatial layout, and tissue type) in the synthetic data. Moreover, while\ngenerative models have shown superior performance in synthesizing realistic\nhistopathology images, none of the existing methods are capable of producing\nimage-label pairs at the same time. In this paper, we introduce a novel\nframework for co-synthesizing histopathology nuclei images and paired semantic\nlabels using a context-conditioned joint diffusion model. We propose\nconditioning of a diffusion model using nucleus centroid layouts with\nstructure-related text prompts to incorporate spatial and structural context\ninformation into the generation targets. Moreover, we enhance the granularity\nof our synthesized semantic labels by generating instance-wise nuclei labels\nusing distance maps synthesized concurrently in conjunction with the images and\nsemantic labels. We demonstrate the effectiveness of our framework in\ngenerating high-quality samples on multi-institutional, multi-organ, and\nmulti-modality datasets. Our synthetic data consistently outperforms existing\naugmentation methods in the downstream tasks of nuclei segmentation and\nclassification.\n","authors":["Seonghui Min","Hyun-Jic Oh","Won-Ki Jeong"],"pdf_url":"https://arxiv.org/pdf/2407.14434v2.pdf","comment":"ECCV 2024 accepted"},{"id":"http://arxiv.org/abs/2409.02224v1","updated":"2024-09-03T18:53:32Z","published":"2024-09-03T18:53:32Z","title":"EgoPressure: A Dataset for Hand Pressure and Pose Estimation in\n Egocentric Vision","summary":" Estimating touch contact and pressure in egocentric vision is a central task\nfor downstream applications in Augmented Reality, Virtual Reality, as well as\nmany robotic applications, because it provides precise physical insights into\nhand-object interaction and object manipulation. However, existing contact\npressure datasets lack egocentric views and hand poses, which are essential for\naccurate estimation during in-situ operation, both for AR/VR interaction and\nrobotic manipulation. In this paper, we introduce EgoPressure,a novel dataset\nof touch contact and pressure interaction from an egocentric perspective,\ncomplemented with hand pose meshes and fine-grained pressure intensities for\neach contact. The hand poses in our dataset are optimized using our proposed\nmulti-view sequence-based method that processes footage from our capture rig of\n8 accurately calibrated RGBD cameras. 
EgoPressure comprises 5.0 hours of touch\ncontact and pressure interaction from 21 participants captured by a moving\negocentric camera and 7 stationary Kinect cameras, which provided RGB images\nand depth maps at 30 Hz. In addition, we provide baselines for estimating\npressure with different modalities, which will enable future developments and\nbenchmarking on the dataset. Overall, we demonstrate that pressure and hand\nposes are complementary, which supports our intention to better facilitate the\nphysical understanding of hand-object interactions in AR/VR and robotics\nresearch.\n","authors":["Yiming Zhao","Taein Kwon","Paul Streli","Marc Pollefeys","Christian Holz"],"pdf_url":"https://arxiv.org/pdf/2409.02224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02865v1","updated":"2024-09-03T17:59:50Z","published":"2024-09-03T17:59:50Z","title":"Visually Grounded Speech Models for Low-resource Languages and Cognitive\n Modelling","summary":" This dissertation examines visually grounded speech (VGS) models that learn\nfrom unlabelled speech paired with images. It focuses on applications for\nlow-resource languages and understanding human language acquisition. We\nintroduce a task called visually prompted keyword localisation to detect and\nlocalise keywords in speech using images. We demonstrate the effectiveness of\nVGS models in few-shot learning scenarios for low-resource languages like\nYoruba. Additionally, we examine the mutual exclusivity bias in VGS models. Our\nmonolingual VGS model exhibits this bias, but we found that multilingualism\ndoes not affect the bias in this VGS model similarly to what is observed in\nchildren.\n","authors":["Leanne Nortje"],"pdf_url":"https://arxiv.org/pdf/2409.02865v1.pdf","comment":"PhD Dissertation"},{"id":"http://arxiv.org/abs/2409.02108v1","updated":"2024-09-03T17:59:05Z","published":"2024-09-03T17:59:05Z","title":"Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection,\n Removal, and Generation in the Era of Deep Learning","summary":" Shadows are formed when light encounters obstacles, leading to areas of\ndiminished illumination. In computer vision, shadow detection, removal, and\ngeneration are crucial for enhancing scene understanding, refining image\nquality, ensuring visual consistency in video editing, and improving virtual\nenvironments. This paper presents a comprehensive survey of shadow detection,\nremoval, and generation in images and videos within the deep learning landscape\nover the past decade, covering tasks, deep models, datasets, and evaluation\nmetrics. 
Our key contributions include a comprehensive survey of shadow\nanalysis, standardization of experimental comparisons, exploration of the\nrelationships among model size, speed, and performance, a cross-dataset\ngeneralization study, identification of open issues and future directions, and\nprovision of publicly available resources to support further research.\n","authors":["Xiaowei Hu","Zhenghao Xing","Tianyu Wang","Chi-Wing Fu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02108v1.pdf","comment":"Publicly available results, trained models, and evaluation metrics at\n https://github.com/xw-hu/Unveiling-Deep-Shadows"},{"id":"http://arxiv.org/abs/2409.02104v1","updated":"2024-09-03T17:58:03Z","published":"2024-09-03T17:58:03Z","title":"DynOMo: Online Point Tracking by Dynamic Online Monocular Gaussian\n Reconstruction","summary":" Reconstructing scenes and tracking motion are two sides of the same coin.\nTracking points allow for geometric reconstruction [14], while geometric\nreconstruction of (dynamic) scenes allows for 3D tracking of points over time\n[24, 39]. The latter was recently also exploited for 2D point tracking to\novercome occlusion ambiguities by lifting tracking directly into 3D [38].\nHowever, above approaches either require offline processing or multi-view\ncamera setups both unrealistic for real-world applications like robot\nnavigation or mixed reality. We target the challenge of online 2D and 3D point\ntracking from unposed monocular camera input introducing Dynamic Online\nMonocular Reconstruction (DynOMo). We leverage 3D Gaussian splatting to\nreconstruct dynamic scenes in an online fashion. Our approach extends 3D\nGaussians to capture new content and object motions while estimating camera\nmovements from a single RGB frame. DynOMo stands out by enabling emergence of\npoint trajectories through robust image feature reconstruction and a novel\nsimilarity-enhanced regularization term, without requiring any\ncorrespondence-level supervision. It sets the first baseline for online point\ntracking with monocular unposed cameras, achieving performance on par with\nexisting methods. We aim to inspire the community to advance online point\ntracking and reconstruction, expanding the applicability to diverse real-world\nscenarios.\n","authors":["Jenny Seidenschwarz","Qunjie Zhou","Bardienus Duisterhof","Deva Ramanan","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2409.02104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02101v1","updated":"2024-09-03T17:56:51Z","published":"2024-09-03T17:56:51Z","title":"Towards Real-World Adverse Weather Image Restoration: Enhancing\n Clearness and Semantics with Vision-Language Models","summary":" This paper addresses the limitations of adverse weather image restoration\napproaches trained on synthetic data when applied to real-world scenarios. We\nformulate a semi-supervised learning framework employing vision-language models\nto enhance restoration performance across diverse adverse weather conditions in\nreal-world settings. Our approach involves assessing image clearness and\nproviding semantics using vision-language models on real data, serving as\nsupervision signals for training restoration models. For clearness enhancement,\nwe use real-world data, utilizing a dual-step strategy with pseudo-labels\nassessed by vision-language models and weather prompt learning. 
For semantic\nenhancement, we integrate real-world data by adjusting weather conditions in\nvision-language model descriptions while preserving semantic meaning.\nAdditionally, we introduce an effective training strategy to bootstrap\nrestoration performance. Our approach achieves superior results in real-world\nadverse weather image restoration, demonstrated through qualitative and\nquantitative comparisons with state-of-the-art works.\n","authors":["Jiaqi Xu","Mengyang Wu","Xiaowei Hu","Chi-Wing Fu","Qi Dou","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02101v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02097v1","updated":"2024-09-03T17:54:39Z","published":"2024-09-03T17:54:39Z","title":"LinFusion: 1 GPU, 1 Minute, 16K Image","summary":" Modern diffusion models, particularly those utilizing a Transformer-based\nUNet for denoising, rely heavily on self-attention operations to manage complex\nspatial relationships, thus achieving impressive generation performance.\nHowever, this existing paradigm faces significant challenges in generating\nhigh-resolution visual content due to its quadratic time and memory complexity\nwith respect to the number of spatial tokens. To address this limitation, we\naim at a novel linear attention mechanism as an alternative in this paper.\nSpecifically, we begin our exploration from recently introduced models with\nlinear complexity, e.g., Mamba, Mamba2, and Gated Linear Attention, and\nidentify two key features-attention normalization and non-causal inference-that\nenhance high-resolution visual generation performance. Building on these\ninsights, we introduce a generalized linear attention paradigm, which serves as\na low-rank approximation of a wide spectrum of popular linear token mixers. To\nsave the training cost and better leverage pre-trained models, we initialize\nour models and distill the knowledge from pre-trained StableDiffusion (SD). We\nfind that the distilled model, termed LinFusion, achieves performance on par\nwith or superior to the original SD after only modest training, while\nsignificantly reducing time and memory complexity. Extensive experiments on\nSD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory\nzero-shot cross-resolution generation performance, generating high-resolution\nimages like 16K resolution. Moreover, it is highly compatible with pre-trained\nSD components, such as ControlNet and IP-Adapter, requiring no adaptation\nefforts. Codes are available at https://github.com/Huage001/LinFusion.\n","authors":["Songhua Liu","Weihao Yu","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02097v1.pdf","comment":"Work in Progress. Codes are available at\n https://github.com/Huage001/LinFusion"},{"id":"http://arxiv.org/abs/2409.02095v1","updated":"2024-09-03T17:52:03Z","published":"2024-09-03T17:52:03Z","title":"DepthCrafter: Generating Consistent Long Depth Sequences for Open-world\n Videos","summary":" Despite significant advancements in monocular depth estimation for static\nimages, estimating video depth in the open world remains challenging, since\nopen-world videos are extremely diverse in content, motion, camera movement,\nand length. We present DepthCrafter, an innovative method for generating\ntemporally consistent long depth sequences with intricate details for\nopen-world videos, without requiring any supplementary information such as\ncamera poses or optical flow. 
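The LinFusion entry above builds on linear-complexity token mixers; as background, a generic linear attention sketch (in the spirit of the kernelized attention literature, not LinFusion's specific generalized mixer) shows why memory stays linear in the number of spatial tokens. The elu+1 feature map and the epsilon are conventional choices, assumed here for illustration.

```python
import torch

def linear_attention(q, k, v):
    """O(N) attention: out ~ phi(Q) (phi(K)^T V) with row-wise
    normalisation, instead of softmax(Q K^T) V."""
    phi = lambda t: torch.nn.functional.elu(t) + 1.0       # positive features
    q, k = phi(q), phi(k)
    kv = torch.einsum("bnd,bne->bde", k, v)                 # (B, D, E), no NxN matrix
    z = 1.0 / (torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + 1e-6)
    return torch.einsum("bnd,bde,bn->bne", q, kv, z)

# Memory and time scale linearly with the token count N, which is what
# makes very high resolutions (many spatial tokens) tractable.
q = k = v = torch.randn(1, 4096, 64)
out = linear_attention(q, k, v)      # (1, 4096, 64)
```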
DepthCrafter achieves generalization ability to\nopen-world videos by training a video-to-depth model from a pre-trained\nimage-to-video diffusion model, through our meticulously designed three-stage\ntraining strategy with the compiled paired video-depth datasets. Our training\napproach enables the model to generate depth sequences with variable lengths at\none time, up to 110 frames, and harvest both precise depth details and rich\ncontent diversity from realistic and synthetic datasets. We also propose an\ninference strategy that processes extremely long videos through segment-wise\nestimation and seamless stitching. Comprehensive evaluations on multiple\ndatasets reveal that DepthCrafter achieves state-of-the-art performance in\nopen-world video depth estimation under zero-shot settings. Furthermore,\nDepthCrafter facilitates various downstream applications, including depth-based\nvisual effects and conditional video generation.\n","authors":["Wenbo Hu","Xiangjun Gao","Xiaoyu Li","Sijie Zhao","Xiaodong Cun","Yong Zhang","Long Quan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2409.02095v1.pdf","comment":"Project webpage: https://depthcrafter.github.io"},{"id":"http://arxiv.org/abs/2409.02084v1","updated":"2024-09-03T17:35:48Z","published":"2024-09-03T17:35:48Z","title":"GraspSplats: Efficient Manipulation with 3D Feature Splatting","summary":" The ability for robots to perform efficient and zero-shot grasping of object\nparts is crucial for practical applications and is becoming prevalent with\nrecent advances in Vision-Language Models (VLMs). To bridge the 2D-to-3D gap\nfor representations to support such a capability, existing methods rely on\nneural fields (NeRFs) via differentiable rendering or point-based projection\nmethods. However, we demonstrate that NeRFs are inappropriate for scene changes\ndue to their implicitness and point-based methods are inaccurate for part\nlocalization without rendering-based optimization. To amend these issues, we\npropose GraspSplats. Using depth supervision and a novel reference feature\ncomputation method, GraspSplats generates high-quality scene representations in\nunder 60 seconds. We further validate the advantages of Gaussian-based\nrepresentation by showing that the explicit and optimized geometry in\nGraspSplats is sufficient to natively support (1) real-time grasp sampling and\n(2) dynamic and articulated object manipulation with point trackers. With\nextensive experiments on a Franka robot, we demonstrate that GraspSplats\nsignificantly outperforms existing methods under diverse task settings. In\nparticular, GraspSplats outperforms NeRF-based methods like F3RM and LERF-TOGO,\nand 2D detection methods.\n","authors":["Mazeyu Ji","Ri-Zhao Qiu","Xueyan Zou","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02084v1.pdf","comment":"Project webpage: https://graspsplats.github.io/"},{"id":"http://arxiv.org/abs/2409.02081v1","updated":"2024-09-03T17:32:35Z","published":"2024-09-03T17:32:35Z","title":"Physical Rule-Guided Convolutional Neural Network","summary":" The black-box nature of Convolutional Neural Networks (CNNs) and their\nreliance on large datasets limit their use in complex domains with limited\nlabeled data. Physics-Guided Neural Networks (PGNNs) have emerged to address\nthese limitations by integrating scientific principles and real-world\nknowledge, enhancing model interpretability and efficiency. 
This paper proposes\na novel Physics-Guided CNN (PGCNN) architecture that incorporates dynamic,\ntrainable, and automated LLM-generated, widely recognized rules integrated into\nthe model as custom layers to address challenges like limited data and low\nconfidence scores. The PGCNN is evaluated on multiple datasets, demonstrating\nsuperior performance compared to a baseline CNN model. Key improvements include\na significant reduction in false positives and enhanced confidence scores for\ntrue detection. The results highlight the potential of PGCNNs to improve CNN\nperformance for broader application areas.\n","authors":["Kishor Datta Gupta","Marufa Kamal","Rakib Hossain Rifat","Mohd Ariful Haque","Roy George"],"pdf_url":"https://arxiv.org/pdf/2409.02081v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.17344v2","updated":"2024-09-03T10:50:17Z","published":"2024-08-30T15:16:52Z","title":"rerankers: A Lightweight Python Library to Unify Ranking Methods","summary":" This paper presents rerankers, a Python library which provides an easy-to-use\ninterface to the most commonly used re-ranking approaches. Re-ranking is an\nintegral component of many retrieval pipelines; however, there exist numerous\napproaches to it, relying on different implementation methods. rerankers\nunifies these methods into a single user-friendly interface, allowing\npractitioners and researchers alike to explore different methods while only\nchanging a single line of Python code. Moreover ,rerankers ensures that its\nimplementations are done with the fewest dependencies possible, and re-uses the\noriginal implementation whenever possible, guaranteeing that our simplified\ninterface results in no performance degradation compared to more complex ones.\nThe full source code and list of supported models are updated regularly and\navailable at https://github.com/answerdotai/rerankers.\n","authors":["Benjamin Clavié"],"pdf_url":"https://arxiv.org/pdf/2408.17344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06242v2","updated":"2024-09-03T00:21:23Z","published":"2024-05-10T04:44:34Z","title":"Impedance vs. Power Side-channel Vulnerabilities: A Comparative Study","summary":" In recent times, impedance side-channel analysis has emerged as a potent\nstrategy for adversaries seeking to extract sensitive information from\ncomputing systems. It leverages variations in the intrinsic impedance of a\nchip's internal structure across different logic states. In this study, we\nconduct a comparative analysis between the newly explored impedance side\nchannel and the well-established power side channel. Through experimental\nevaluation, we investigate the efficacy of these two side channels in\nextracting the cryptographic key from the Advanced Encryption Standard (AES)\nand analyze their performance. Our results indicate that impedance analysis\ndemonstrates a higher potential for cryptographic key extraction compared to\npower side-channel analysis. Moreover, we identify scenarios where power\nside-channel analysis does not yield satisfactory results, whereas impedance\nanalysis proves to be more robust and effective. 
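The rerankers entry above advertises swapping re-ranking methods with a one-line change. A hypothetical usage sketch is shown below, based on my recollection of the project README; exact class names, method keywords, and signatures may differ between versions, so the repository (https://github.com/answerdotai/rerankers) should be treated as the authority.

```python
# Hypothetical usage, assuming the interface advertised in the rerankers
# README; names may differ between library versions.
from rerankers import Reranker

ranker = Reranker("cross-encoder")          # pick a re-ranking backend
results = ranker.rank(
    query="how do I unify re-ranking methods?",
    docs=[
        "rerankers wraps many re-ranking approaches behind one interface.",
        "Unrelated text about gardening.",
    ],
)
# Switching approaches is meant to be a one-line change, e.g.
# Reranker("colbert") or Reranker("t5") instead of the cross-encoder.
```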
This work not only underscores\nthe significance of impedance side-channel analysis in enhancing cryptographic\nsecurity but also emphasizes the necessity for a deeper understanding of its\nmechanisms and implications.\n","authors":["Md Sadik Awal","Buddhipriya Gayanath","Md Tauhidur Rahman"],"pdf_url":"https://arxiv.org/pdf/2405.06242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01736v1","updated":"2024-09-03T09:25:26Z","published":"2024-09-03T09:25:26Z","title":"SpannerLib: Embedding Declarative Information Extraction in an\n Imperative Workflow","summary":" Document spanners have been proposed as a formal framework for declarative\nInformation Extraction (IE) from text, following IE products from the industry\nand academia. Over the past decade, the framework has been studied thoroughly\nin terms of expressive power, complexity, and the ability to naturally combine\ntext analysis with relational querying. This demonstration presents SpannerLib\na library for embedding document spanners in Python code. SpannerLib\nfacilitates the development of IE programs by providing an implementation of\nSpannerlog (Datalog-based documentspanners) that interacts with the Python code\nin two directions: rules can be embedded inside Python, and they can invoke\ncustom Python code (e.g., calls to ML-based NLP models) via user-defined\nfunctions. The demonstration scenarios showcase IE programs, with increasing\nlevels of complexity, within Jupyter Notebook.\n","authors":["Dean Light","Ahmad Aiashy","Mahmoud Diab","Daniel Nachmias","Stijn Vansummeren","Benny Kimelfeld"],"pdf_url":"https://arxiv.org/pdf/2409.01736v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2409.01605v1","updated":"2024-09-03T04:55:03Z","published":"2024-09-03T04:55:03Z","title":"Laser: Parameter-Efficient LLM Bi-Tuning for Sequential Recommendation\n with Collaborative Information","summary":" Sequential recommender systems are essential for discerning user preferences\nfrom historical interactions and facilitating targeted recommendations. Recent\ninnovations employing Large Language Models (LLMs) have advanced the field by\nencoding item semantics, yet they often necessitate substantial parameter\ntuning and are resource-demanding. Moreover, these works fails to consider the\ndiverse characteristics of different types of users and thus diminishes the\nrecommendation accuracy. In this paper, we propose a parameter-efficient Large\nLanguage Model Bi-Tuning framework for sequential recommendation with\ncollaborative information (Laser). Specifically, Bi-Tuning works by inserting\ntrainable virtual tokens at both the prefix and suffix of the input sequence\nand freezing the LLM parameters, thus optimizing the LLM for the sequential\nrecommendation. In our Laser, the prefix is utilized to incorporate user-item\ncollaborative information and adapt the LLM to the recommendation task, while\nthe suffix converts the output embeddings of the LLM from the language space to\nthe recommendation space for the follow-up item recommendation. Furthermore, to\ncapture the characteristics of different types of users when integrating the\ncollaborative information via the prefix, we introduce M-Former, a lightweight\nMoE-based querying transformer that uses a set of query experts to integrate\ndiverse user-specific collaborative information encoded by frozen ID-based\nsequential recommender systems, significantly improving the accuracy of\nrecommendations. 
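The Laser entry above describes Bi-Tuning as inserting trainable virtual tokens at the prefix and suffix of the input while freezing the LLM. A generic prefix/suffix-tuning sketch in PyTorch is given below to illustrate that mechanism; the HuggingFace-style `inputs_embeds` interface, the token counts, and the use of suffix hidden states as recommendation embeddings are simplifying assumptions, not the authors' exact implementation (which also involves the M-Former).

```python
import torch
import torch.nn as nn

class BiTuningWrapper(nn.Module):
    """Illustrative prefix/suffix tuning around a frozen decoder LM:
    trainable virtual tokens are prepended (to inject collaborative
    information) and appended (to map outputs toward the recommendation
    space); the LM's own weights stay frozen."""
    def __init__(self, lm, hidden_size, n_prefix=16, n_suffix=4):
        super().__init__()
        self.lm = lm
        for p in self.lm.parameters():
            p.requires_grad = False
        self.prefix = nn.Parameter(torch.randn(n_prefix, hidden_size) * 0.02)
        self.suffix = nn.Parameter(torch.randn(n_suffix, hidden_size) * 0.02)

    def forward(self, token_embeds):          # (B, L, H) embedded item sequence
        B = token_embeds.size(0)
        pre = self.prefix.unsqueeze(0).expand(B, -1, -1)
        suf = self.suffix.unsqueeze(0).expand(B, -1, -1)
        inputs = torch.cat([pre, token_embeds, suf], dim=1)
        # Assumes a HuggingFace-style model that accepts `inputs_embeds`.
        hidden = self.lm(inputs_embeds=inputs).last_hidden_state
        # Treat the suffix positions as recommendation-space embeddings.
        return hidden[:, -suf.size(1):, :]
```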
Extensive experiments on real-world datasets demonstrate that\nLaser can parameter-efficiently adapt LLMs to effective recommender systems,\nsignificantly outperforming state-of-the-art methods.\n","authors":["Xinyu Zhang","Linmei Hu","Luhao Zhang","Dandan Song","Heyan Huang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2409.01605v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.01563v1","updated":"2024-09-03T03:00:59Z","published":"2024-09-03T03:00:59Z","title":"Blockchain-based Federated Recommendation with Incentive Mechanism","summary":" Nowadays, federated recommendation technology is rapidly evolving to help\nmultiple organisations share data and train models while meeting user privacy,\ndata security and government regulatory requirements. However, federated\nrecommendation increases customer system costs such as power, computational and\ncommunication resources. Besides, federated recommendation systems are also\nsusceptible to model attacks and data poisoning by participating malicious\nclients. Therefore, most customers are unwilling to participate in federated\nrecommendation without any incentive. To address these problems, we propose a\nblockchain-based federated recommendation system with incentive mechanism to\npromote more trustworthy, secure, and efficient federated recommendation\nservice. First, we construct a federated recommendation system based on NeuMF\nand FedAvg. Then we introduce a reverse auction mechanism to select optimal\nclients that can maximize the social surplus. Finally, we employ blockchain for\non-chain evidence storage of models to ensure the safety of the federated\nrecommendation system. The experimental results show that our proposed\nincentive mechanism can attract clients with superior training data to engage\nin the federal recommendation at a lower cost, which can increase the economic\nbenefit of federal recommendation by 54.9\\% while improve the recommendation\nperformance. Thus our work provides theoretical and technological support for\nthe construction of a harmonious and healthy ecological environment for the\napplication of federal recommendation.\n","authors":["Jianhai Chen","Yanlin Wu","Dazhong Rong","Guoyao Yu","Lingqi Jiang","Zhenguang Liu","Peng Zhou","Rui Shen"],"pdf_url":"https://arxiv.org/pdf/2409.01563v1.pdf","comment":"This paper has been accepted on 2024 Blockchain and Web3 Technology\n Innovation and Application Exchange Conference (BWTAC 2024)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.08803v2","updated":"2024-09-03T16:59:07Z","published":"2024-07-11T18:23:46Z","title":"PID Accelerated Temporal Difference Algorithms","summary":" Long-horizon tasks, which have a large discount factor, pose a challenge for\nmost conventional reinforcement learning (RL) algorithms. Algorithms such as\nValue Iteration and Temporal Difference (TD) learning have a slow convergence\nrate and become inefficient in these tasks. When the transition distributions\nare given, PID VI was recently introduced to accelerate the convergence of\nValue Iteration using ideas from control theory. Inspired by this, we introduce\nPID TD Learning and PID Q-Learning algorithms for the RL setting, in which only\nsamples from the environment are available. We give a theoretical analysis of\nthe convergence of PID TD Learning and its acceleration compared to the\nconventional TD Learning. 
We also introduce a method for adapting PID gains in\nthe presence of noise and empirically verify its effectiveness.\n","authors":["Mark Bedaywi","Amin Rakhsha","Amir-massoud Farahmand"],"pdf_url":"https://arxiv.org/pdf/2407.08803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09075v2","updated":"2024-09-03T16:47:09Z","published":"2024-08-17T02:26:29Z","title":"Improving Rare Word Translation With Dictionaries and Attention Masking","summary":" In machine translation, rare words continue to be a problem for the dominant\nencoder-decoder architecture, especially in low-resource and out-of-domain\ntranslation settings. Human translators solve this problem with monolingual or\nbilingual dictionaries. In this paper, we propose appending definitions from a\nbilingual dictionary to source sentences and using attention masking to link\ntogether rare words with their definitions. We find that including definitions\nfor rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1.\n","authors":["Kenneth J. Sible","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2408.09075v2.pdf","comment":"11 pages, 3 figures, 3 tables. Accepted at AMTA 2024"},{"id":"http://arxiv.org/abs/2406.06385v3","updated":"2024-09-03T16:36:06Z","published":"2024-06-10T15:44:22Z","title":"Low-Rank Quantization-Aware Training for LLMs","summary":" Large language models (LLMs) are omnipresent, however their practical\ndeployment is challenging due to their ever increasing computational and memory\ndemands. Quantization is one of the most effective ways to make them more\ncompute and memory efficient. Quantization-aware training (QAT) methods,\ngenerally produce the best quantized performance, however it comes at the cost\nof potentially long training time and excessive memory usage, making it\nimpractical when applying for LLMs. Inspired by parameter-efficient fine-tuning\n(PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a\nlightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several\ncomponents to save memory without sacrificing predictive performance: (a)\nlow-rank auxiliary weights that are aware of the quantization grid; (b) a\ndowncasting operator using fixed-point or double-packed integers and (c)\ncheckpointing. Unlike most related work, our method (i) is inference-efficient,\nleading to no additional overhead compared to traditional PTQ; (ii) can be seen\nas a general extended pretraining framework, meaning that the resulting model\ncan still be utilized for any downstream task afterwards; (iii) can be applied\nacross a wide range of quantization settings, such as different choices\nquantization granularity, activation quantization, and seamlessly combined with\nmany PTQ techniques. We apply LR-QAT to LLaMA-1/2/3 and Mistral model families\nand validate its effectiveness on several downstream tasks. Our method\noutperforms common post-training quantization (PTQ) approaches and reaches the\nsame model performance as full-model QAT at the fraction of its memory usage.\nSpecifically, we can train a 7B LLM on a single consumer grade GPU with 24GB of\nmemory. 
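As a rough illustration of the control-theoretic idea behind the PID TD Learning entry above, the tabular sketch below applies proportional, integral, and derivative terms to the TD error. This is only my reading of the general PID-acceleration idea, not the authors' algorithm or gain-adaptation scheme; the decay factor and gains are placeholders.

```python
import numpy as np

def pid_td_update(V, V_prev, z, s, r, s_next, gamma, alpha,
                  kp=1.0, ki=0.0, kd=0.0, beta=0.95):
    """One tabular update treating the TD error as the control error.
    P: current TD error; I: decayed running sum of TD errors;
    D: change of the value estimate between iterations."""
    td = r + gamma * V[s_next] - V[s]          # proportional term
    z[s] = beta * z[s] + td                    # integral term
    deriv = V[s] - V_prev[s]                   # derivative term
    V_prev[s] = V[s]
    V[s] += alpha * (kp * td + ki * z[s] + kd * deriv)
    return V, V_prev, z

# Toy usage on a 5-state chain (illustrative values only).
V, V_prev, z = np.zeros(5), np.zeros(5), np.zeros(5)
V, V_prev, z = pid_td_update(V, V_prev, z, s=0, r=1.0, s_next=1,
                             gamma=0.99, alpha=0.1, kp=1.0, ki=0.05, kd=0.2)
```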
Our source code is available at\nhttps://github.com/qualcomm-ai-research/LR-QAT\n","authors":["Yelysei Bondarenko","Riccardo Del Chiaro","Markus Nagel"],"pdf_url":"https://arxiv.org/pdf/2406.06385v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15126v3","updated":"2024-09-03T15:30:27Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) simulations are irreplaceable and ubiquitous in\nfields of materials science, chemistry, pharmacology just to name a few.\nConventional MD simulations are plagued by numerical stability as well as long\nequilibration time issues, which limits broader applications of MD simulations.\nRecently, a surge of deep learning approaches have been devised for\ntime-coarsened dynamics, which learns the state transition mechanism over much\nlarger time scales to overcome these limitations. However, only a few methods\ntarget the underlying Boltzmann distribution by resampling techniques, where\nproposals are rarely accepted as new states with low efficiency. In this work,\nwe propose a force-guided bridge matching model, FBM, a novel framework that\nfirst incorporates physical priors into bridge matching for full-atom\ntime-coarsened dynamics. With the guidance of our well-designed intermediate\nforce field, FBM is feasible to target the Boltzmann-like distribution by\ndirect inference without extra steps. Experiments on small peptides verify our\nsuperiority in terms of comprehensive metrics and demonstrate transferability\nto unseen peptide systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13713v3","updated":"2024-09-03T15:28:37Z","published":"2024-08-25T03:26:00Z","title":"Verifiable cloud-based variational quantum algorithms","summary":" Variational quantum algorithms (VQAs) have shown potential for quantum\nadvantage with noisy intermediate-scale quantum (NISQ) devices for quantum\nmachine learning (QML). However, given the high cost and limited availability\nof quantum resources, delegating VQAs via cloud networks is a more practical\nsolution for clients with limited quantum capabilities. Recently, Shingu et\nal.[Physical Review A, 105, 022603 (2022)] proposed a variational secure cloud\nquantum computing protocol, utilizing ancilla-driven quantum computation (ADQC)\nfor cloud-based VQAs with minimal quantum resource consumption. However, their\nprotocol lacks verifiability, which exposes it to potential malicious behaviors\nby the server. Additionally, channel loss requires frequent re-delegation as\nthe size of the delegated variational circuit grows, complicating verification\ndue to increased circuit complexity. This paper introduces a new protocol to\naddress these challenges and enhance both verifiability and tolerance to\nchannel loss in cloud-based VQAs.\n","authors":["Junhong Yang","Banghai Wang","Junyu Quan","Qin Li"],"pdf_url":"https://arxiv.org/pdf/2408.13713v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06425v6","updated":"2024-09-03T15:07:13Z","published":"2024-08-12T18:04:59Z","title":"Bayesian Learning in a Nonlinear Multiscale State-Space Model","summary":" The ubiquity of multiscale interactions in complex systems is\nwell-recognized, with development and heredity serving as a prime example of\nhow processes at different temporal scales influence one another. 
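Referring back to the LR-QAT entry above, whose key component is a low-rank auxiliary weight that is aware of the quantization grid: the sketch below shows one conceptual way such a layer could look, with a frozen pretrained weight, a trainable low-rank correction, and a straight-through rounding estimator. The downcasting operator and checkpointing are omitted, and all specifics here (rank, bit width, STE form) are illustrative assumptions rather than the paper's implementation.

```python
import torch
import torch.nn as nn

class LRQATLinear(nn.Module):
    """Conceptual sketch: frozen weights plus a trainable low-rank
    correction, quantized together so the adapter 'sees' the integer grid."""
    def __init__(self, weight, rank=8, n_bits=4):
        super().__init__()
        self.register_buffer("w0", weight)        # frozen pretrained weights
        out_f, in_f = weight.shape
        self.A = nn.Parameter(torch.zeros(out_f, rank))
        self.B = nn.Parameter(torch.randn(rank, in_f) * 0.01)
        self.qmax = 2 ** (n_bits - 1) - 1
        self.scale = nn.Parameter(weight.abs().max() / self.qmax)

    def quantize(self, w):
        q = torch.clamp(torch.round(w / self.scale), -self.qmax - 1, self.qmax)
        q = (q - w / self.scale).detach() + w / self.scale   # straight-through
        return q * self.scale

    def forward(self, x):                          # x: (B, in_f)
        w = self.quantize(self.w0 + self.A @ self.B)
        return x @ w.t()
```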
This work\nintroduces a novel multiscale state-space model to explore the dynamic\ninterplay between systems interacting across different time scales, with\nfeedback between each scale. We propose a Bayesian learning framework to\nestimate unknown states by learning the unknown process noise covariances\nwithin this multiscale model. We develop a Particle Gibbs with Ancestor\nSampling (PGAS) algorithm for inference and demonstrate through simulations the\nefficacy of our approach.\n","authors":["Nayely Vélez-Cruz","Manfred D. Laubichler"],"pdf_url":"https://arxiv.org/pdf/2408.06425v6.pdf","comment":"Corrected a typo"},{"id":"http://arxiv.org/abs/2408.14340v3","updated":"2024-09-03T14:53:34Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. 
Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wenhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16945v2","updated":"2024-09-03T14:20:02Z","published":"2024-08-29T23:51:51Z","title":"Different Victims, Same Layout: Email Visual Similarity Detection for\n Enhanced Email Protection","summary":" In the pursuit of an effective spam detection system, the focus has often\nbeen on identifying known spam patterns either through rule-based detection\nsystems or machine learning (ML) solutions that rely on keywords. However, both\nsystems are susceptible to evasion techniques and zero-day attacks that can be\nachieved at low cost. Therefore, an email that bypassed the defense system once\ncan do it again in the following days, even though rules are updated or the ML\nmodels are retrained. The recurrence of failures to detect emails that exhibit\nlayout similarities to previously undetected spam is concerning for customers\nand can erode their trust in a company. Our observations show that threat\nactors reuse email kits extensively and can bypass detection with little\neffort, for example, by making changes to the content of emails. In this work,\nwe propose an email visual similarity detection approach, named Pisco, to\nimprove the detection capabilities of an email threat defense system. We apply\nour proof of concept to some real-world samples received from different\nsources. Our results show that email kits are being reused extensively and\nvisually similar emails are sent to our customers at various time intervals.\nTherefore, this method could be very helpful in situations where detection\nfeatures that rely on textual features and keywords are bypassed, an occurrence\nour observations show happens frequently.\n","authors":["Sachin Shukla","Omid Mirzaei"],"pdf_url":"https://arxiv.org/pdf/2408.16945v2.pdf","comment":"To be published in the proceedings of the ACM Conference on Computer\n and Communications Security (ACM CCS 2024)"},{"id":"http://arxiv.org/abs/2402.13108v2","updated":"2024-09-03T14:09:08Z","published":"2024-02-20T16:01:42Z","title":"On the Convergence of Gradient Descent for Large Learning Rates","summary":" A vast literature on convergence guarantees for gradient descent and derived\nmethods exists at the moment. However, a simple practical situation remains\nunexplored: when a fixed step size is used, can we expect gradient descent to\nconverge starting from any initialization? We provide fundamental impossibility\nresults showing that convergence becomes impossible no matter the\ninitialization if the step size gets too big. Looking at the asymptotic value\nof the gradient norm along the optimization trajectory, we see that there is a\nphase transition as the step size crosses a critical value. This has been\nobserved by practitioners, yet the true mechanisms through which this happens\nremain unclear beyond heuristics. Using results from dynamical systems theory,\nwe provide a proof of this in the case of linear neural networks with a squared\nloss. We also prove the impossibility of convergence for more general losses\nwithout requiring strong assumptions such as Lipschitz continuity for the\ngradient. 
We validate our findings through experiments with non-linear\nnetworks.\n","authors":["Alexandru Crăciun","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2402.13108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17147v4","updated":"2024-09-03T12:55:47Z","published":"2024-04-26T04:34:45Z","title":"On the Federated Learning Framework for Cooperative Perception","summary":" Cooperative perception is essential to enhance the efficiency and safety of\nfuture transportation systems, requiring extensive data sharing among vehicles\non the road, which raises significant privacy concerns. Federated learning\noffers a promising solution by enabling data privacy-preserving collaborative\nenhancements in perception, decision-making, and planning among connected and\nautonomous vehicles (CAVs). However, federated learning is impeded by\nsignificant challenges arising from data heterogeneity across diverse clients,\npotentially diminishing model accuracy and prolonging convergence periods. This\nstudy introduces a specialized federated learning framework for CP, termed the\nfederated dynamic weighted aggregation (FedDWA) algorithm, facilitated by\ndynamic adjusting loss (DALoss) function. This framework employs dynamic client\nweighting to direct model convergence and integrates a novel loss function that\nutilizes Kullback-Leibler divergence (KLD) to counteract the detrimental\neffects of non-independently and identically distributed (Non-IID) and\nunbalanced data. Utilizing the BEV transformer as the primary model, our\nrigorous testing on the OpenV2V dataset, augmented with FedBEVT data,\ndemonstrates significant improvements in the average intersection over union\n(IoU). These results highlight the substantial potential of our federated\nlearning framework to address data heterogeneity challenges in CP, thereby\nenhancing the accuracy of environmental perception models and facilitating more\nrobust and efficient collaborative learning solutions in the transportation\nsector.\n","authors":["Zhenrong Zhang","Jianan Liu","Xi Zhou","Tao Huang","Qing-Long Han","Jingxin Liu","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17147v4.pdf","comment":"accepted by IEEE RA-L"},{"id":"http://arxiv.org/abs/2401.12843v2","updated":"2024-09-03T12:49:16Z","published":"2024-01-23T15:25:21Z","title":"An embedding-based distance for temporal graphs","summary":" Temporal graphs are commonly used to represent time-resolved relations\nbetween entities in many natural and artificial systems. Many techniques were\ndevised to investigate the evolution of temporal graphs by comparing their\nstate at different time points. However, quantifying the similarity between\ntemporal graphs as a whole is an open problem. Here, we use embeddings based on\ntime-respecting random walks to introduce a new notion of distance between\ntemporal graphs. This distance is well-defined for pairs of temporal graphs\nwith different numbers of nodes and different time spans. We study the case of\na matched pair of graphs, when a known relation exists between their nodes, and\nthe case of unmatched graphs, when such a relation is unavailable and the\ngraphs may be of different sizes. We use empirical and synthetic temporal\nnetwork data to show that the distance we introduce discriminates graphs with\ndifferent topological and temporal properties. 
We provide an efficient\nimplementation of the distance computation suitable for large-scale temporal\ngraphs.\n","authors":["Lorenzo Dall'Amico","Alain Barrat","Ciro Cattuto"],"pdf_url":"https://arxiv.org/pdf/2401.12843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10800v2","updated":"2024-09-03T12:43:37Z","published":"2024-05-17T14:10:34Z","title":"Heterogeneity-Informed Meta-Parameter Learning for Spatiotemporal Time\n Series Forecasting","summary":" Spatiotemporal time series forecasting plays a key role in a wide range of\nreal-world applications. While significant progress has been made in this area,\nfully capturing and leveraging spatiotemporal heterogeneity remains a\nfundamental challenge. Therefore, we propose a novel Heterogeneity-Informed\nMeta-Parameter Learning scheme. Specifically, our approach implicitly captures\nspatiotemporal heterogeneity through learning spatial and temporal embeddings,\nwhich can be viewed as a clustering process. Then, a novel spatiotemporal\nmeta-parameter learning paradigm is proposed to learn spatiotemporal-specific\nparameters from meta-parameter pools, which is informed by the captured\nheterogeneity. Based on these ideas, we develop a Heterogeneity-Informed\nSpatiotemporal Meta-Network (HimNet) for spatiotemporal time series\nforecasting. Extensive experiments on five widely-used benchmarks demonstrate\nour method achieves state-of-the-art performance while exhibiting superior\ninterpretability. Our code is available at\nhttps://github.com/XDZhelheim/HimNet.\n","authors":["Zheng Dong","Renhe Jiang","Haotian Gao","Hangchen Liu","Jinliang Deng","Qingsong Wen","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2405.10800v2.pdf","comment":"Published in KDD'24 Research Track"},{"id":"http://arxiv.org/abs/2406.14281v4","updated":"2024-09-03T12:38:22Z","published":"2024-06-20T13:07:06Z","title":"FairX: A comprehensive benchmarking tool for model analysis using\n fairness, utility, and explainability","summary":" We present FairX, an open-source Python-based benchmarking tool designed for\nthe comprehensive analysis of models under the umbrella of fairness, utility,\nand eXplainability (XAI). FairX enables users to train benchmarking\nbias-mitigation models, evaluate their fairness using a wide array of fairness\nand data utility metrics, and generate explanations for model predictions, all\nwithin a unified framework. Existing benchmarking tools lack the means to\nevaluate synthetic data generated from fair generative models, and they do not\nsupport training fair generative models either. In FairX, we add fair\ngenerative models to the collection of our fair-model library (pre-processing,\nin-processing, post-processing) and evaluation metrics for evaluating the\nquality of synthetic fair data. This version of FairX supports both tabular and\nimage datasets. It also allows users to provide their own custom datasets. The\nopen-source FairX benchmarking package is publicly available at\n\\url{https://github.com/fahim-sikder/FairX}.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Daniel de Leng","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2406.14281v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09360v2","updated":"2024-09-03T12:31:51Z","published":"2024-08-18T04:50:41Z","title":"Behavioral Learning of Dish Rinsing and Scrubbing based on Interruptive\n Direct Teaching Considering Assistance Rate","summary":" Robots are expected to manipulate objects in a safe and dexterous way. 
For\nexample, washing dishes is a dexterous operation that involves scrubbing the\ndishes with a sponge and rinsing them with water. It is necessary to learn it\nsafely without splashing water and without dropping the dishes. In this study,\nwe propose a safe and dexterous manipulation system. The robot learns a\ndynamics model of the object by estimating the state of the object and the\nrobot itself, the control input, and the amount of human assistance required\n(assistance rate) after the human corrects the initial trajectory of the\nrobot's hands by interruptive direct teaching. By backpropagating the error\nbetween the estimated and the reference value using the acquired dynamics\nmodel, the robot can generate a control input that approaches the reference\nvalue, for example, so that human assistance is not required and the dish does\nnot move excessively. This allows for adaptive rinsing and scrubbing of dishes\nwith unknown shapes and properties. As a result, it is possible to generate\nsafe actions that require less human assistance.\n","authors":["Shumpei Wakabayashi","Kento Kawaharazuka","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2408.09360v2.pdf","comment":"Accepted at Advanced Robotics"},{"id":"http://arxiv.org/abs/2404.02937v5","updated":"2024-09-03T11:32:50Z","published":"2024-04-03T07:14:15Z","title":"Towards Explainable Traffic Flow Prediction with Large Language Models","summary":" Traffic forecasting is crucial for intelligent transportation systems. It has\nexperienced significant advancements thanks to the power of deep learning in\ncapturing latent patterns of traffic data. However, recent deep-learning\narchitectures require intricate model designs and lack an intuitive\nunderstanding of the mapping from input data to predicted results. Achieving\nboth accuracy and explainability in traffic prediction models remains a\nchallenge due to the complexity of traffic data and the inherent opacity of\ndeep learning models. To tackle these challenges, we propose a Traffic flow\nPrediction model based on Large Language Models (LLMs) to generate explainable\ntraffic predictions, named xTP-LLM. By transferring multi-modal traffic data\ninto natural language descriptions, xTP-LLM captures complex time-series\npatterns and external factors from comprehensive traffic data. The LLM\nframework is fine-tuned using language-based instructions to align with\nspatial-temporal traffic flow data. Empirically, xTP-LLM shows competitive\naccuracy compared with deep learning baselines, while providing an intuitive\nand reliable explanation for predictions. This paper contributes to advancing\nexplainable traffic prediction models and lays a foundation for future\nexploration of LLM applications in transportation. To the best of our\nknowledge, this is the first study to use LLM for explainable prediction of\ntraffic flows.\n","authors":["Xusen Guo","Qiming Zhang","Junyue Jiang","Mingxing Peng","Meixin Zhu"," Hao"," Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02937v5.pdf","comment":"31pages, 16 figures"},{"id":"http://arxiv.org/abs/2310.02031v8","updated":"2024-09-03T10:19:52Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. 
Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reasons are the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever large language\nmodel in the ocean domain, which is expert in various ocean science tasks. We\nalso propose a novel framework to automatically obtain a large volume of ocean\ndomain instruction data, which generates instructions based on multi-agent\ncollaboration. Additionally, we construct the first oceanography benchmark,\nOceanBench, to evaluate the capabilities of LLMs in the ocean domain. Through\ncomprehensive experiments, OceanGPT not only shows a higher level of knowledge\nexpertise for ocean science tasks but also gains preliminary embodied\nintelligence capabilities in ocean technology.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Daxiong Ji","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v8.pdf","comment":"ACL2024. Project Website: http://oceangpt.zjukg.cn/"},{"id":"http://arxiv.org/abs/2405.19047v2","updated":"2024-09-03T09:25:46Z","published":"2024-05-29T12:44:41Z","title":"Statistical Context Detection for Deep Lifelong Reinforcement Learning","summary":" Context detection involves labeling segments of an online stream of data as\nbelonging to different tasks. Task labels are used in lifelong learning\nalgorithms to perform consolidation or other procedures that prevent\ncatastrophic forgetting. Inferring task labels from online experiences remains\na challenging problem. Most approaches assume finite and low-dimension\nobservation spaces or a preliminary training phase during which task labels are\nlearned. Moreover, changes in the transition or reward functions can be\ndetected only in combination with a policy, and therefore are more difficult to\ndetect than changes in the input distribution. This paper presents an approach\nto learning both policies and labels in an online deep reinforcement learning\nsetting. The key idea is to use distance metrics, obtained via optimal\ntransport methods, i.e., Wasserstein distance, on suitable latent action-reward\nspaces to measure distances between sets of data points from past and current\nstreams. Such distances can then be used for statistical tests based on an\nadapted Kolmogorov-Smirnov calculation to assign labels to sequences of\nexperiences. A rollback procedure is introduced to learn multiple policies by\nensuring that only the appropriate data is used to train the corresponding\npolicy. The combination of task detection and policy deployment allows for the\noptimization of lifelong reinforcement learning agents without an oracle that\nprovides task labels. The approach is tested using two benchmarks and the\nresults show promising performance when compared with related context detection\nalgorithms. The results suggest that optimal transport statistical methods\nprovide an explainable and justifiable procedure for online context detection\nand reward optimization in lifelong reinforcement learning.\n","authors":["Jeffery Dick","Saptarshi Nath","Christos Peridis","Eseoghene Benjamin","Soheil Kolouri","Andrea Soltoggio"],"pdf_url":"https://arxiv.org/pdf/2405.19047v2.pdf","comment":"10 pages excluding references and bibliography. 
Accepted at CoLLAs\n 2024"},{"id":"http://arxiv.org/abs/2308.03887v3","updated":"2024-09-03T08:53:32Z","published":"2023-08-04T15:57:28Z","title":"Enhancing Cell Tracking with a Time-Symmetric Deep Learning Approach","summary":" The accurate tracking of live cells using video microscopy recordings remains\na challenging task for popular state-of-the-art image processing based object\ntracking methods. In recent years, several existing and new applications have\nattempted to integrate deep-learning based frameworks for this task, but most\nof them still heavily rely on consecutive frame based tracking embedded in\ntheir architecture or other premises that hinder generalized learning. To\naddress this issue, we aimed to develop a new deep-learning based tracking\nmethod that relies solely on the assumption that cells can be tracked based on\ntheir spatio-temporal neighborhood, without restricting it to consecutive\nframes. The proposed method has the additional benefit that the motion patterns\nof the cells can be learned completely by the predictor without any prior\nassumptions, and it has the potential to handle a large number of video frames\nwith heavy artifacts. The efficacy of the proposed method is demonstrated\nthrough biologically motivated validation strategies and compared against\nmultiple state-of-the-art cell tracking methods.\n","authors":["Gergely Szabó","Paolo Bonaiuti","Andrea Ciliberto","András Horváth"],"pdf_url":"https://arxiv.org/pdf/2308.03887v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17129v2","updated":"2024-09-03T08:45:37Z","published":"2024-08-30T09:14:38Z","title":"Controllable Edge-Type-Specific Interpretation in Multi-Relational Graph\n Neural Networks for Drug Response Prediction","summary":" Graph Neural Networks have been widely applied in critical decision-making\nareas that demand interpretable predictions, leading to the flourishing\ndevelopment of interpretability algorithms. However, current graph\ninterpretability algorithms tend to emphasize generality and often overlook\nbiological significance, thereby limiting their applicability in predicting\ncancer drug responses. In this paper, we propose a novel post-hoc\ninterpretability algorithm for cancer drug response prediction, CETExplainer,\nwhich incorporates a controllable edge-type-specific weighting mechanism. It\nconsiders the mutual information between subgraphs and predictions, proposing a\nstructural scoring approach to provide fine-grained, biologically meaningful\nexplanations for predictive models. We also introduce a method for constructing\nground truth based on real-world datasets to quantitatively evaluate the\nproposed interpretability algorithm. Empirical analysis on the real-world\ndataset demonstrates that CETExplainer achieves superior stability and improves\nexplanation quality compared to leading algorithms, thereby offering a robust\nand insightful tool for cancer drug prediction.\n","authors":["Xiaodi Li","Jianfeng Gui","Qian Gao","Haoyuan Shi","Zhenyu Yue"],"pdf_url":"https://arxiv.org/pdf/2408.17129v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15640v3","updated":"2024-09-03T08:35:15Z","published":"2024-08-28T08:52:14Z","title":"GANs Conditioning Methods: A Survey","summary":" In recent years, Generative Adversarial Networks (GANs) have seen significant\nadvancements, leading to their widespread adoption across various fields. 
The\noriginal GAN architecture enables the generation of images without any specific\ncontrol over the content, making it an unconditional generation process.\nHowever, many practical applications require precise control over the generated\noutput, which has led to the development of conditional GANs (cGANs) that\nincorporate explicit conditioning to guide the generation process. cGANs extend\nthe original framework by incorporating additional information (conditions),\nenabling the generation of samples that adhere to that specific criteria.\nVarious conditioning methods have been proposed, each differing in how they\nintegrate the conditioning information into both the generator and the\ndiscriminator networks. In this work, we review the conditioning methods\nproposed for GANs, exploring the characteristics of each method and\nhighlighting their unique mechanisms and theoretical foundations. Furthermore,\nwe conduct a comparative analysis of these methods, evaluating their\nperformance on various image datasets. Through these analyses, we aim to\nprovide insights into the strengths and limitations of various conditioning\ntechniques, guiding future research and application in generative modeling.\n","authors":["Anis Bourou","Valérie Mezger","Auguste Genovesio"],"pdf_url":"https://arxiv.org/pdf/2408.15640v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01082v2","updated":"2024-09-03T07:59:58Z","published":"2023-12-02T09:20:10Z","title":"A Survey on Stability of Learning with Limited Labelled Data and its\n Sensitivity to the Effects of Randomness","summary":" Learning with limited labelled data, such as prompting, in-context learning,\nfine-tuning, meta-learning or few-shot learning, aims to effectively train a\nmodel using only a small amount of labelled samples. However, these approaches\nhave been observed to be excessively sensitive to the effects of uncontrolled\nrandomness caused by non-determinism in the training process. The randomness\nnegatively affects the stability of the models, leading to large variances in\nresults across training runs. When such sensitivity is disregarded, it can\nunintentionally, but unfortunately also intentionally, create an imaginary\nperception of research progress. Recently, this area started to attract\nresearch attention and the number of relevant studies is continuously growing.\nIn this survey, we provide a comprehensive overview of 415 papers addressing\nthe effects of randomness on the stability of learning with limited labelled\ndata. We distinguish between four main tasks addressed in the papers\n(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness\neffects), providing findings for each one. Furthermore, we identify and discuss\nseven challenges and open problems together with possible directions to\nfacilitate further research. The ultimate goal of this survey is to emphasise\nthe importance of this growing research area, which so far has not received an\nappropriate level of attention, and reveal impactful directions for future\nresearch.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2312.01082v2.pdf","comment":"Accepted to ACM Comput. Surv. 2024"},{"id":"http://arxiv.org/abs/2310.14481v2","updated":"2024-09-03T07:46:24Z","published":"2023-10-23T01:25:44Z","title":"Efficient Heterogeneous Graph Learning via Random Projection","summary":" Heterogeneous Graph Neural Networks (HGNNs) are powerful tools for deep\nlearning on heterogeneous graphs. 
Typical HGNNs require repetitive message\npassing during training, limiting efficiency for large-scale real-world graphs.\nRecent pre-computation-based HGNNs use one-time message passing to transform a\nheterogeneous graph into regular-shaped tensors, enabling efficient mini-batch\ntraining. Existing pre-computation-based HGNNs can be mainly categorized into\ntwo styles, which differ in how much information loss is allowed and\nefficiency. We propose a hybrid pre-computation-based HGNN, named Random\nProjection Heterogeneous Graph Neural Network (RpHGNN), which combines the\nbenefits of one style's efficiency with the low information loss of the other\nstyle. To achieve efficiency, the main framework of RpHGNN consists of\npropagate-then-update iterations, where we introduce a Random Projection\nSquashing step to ensure that complexity increases only linearly. To achieve\nlow information loss, we introduce a Relation-wise Neighbor Collection\ncomponent with an Even-odd Propagation Scheme, which aims to collect\ninformation from neighbors in a finer-grained way. Experimental results\nindicate that our approach achieves state-of-the-art results on seven small and\nlarge benchmark datasets while also being 230% faster compared to the most\neffective baseline. Surprisingly, our approach not only surpasses\npre-processing-based baselines but also outperforms end-to-end methods.\n","authors":["Jun Hu","Bryan Hooi","Bingsheng He"],"pdf_url":"https://arxiv.org/pdf/2310.14481v2.pdf","comment":"Accepted by IEEE Transactions on Knowledge and Data Engineering\n (TKDE)"},{"id":"http://arxiv.org/abs/2402.01306v3","updated":"2024-09-03T07:41:51Z","published":"2024-02-02T10:53:36Z","title":"KTO: Model Alignment as Prospect Theoretic Optimization","summary":" Kahneman & Tversky's $\\textit{prospect theory}$ tells us that humans perceive\nrandom variables in a biased but well-defined manner (1992); for example,\nhumans are famously loss-averse. We show that objectives for aligning LLMs with\nhuman feedback implicitly incorporate many of these biases -- the success of\nthese objectives (e.g., DPO) over cross-entropy minimization can partly be\nascribed to them belonging to a family of loss functions that we call\n$\\textit{human-aware losses}$ (HALOs). However, the utility functions these\nmethods attribute to humans still differ from those in the prospect theory\nliterature. Using a Kahneman-Tversky model of human utility, we propose a HALO\nthat directly maximizes the utility of generations instead of maximizing the\nlog-likelihood of preferences, as current methods do. We call this approach\nKTO, and it matches or exceeds the performance of preference-based methods at\nscales from 1B to 30B, despite only learning from a binary signal of whether an\noutput is desirable. More broadly, our work suggests that there is no one HALO\nthat is universally superior; the best loss depends on the inductive biases\nmost appropriate for a given setting, an oft-overlooked consideration.\n","authors":["Kawin Ethayarajh","Winnie Xu","Niklas Muennighoff","Dan Jurafsky","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.01306v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2310.18542v2","updated":"2024-09-03T07:34:54Z","published":"2023-10-28T00:15:10Z","title":"End-to-end Feature Selection Approach for Learning Skinny Trees","summary":" We propose a new optimization-based approach for feature selection in tree\nensembles, an important problem in statistics and machine learning. 
Popular\ntree ensemble toolkits, e.g., Gradient Boosted Trees and Random Forests,\nsupport post-training feature selection based on feature importance scores;\nwhile widely used, such importance-based selection is known to have drawbacks.\nWe propose Skinny Trees: an end-to-end toolkit for feature selection in tree\nensembles where we train a tree ensemble while controlling the number of\nselected features. Our optimization-based approach learns an ensemble of\ndifferentiable trees, and simultaneously performs feature selection using a\ngrouped $\\ell_0$-regularizer. We use first-order methods for optimization and\npresent convergence guarantees for our approach. We use a dense-to-sparse\nregularization scheduling scheme that can lead to more expressive and sparser\ntree ensembles. On 15 synthetic and real-world datasets, Skinny Trees can\nachieve $1.5\\!\\times\\! -~620~\\!\\times\\!$ feature compression rates, leading up\nto $10\\times$ faster inference over dense trees, without any loss in\nperformance. Skinny Trees leads to better feature selection than many existing\ntoolkits, e.g., in terms of AUC performance for a 25\\% feature budget, Skinny\nTrees outperforms LightGBM by $10.2\\%$ (up to $37.7\\%$), and Random Forests by\n$3\\%$ (up to $12.5\\%$).\n","authors":["Shibal Ibrahim","Kayhan Behdin","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2310.18542v2.pdf","comment":"Accepted in AISTATS 2024"},{"id":"http://arxiv.org/abs/2405.06433v3","updated":"2024-09-03T07:10:31Z","published":"2024-05-10T12:25:06Z","title":"Fair Mixed Effects Support Vector Machine","summary":" To ensure unbiased and ethical automated predictions, fairness must be a core\nprinciple in machine learning applications. Fairness in machine learning aims\nto mitigate biases present in the training data and model imperfections that\ncould lead to discriminatory outcomes. This is achieved by preventing the model\nfrom making decisions based on sensitive characteristics like ethnicity or\nsexual orientation. A fundamental assumption in machine learning is the\nindependence of observations. However, this assumption often does not hold true\nfor data describing social phenomena, where data points are often clustered.\nHence, if the machine learning models do not account for the cluster\ncorrelations, the results may be biased. Especially high is the bias in cases\nwhere the cluster assignment is correlated to the variable of interest. We\npresent a fair mixed effects support vector machine algorithm that can handle\nboth problems simultaneously. With a reproducible simulation study we\ndemonstrate the impact of clustered data on the quality of fair machine\nlearning predictions.\n","authors":["João Vitor Pamplona","Jan Pablo Burgard"],"pdf_url":"https://arxiv.org/pdf/2405.06433v3.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.13110v3","updated":"2024-09-03T06:31:48Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. 
From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v3.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes and formatting improvements. V3: improvements from\n journal revisions"},{"id":"http://arxiv.org/abs/2408.02247v4","updated":"2024-09-03T06:02:25Z","published":"2024-08-05T05:41:16Z","title":"Contrastive Learning and Abstract Concepts: The Case of Natural Numbers","summary":" Contrastive Learning (CL) has been successfully applied to classification and\nother downstream tasks related to concrete concepts, such as objects contained\nin the ImageNet dataset. No attempts seem to have been made so far in applying\nthis promising scheme to more abstract entities. A prominent example of these\ncould be the concept of (discrete) Quantity. CL can be frequently interpreted\nas a self-supervised scheme guided by some profound and ubiquitous conservation\nprinciple (e.g. conservation of identity in object classification tasks). In\nthis introductory work we apply a suitable conservation principle to the\nsemi-abstract concept of natural numbers by which discrete quantities can be\nestimated or predicted. We experimentally show, by means of a toy problem, that\ncontrastive learning can be trained to count at a glance with high accuracy\nboth at human as well as at super-human ranges.. We compare this with the\nresults of a trained-to-count at a glance supervised learning (SL) neural\nnetwork scheme of similar architecture. We show that both schemes exhibit\nsimilar good performance on baseline experiments, where the distributions of\nthe training and testing stages are equal. 
Importantly, we demonstrate that in\nsome generalization scenarios, where training and testing distributions differ,\nCL boasts more robust and much better error performance.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2408.02247v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01481v2","updated":"2024-09-03T05:47:42Z","published":"2024-05-02T17:13:40Z","title":"NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment","summary":" Aligning Large Language Models (LLMs) with human values and preferences is\nessential for making them helpful and safe. However, building efficient tools\nto perform alignment can be challenging, especially for the largest and most\ncompetent LLMs which often contain tens or hundreds of billions of parameters.\nWe create NeMo-Aligner, a toolkit for model alignment that can efficiently\nscale to a thousand GPUs for training the largest open-source LLMs such as\nNemotron 4 340B and Llama 3.1 405B. NeMo-Aligner comes with highly optimized\nand scalable implementations for major paradigms of model alignment such as:\nReinforcement Learning from Human Feedback (RLHF), Direct Preference\nOptimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally,\nour toolkit supports running most of the alignment techniques in a Parameter\nEfficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for\nextensibility, allowing support for other alignment techniques with minimal\neffort. It is open-sourced with Apache 2.0 License and we invite community\ncontributions at https://github.com/NVIDIA/NeMo-Aligner\n","authors":["Gerald Shen","Zhilin Wang","Olivier Delalleau","Jiaqi Zeng","Yi Dong","Daniel Egert","Shengyang Sun","Jimmy Zhang","Sahil Jain","Ali Taghibakhshi","Markel Sanz Ausin","Ashwath Aithal","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2405.01481v2.pdf","comment":"16 pages, 4 figures, Accepted to COLM 2024"},{"id":"http://arxiv.org/abs/2303.14942v3","updated":"2024-09-03T04:57:03Z","published":"2023-03-27T06:50:31Z","title":"On the Optimality of Misspecified Spectral Algorithms","summary":" In the misspecified spectral algorithms problem, researchers usually assume\nthe underground true function $f_{\\rho}^{*} \\in [\\mathcal{H}]^{s}$, a\nless-smooth interpolation space of a reproducing kernel Hilbert space (RKHS)\n$\\mathcal{H}$ for some $s\\in (0,1)$. The existing minimax optimal results\nrequire $\\|f_{\\rho}^{*}\\|_{L^{\\infty}}<\\infty$ which implicitly requires $s >\n\\alpha_{0}$ where $\\alpha_{0}\\in (0,1)$ is the embedding index, a constant\ndepending on $\\mathcal{H}$. Whether the spectral algorithms are optimal for all\n$s\\in (0,1)$ is an outstanding problem lasting for years. In this paper, we\nshow that spectral algorithms are minimax optimal for any\n$\\alpha_{0}-\\frac{1}{\\beta} < s < 1$, where $\\beta$ is the eigenvalue decay\nrate of $\\mathcal{H}$. We also give several classes of RKHSs whose embedding\nindex satisfies $ \\alpha_0 = \\frac{1}{\\beta} $. 
Thus, the spectral algorithms\nare minimax optimal for all $s\\in (0,1)$ on these RKHSs.\n","authors":["Haobo Zhang","Yicheng Li","Qian Lin"],"pdf_url":"https://arxiv.org/pdf/2303.14942v3.pdf","comment":"50 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.08443v2","updated":"2024-09-03T04:10:57Z","published":"2024-05-14T09:03:00Z","title":"Safety Constrained Multi-Agent Reinforcement Learning for Active Voltage\n Control","summary":" Active voltage control presents a promising avenue for relieving power\ncongestion and enhancing voltage quality, taking advantage of the distributed\ncontrollable generators in the power network, such as roof-top photovoltaics.\nWhile Multi-Agent Reinforcement Learning (MARL) has emerged as a compelling\napproach to address this challenge, existing MARL approaches tend to overlook\nthe constrained optimization nature of this problem, failing in guaranteeing\nsafety constraints. In this paper, we formalize the active voltage control\nproblem as a constrained Markov game and propose a safety-constrained MARL\nalgorithm. We expand the primal-dual optimization RL method to multi-agent\nsettings, and augment it with a novel approach of double safety estimation to\nlearn the policy and to update the Lagrange-multiplier. In addition, we\nproposed different cost functions and investigated their influences on the\nbehavior of our constrained MARL method. We evaluate our approach in the power\ndistribution network simulation environment with real-world scale scenarios.\nExperimental results demonstrate the effectiveness of the proposed method\ncompared with the state-of-the-art MARL methods. This paper is published at\n\\url{https://www.ijcai.org/Proceedings/2024/}.\n","authors":["Yang Qu","Jinming Ma","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2405.08443v2.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2408.15667v2","updated":"2024-09-03T03:22:18Z","published":"2024-08-28T09:40:40Z","title":"Towards reliable respiratory disease diagnosis based on cough sounds and\n vision transformers","summary":" Recent advancements in deep learning techniques have sparked performance\nboosts in various real-world applications including disease diagnosis based on\nmulti-modal medical data. Cough sound data-based respiratory disease (e.g.,\nCOVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also\nattracted much attention. However, existing works usually utilise traditional\nmachine learning or deep models of moderate scales. On the other hand, the\ndeveloped approaches are trained and evaluated on small-scale data due to the\ndifficulty of curating and annotating clinical data on scale. To address these\nissues in prior works, we create a unified framework to evaluate various deep\nmodels from lightweight Convolutional Neural Networks (e.g., ResNet18) to\nmodern vision transformers and compare their performance in respiratory disease\nclassification. Based on the observations from such an extensive empirical\nstudy, we propose a novel approach to cough-based disease classification based\non both self-supervised and supervised learning on a large-scale cough data\nset. 
Experimental results demonstrate our proposed approach outperforms prior\nart consistently on two benchmark datasets for COVID-19 diagnosis and a\nproprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%.\n","authors":["Qian Wang","Zhaoyang Bu","Jiaxuan Mao","Wenyu Zhu","Jingya Zhao","Wei Du","Guochao Shi","Min Zhou","Si Chen","Jieming Qu"],"pdf_url":"https://arxiv.org/pdf/2408.15667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10802v3","updated":"2024-09-03T02:37:48Z","published":"2024-02-16T16:25:20Z","title":"TimeSeriesBench: An Industrial-Grade Benchmark for Time Series Anomaly\n Detection Models","summary":" Time series anomaly detection (TSAD) has gained significant attention due to\nits real-world applications to improve the stability of modern software\nsystems. However, there is no effective way to verify whether existing\nalgorithms can meet the requirements for real-world deployment. Firstly,\ncurrent algorithms typically train a specific model for each time series.\nMaintaining so many models is impractical in a large-scale system with tens of\nthousands of curves. The performance of using merely one unified model to\ndetect anomalies remains unknown. Secondly, most TSAD models are trained on the\nhistorical part of a time series and are tested on its future segment. In\ndistributed systems, however, there are frequent system deployments and\nupgrades, with new, previously unseen time series emerging daily. The\nperformance of testing newly incoming unseen time series on current TSAD\nalgorithms remains unknown. Lastly, the assumptions of the evaluation metrics\nin existing benchmarks are far from practical demands. To solve the\nabove-mentioned problems, we propose an industrial-grade benchmark,\nTimeSeriesBench. We assess the performance of existing algorithms across more\nthan 168 evaluation settings and provide comprehensive analysis for the future\ndesign of anomaly detection algorithms. An industrial dataset is also released\nalong with TimeSeriesBench.\n","authors":["Haotian Si","Jianhui Li","Changhua Pei","Hang Cui","Jingwen Yang","Yongqian Sun","Shenglin Zhang","Jingjing Li","Haiming Zhang","Jing Han","Dan Pei","Gaogang Xie"],"pdf_url":"https://arxiv.org/pdf/2402.10802v3.pdf","comment":"Accepted by ISSRE'24"},{"id":"http://arxiv.org/abs/2402.00976v3","updated":"2024-09-03T02:35:52Z","published":"2024-02-01T19:47:31Z","title":"Investigating Recurrent Transformers with Dynamic Halt","summary":" In this paper, we comprehensively study the inductive biases of two major\napproaches to augmenting Transformers with a recurrent mechanism: (1) the\napproach of incorporating a depth-wise recurrence similar to Universal\nTransformers; and (2) the approach of incorporating a chunk-wise temporal\nrecurrence like Temporal Latent Bottleneck. Furthermore, we propose and\ninvestigate novel ways to extend and combine the above methods - for example,\nwe propose a global mean-based dynamic halting mechanism for Universal\nTransformers and an augmentation of Temporal Latent Bottleneck with elements\nfrom Universal Transformer. We compare the models and probe their inductive\nbiases in several diagnostic tasks, such as Long Range Arena (LRA), flip-flop\nlanguage modeling, ListOps, and Logical Inference. 
The code is released in:\nhttps://github.com/JRC1995/InvestigatingRecurrentTransformers/tree/main\n","authors":["Jishnu Ray Chowdhury","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2402.00976v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06576v4","updated":"2024-09-03T02:11:01Z","published":"2024-06-04T04:17:40Z","title":"OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step","summary":" Despite significant advancements in text generation and reasoning, Large\nLanguage Models (LLMs) still face challenges in accurately performing complex\narithmetic operations. Language model systems often enable LLMs to generate\ncode for arithmetic operations to achieve accurate calculations. However, this\napproach compromises speed and security, and fine-tuning risks the language\nmodel losing prior capabilities. We propose a framework that enables exact\narithmetic in a single autoregressive step, providing faster, more secure, and\nmore interpretable LLM systems with arithmetic capabilities. We use the hidden\nstates of a LLM to control a symbolic architecture that performs arithmetic.\nOur implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama)\nachieves 100\\% accuracy on single arithmetic operations\n($+,-,\\times,\\div,\\sin{},\\cos{},\\log{},\\exp{},\\sqrt{}$), outperforming GPT 4o\nwith and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o\nwith and without a code interpreter on average across a range of mathematical\nproblem solving benchmarks, demonstrating that OccamLLMs can excel in\narithmetic tasks, even surpassing much larger models. We will make our code\npublic shortly.\n","authors":["Owen Dugan","Donato Manuel Jimenez Beneto","Charlotte Loh","Zhuo Chen","Rumen Dangovski","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2406.06576v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13482v2","updated":"2024-09-03T00:48:37Z","published":"2024-08-24T05:54:47Z","title":"MPruner: Optimizing Neural Network Size with CKA-Based Mutual\n Information Pruning","summary":" Determining the optimal size of a neural network is critical, as it directly\nimpacts runtime performance and memory usage. Pruning is a well-established\nmodel compression technique that reduces the size of neural networks while\nmathematically guaranteeing accuracy preservation. However, many recent pruning\nmethods overlook the global contributions of individual model components,\nmaking it difficult to ensure that a pruned model meets the desired dataset and\nperformance requirements. To address these challenges, we developed a new\npruning algorithm, MPruner, that leverages mutual information through vector\nsimilarity. MPruner utilizes layer clustering with the Centered Kernel\nAlignment (CKA) similarity metric, allowing us to incorporate global\ninformation from the neural network for more precise and efficient layer-wise\npruning. We evaluated MPruner across various architectures and configurations,\ndemonstrating its versatility and providing practical guidelines. 
MPruner\nachieved up to a 50% reduction in parameters and memory usage for CNN and\ntransformer-based models, with minimal to no loss in accuracy.\n","authors":["Seungbeom Hu","ChanJun Park","Andrew Ferraiuolo","Sang-Ki Ko","Jinwoo Kim","Haein Song","Jieung Kim"],"pdf_url":"https://arxiv.org/pdf/2408.13482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10733v3","updated":"2024-09-03T23:41:14Z","published":"2022-08-23T05:02:09Z","title":"Recursively Feasible Probabilistic Safe Online Learning with Control\n Barrier Functions","summary":" Learning-based control has recently shown great efficacy in performing\ncomplex tasks for various applications. However, to deploy it in real systems,\nit is of vital importance to guarantee the system will stay safe. Control\nBarrier Functions (CBFs) offer mathematical tools for designing\nsafety-preserving controllers for systems with known dynamics. In this article,\nwe first introduce a model-uncertainty-aware reformulation of CBF-based\nsafety-critical controllers using Gaussian Process (GP) regression to close the\ngap between an approximate mathematical model and the real system, which\nresults in a second-order cone program (SOCP)-based control design. We then\npresent the pointwise feasibility conditions of the resulting safety\ncontroller, highlighting the level of richness that the available system\ninformation must meet to ensure safety. We use these conditions to devise an\nevent-triggered online data collection strategy that ensures the recursive\nfeasibility of the learned safety controller. Our method works by constantly\nreasoning about whether the current information is sufficient to ensure safety\nor if new measurements under active safe exploration are required to reduce the\nuncertainty. As a result, our proposed framework can guarantee the forward\ninvariance of the safe set defined by the CBF with high probability, even if it\ncontains a priori unexplored regions. We validate the proposed framework in two\nnumerical simulation experiments.\n","authors":["Fernando Castañeda","Jason J. Choi","Wonsuhk Jung","Bike Zhang","Claire J. Tomlin","Koushil Sreenath"],"pdf_url":"https://arxiv.org/pdf/2208.10733v3.pdf","comment":"Journal article. Includes the results of the 2021 CDC paper titled\n \"Pointwise feasibility of gaussian process-based safety-critical control\n under model uncertainty\" and proposes a recursively feasible safe online\n learning algorithm as new contribution"},{"id":"http://arxiv.org/abs/2409.02332v1","updated":"2024-09-03T23:13:04Z","published":"2024-09-03T23:13:04Z","title":"Double Machine Learning at Scale to Predict Causal Impact of Customer\n Actions","summary":" Causal Impact (CI) of customer actions are broadly used across the industry\nto inform both short- and long-term investment decisions of various types. In\nthis paper, we apply the double machine learning (DML) methodology to estimate\nthe CI values across 100s of customer actions of business interest and 100s of\nmillions of customers. We operationalize DML through a causal ML library based\non Spark with a flexible, JSON-driven model configuration approach to estimate\nCI at scale (i.e., across hundred of actions and millions of customers). We\noutline the DML methodology and implementation, and associated benefits over\nthe traditional potential outcomes based CI model. We show population-level as\nwell as customer-level CI values along with confidence intervals. 
The\nvalidation metrics show a 2.2% gain over the baseline methods and a 2.5X gain\nin the computational time. Our contribution is to advance the scalable\napplication of CI, while also providing an interface that allows faster\nexperimentation, cross-platform support, ability to onboard new use cases, and\nimproves accessibility of underlying code for partner teams.\n","authors":["Sushant More","Priya Kotwal","Sujith Chappidi","Dinesh Mandalapu","Chris Khawand"],"pdf_url":"https://arxiv.org/pdf/2409.02332v1.pdf","comment":"16 pages, 11 figures. Accepted at the European Conference on Machine\n Learning and Principles and Practice of Knowledge Discovery in Databases\n (ECML PKDD) 2023, Turin, Italy"},{"id":"http://arxiv.org/abs/2406.16746v3","updated":"2024-09-03T23:03:41Z","published":"2024-06-24T15:55:49Z","title":"The Responsible Foundation Model Development Cheatsheet: A Review of\n Tools & Resources","summary":" Foundation model development attracts a rapidly expanding body of\ncontributors, scientists, and applications. To help shape responsible\ndevelopment practices, we introduce the Foundation Model Development\nCheatsheet: a growing collection of 250+ tools and resources spanning text,\nvision, and speech modalities. We draw on a large body of prior work to survey\nresources (e.g. software, documentation, frameworks, guides, and practical\ntools) that support informed data selection, processing, and understanding,\nprecise and limitation-aware artifact documentation, efficient model training,\nadvance awareness of the environmental impact from training, careful model\nevaluation of capabilities, risks, and claims, as well as responsible model\nrelease, licensing and deployment practices. We hope this curated collection of\nresources helps guide more responsible development. The process of curating\nthis list, enabled us to review the AI development ecosystem, revealing what\ntools are critically missing, misused, or over-used in existing practices. We\nfind that (i) tools for data sourcing, model evaluation, and monitoring are\ncritically under-serving ethical and real-world needs, (ii) evaluations for\nmodel safety, capabilities, and environmental impact all lack reproducibility\nand transparency, (iii) text and particularly English-centric analyses continue\nto dominate over multilingual and multi-modal analyses, and (iv) evaluation of\nsystems, rather than just models, is needed so that capabilities and impact are\nassessed in context.\n","authors":["Shayne Longpre","Stella Biderman","Alon Albalak","Hailey Schoelkopf","Daniel McDuff","Sayash Kapoor","Kevin Klyman","Kyle Lo","Gabriel Ilharco","Nay San","Maribeth Rauh","Aviya Skowron","Bertie Vidgen","Laura Weidinger","Arvind Narayanan","Victor Sanh","David Adelani","Percy Liang","Rishi Bommasani","Peter Henderson","Sasha Luccioni","Yacine Jernite","Luca Soldaini"],"pdf_url":"https://arxiv.org/pdf/2406.16746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02327v1","updated":"2024-09-03T22:38:55Z","published":"2024-09-03T22:38:55Z","title":"Generative Principal Component Regression via Variational Inference","summary":" The ability to manipulate complex systems, such as the brain, to modify\nspecific outcomes has far-reaching implications, particularly in the treatment\nof psychiatric disorders. One approach to designing appropriate manipulations\nis to target key features of predictive models. 
While generative latent\nvariable models, such as probabilistic principal component analysis (PPCA), are\npowerful tools for identifying targets, they struggle to incorporate\ninformation relevant to low-variance outcomes into the latent space. When\nstimulation targets are designed on the latent space in such a scenario, the\nintervention can be suboptimal with minimal efficacy. To address this problem,\nwe develop a novel objective based on supervised variational autoencoders\n(SVAEs) that enforces that such information is represented in the latent space.\nThe novel objective can be used with linear models, such as PPCA, which we\nrefer to as generative principal component regression (gPCR). We show in\nsimulations that gPCR dramatically improves target selection in manipulation as\ncompared to standard PCR and SVAEs. As part of these simulations, we develop a\nmetric for detecting when relevant information is not properly incorporated\ninto the loadings. We then show, in two neural datasets related to stress and\nsocial behavior, that gPCR dramatically outperforms PCR in predictive\nperformance and that SVAEs exhibit low incorporation of relevant information\ninto the loadings. Overall, this work suggests that our method significantly\nimproves target selection for manipulation using latent variable models over\ncompetitor inference schemes.\n","authors":["Austin Talbot","Corey J Keller","David E Carlson","Alex V Kotlar"],"pdf_url":"https://arxiv.org/pdf/2409.02327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02322v1","updated":"2024-09-03T22:31:57Z","published":"2024-09-03T22:31:57Z","title":"TimeDiT: General-purpose Diffusion Transformers for Time Series\n Foundation Model","summary":" With recent advances in building foundation models for texts and video data,\nthere is a surge of interest in foundation models for time series. A family of\nmodels has been developed, utilizing a temporal auto-regressive generative\nTransformer architecture, whose effectiveness has been proven in Large Language\nModels. While the empirical results are promising, almost all existing time\nseries foundation models have only been tested on well-curated ``benchmark''\ndatasets very similar to texts. However, real-world time series exhibit unique\nchallenges, such as variable channel sizes across domains, missing values, and\nvarying signal sampling intervals due to the multi-resolution nature of\nreal-world data. Additionally, the uni-directional nature of temporally\nauto-regressive decoding limits the incorporation of domain knowledge, such as\nphysical laws expressed as partial differential equations (PDEs). To address\nthese challenges, we introduce the Time Diffusion Transformer (TimeDiT), a\ngeneral foundation model for time series that employs a denoising diffusion\nparadigm instead of temporal auto-regressive generation. TimeDiT leverages the\nTransformer architecture to capture temporal dependencies and employs diffusion\nprocesses to generate high-quality candidate samples without imposing stringent\nassumptions on the target distribution via novel masking schemes and a channel\nalignment strategy. Furthermore, we propose a finetuning-free model editing\nstrategy that allows the seamless integration of external knowledge during the\nsampling process without updating any model parameters. 
Extensive experiments\nconducted on a variety of tasks, such as forecasting, imputation, and anomaly\ndetection, demonstrate the effectiveness of TimeDiT.\n","authors":["Defu Cao","Wen Ye","Yizhou Zhang","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02322v1.pdf","comment":"23 Pages, 6 Figures, 11 Tables. First presented at ICML 2024 Workshop\n on Foundation Models in the Wild"},{"id":"http://arxiv.org/abs/2408.10263v2","updated":"2024-09-03T22:23:06Z","published":"2024-08-15T18:58:21Z","title":"Kolmogorov Arnold Networks in Fraud Detection: Bridging the Gap Between\n Theory and Practice","summary":" This study evaluates the applicability of Kolmogorov-Arnold Networks (KAN) in\nfraud detection, finding that their effectiveness is context-dependent. We\npropose a quick decision rule using Principal Component Analysis (PCA) to\nassess the suitability of KAN: if data can be effectively separated in two\ndimensions using splines, KAN may outperform traditional models; otherwise,\nother methods could be more appropriate. We also introduce a heuristic approach\nto hyperparameter tuning, significantly reducing computational costs. These\nfindings suggest that while KAN has potential, its use should be guided by\ndata-specific assessments.\n","authors":["Yang Lu","Felix Zhan"],"pdf_url":"https://arxiv.org/pdf/2408.10263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02313v1","updated":"2024-09-03T21:56:13Z","published":"2024-09-03T21:56:13Z","title":"On the Benefits of Memory for Modeling Time-Dependent PDEs","summary":" Data-driven techniques have emerged as a promising alternative to traditional\nnumerical methods for solving partial differential equations (PDEs). These\ntechniques frequently offer a better trade-off between computational cost and\naccuracy for many PDE families of interest. For time-dependent PDEs, existing\nmethodologies typically treat PDEs as Markovian systems, i.e., the evolution of\nthe system only depends on the ``current state'', and not the past states.\nHowever, distortion of the input signals -- e.g., due to discretization or\nlow-pass filtering -- can render the evolution of the distorted signals\nnon-Markovian. In this work, motivated by the Mori-Zwanzig theory of model\nreduction, we investigate the impact of architectures with memory for modeling\nPDEs: that is, when past states are explicitly used to predict the future. We\nintroduce Memory Neural Operator (MemNO), a network based on the recent SSM\narchitectures and Fourier Neural Operator (FNO). We empirically demonstrate on\na variety of PDE families of interest that when the input is given on a\nlow-resolution grid, MemNO significantly outperforms the baselines without\nmemory, reducing error by more than a factor of six on unseen PDEs. 
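The Kolmogorov-Arnold Network fraud-detection abstract above proposes a quick PCA-based suitability check. A rough sketch of that idea, under stated assumptions: project to two principal components and ask whether a simple classifier separates the classes there. The 0.9 threshold and the logistic-regression stand-in for a spline fit are illustrative choices, not the paper's procedure.

```python
# Rough sketch of a PCA-based suitability check: if classes separate well in
# two principal components, a spline/KAN-style model may be worth trying.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def separable_in_2d(X, y, threshold=0.9):
    X2 = PCA(n_components=2).fit_transform(X)
    acc = cross_val_score(LogisticRegression(max_iter=1000), X2, y, cv=5).mean()
    return acc >= threshold, acc

# Toy usage with synthetic data where one high-variance feature drives the label.
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 20))
X[:, 0] *= 3.0
y = (X[:, 0] + 0.1 * rng.normal(size=500) > 0).astype(int)
print(separable_in_2d(X, y))
```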
Via a\ncombination of theory and experiments, we show that the effect of memory is\nparticularly significant when the solution of the PDE has high frequency\nFourier components (e.g., low-viscosity fluid dynamics), and it also increases\nrobustness to observation noise.\n","authors":["Ricardo Buitrago Ruiz","Tanya Marwah","Albert Gu","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2409.02313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10368v2","updated":"2024-09-03T21:55:35Z","published":"2024-08-19T19:26:07Z","title":"Deep-MacroFin: Informed Equilibrium Neural Network for Continuous Time\n Economic Models","summary":" In this paper, we present Deep-MacroFin, a comprehensive framework designed\nto solve partial differential equations, with a particular focus on models in\ncontinuous time economics. This framework leverages deep learning\nmethodologies, including conventional Multi-Layer Perceptrons and the newly\ndeveloped Kolmogorov-Arnold Networks. It is optimized using economic\ninformation encapsulated by Hamilton-Jacobi-Bellman equations and coupled\nalgebraic equations. The application of neural networks holds the promise of\naccurately resolving high-dimensional problems with fewer computational demands\nand limitations compared to standard numerical methods. This versatile\nframework can be readily adapted for elementary differential equations, and\nsystems of differential equations, even in cases where the solutions may\nexhibit discontinuities. Importantly, it offers a more straightforward and\nuser-friendly implementation than existing libraries.\n","authors":["Yuntao Wu","Jiayuan Guo","Goutham Gopalakrishna","Zisis Poulos"],"pdf_url":"https://arxiv.org/pdf/2408.10368v2.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.02309v1","updated":"2024-09-03T21:39:58Z","published":"2024-09-03T21:39:58Z","title":"QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of\n DWI Data","summary":" We propose an image-conditioned diffusion model to estimate high angular\nresolution diffusion weighted imaging (DWI) from a low angular resolution\nacquisition. Our model, which we call QID$^2$, takes as input a set of low\nangular resolution DWI data and uses this information to estimate the DWI data\nassociated with a target gradient direction. We leverage a U-Net architecture\nwith cross-attention to preserve the positional information of the reference\nimages, further guiding the target image generation. We train and evaluate\nQID$^2$ on single-shell DWI samples curated from the Human Connectome Project\n(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to\nproduce low angular resolution DWI data and train QID$^2$ to reconstruct the\nmissing high angular resolution samples. We compare QID$^2$ with two\nstate-of-the-art GAN models. Our results demonstrate that QID$^2$ not only\nachieves higher-quality generated images, but it consistently outperforms the\nGAN models in downstream tensor estimation across multiple metrics. Taken\ntogether, this study highlights the potential of diffusion models, and QID$^2$\nin particular, for q-space up-sampling, thus offering a promising toolkit for\nclinical and research applications.\n","authors":["Zijian Chen","Jueqi Wang","Archana Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2409.02309v1.pdf","comment":"Accepted at MICCAI 2024 International Workshop on Computational\n Diffusion MRI. 
Zijian Chen and Jueqi Wang contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02303v1","updated":"2024-09-03T21:28:48Z","published":"2024-09-03T21:28:48Z","title":"A Lesion-aware Edge-based Graph Neural Network for Predicting Language\n Ability in Patients with Post-stroke Aphasia","summary":" We propose a lesion-aware graph neural network (LEGNet) to predict language\nability from resting-state fMRI (rs-fMRI) connectivity in patients with\npost-stroke aphasia. Our model integrates three components: an edge-based\nlearning module that encodes functional connectivity between brain regions, a\nlesion encoding module, and a subgraph learning module that leverages\nfunctional similarities for prediction. We use synthetic data derived from the\nHuman Connectome Project (HCP) for hyperparameter tuning and model pretraining.\nWe then evaluate the performance using repeated 10-fold cross-validation on an\nin-house neuroimaging dataset of post-stroke aphasia. Our results demonstrate\nthat LEGNet outperforms baseline deep learning methods in predicting language\nability. LEGNet also exhibits superior generalization ability when tested on a\nsecond in-house dataset that was acquired under a slightly different\nneuroimaging protocol. Taken together, the results of this study highlight the\npotential of LEGNet in effectively learning the relationships between rs-fMRI\nconnectivity and language ability in a patient cohort with brain lesions for\nimproved post-stroke aphasia evaluation.\n","authors":["Zijian Chen","Maria Varkanitsa","Prakash Ishwar","Janusz Konrad","Margrit Betke","Swathi Kiran","Archana Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2409.02303v1.pdf","comment":"Accepted at MICCAI 2024 International Workshop on Machine Learning in\n Clinical Neuroimaging (MLCN)"},{"id":"http://arxiv.org/abs/2409.02281v1","updated":"2024-09-03T20:28:30Z","published":"2024-09-03T20:28:30Z","title":"K-Origins: Better Colour Quantification for Neural Networks","summary":" K-Origins is a neural network layer designed to improve image-based network\nperformances when learning colour, or intensities, is beneficial. Over 250\nencoder-decoder convolutional networks are trained and tested on 16-bit\nsynthetic data, demonstrating that K-Origins improves semantic segmentation\naccuracy in two scenarios: object detection with low signal-to-noise ratios,\nand segmenting multiple objects that are identical in shape but vary in colour.\nK-Origins generates output features from the input features, $\\textbf{X}$, by\nthe equation $\\textbf{Y}_k = \\textbf{X}-\\textbf{J}\\cdot w_k$ for each trainable\nparameter $w_k$, where $\\textbf{J}$ is a matrix of ones. Additionally, networks\nwith varying receptive fields were trained to determine optimal network depths\nbased on the dimensions of target classes, suggesting that receptive field\nlengths should exceed object sizes. By ensuring a sufficient receptive field\nlength and incorporating K-Origins, we can achieve better semantic network\nperformance.\n","authors":["Lewis Mason","Mark Martinez"],"pdf_url":"https://arxiv.org/pdf/2409.02281v1.pdf","comment":"16 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.08205v2","updated":"2024-09-03T20:26:56Z","published":"2024-06-12T13:38:48Z","title":"What do we know about Hugging Face? A systematic literature review and\n quantitative validation of qualitative claims","summary":" Background: Collaborative Software Package Registries (SPRs) are an integral\npart of the software supply chain. 
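The K-Origins abstract above states the layer equation explicitly ($\textbf{Y}_k = \textbf{X}-\textbf{J}\cdot w_k$). A minimal sketch of a layer that follows that equation directly; stacking the $K$ outputs along the channel axis is an assumption about how the outputs are arranged, not something the abstract specifies.

```python
# Direct reading of the K-Origins equation: for each trainable scalar w_k, the
# layer emits Y_k = X - J * w_k (J a matrix of ones), i.e. the input shifted by
# a learned reference intensity.
import torch
import torch.nn as nn

class KOrigins(nn.Module):
    def __init__(self, num_origins: int):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(num_origins))  # one origin per output

    def forward(self, x):  # x: (batch, channels, H, W)
        shifts = [x - w_k for w_k in self.w]             # Y_k = X - J * w_k
        return torch.cat(shifts, dim=1)                  # stack along channels

# Toy usage: a single-channel intensity image with 3 learned origins.
layer = KOrigins(num_origins=3)
out = layer(torch.rand(2, 1, 32, 32))
print(out.shape)  # torch.Size([2, 3, 32, 32])
```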
Much engineering work synthesizes SPR\npackages into applications. Prior research has examined SPRs for traditional\nsoftware, such as NPM (JavaScript) and PyPI (Python). Pre-Trained Model (PTM)\nRegistries are an emerging class of SPR of increasing importance, because they\nsupport the deep learning supply chain.\n Aims: Recent empirical research has examined aspects of PTM registries such as\nvulnerabilities, reuse processes, and evolution. However, no existing research\nsynthesizes them to provide a systematic understanding of the current\nknowledge. Some of the existing research includes qualitative claims lacking\nquantitative analysis. Our research fills these gaps by providing a knowledge\nsynthesis and quantitative analyses.\n Methods: We first conduct a systematic literature review (SLR). We then\nobserve that some of the claims are qualitative. We identify quantifiable\nmetrics associated with those claims and measure them in order to substantiate\nthese claims.\n Results: From our SLR, we identify 12 claims about PTM reuse on the\nHuggingFace platform, 4 of which lack quantitative validation. We successfully\ntest 3 of these claims through a quantitative analysis, and directly compare\none with traditional software. Our findings corroborate qualitative claims with\nquantitative measurements. Our findings are: (1) PTMs have a much higher\nturnover rate than traditional software, indicating a dynamic and rapidly\nevolving reuse environment within the PTM ecosystem; and (2) there is a strong\ncorrelation between documentation quality and PTM popularity.\n Conclusions: We confirm qualitative research claims with concrete metrics,\nsupporting prior qualitative and case study research. Our measures show further\ndynamics of PTM reuse, inspiring research infrastructure and new measures.\n","authors":["Jason Jones","Wenxin Jiang","Nicholas Synovic","George K. Thiruvathukal","James C. Davis"],"pdf_url":"https://arxiv.org/pdf/2406.08205v2.pdf","comment":"[ESEM'24] Proceedings of the 18th ACM/IEEE International Symposium on\n Empirical Software Engineering and Measurement (ESEM) 2024"},{"id":"http://arxiv.org/abs/2406.10131v2","updated":"2024-09-03T20:13:24Z","published":"2024-06-14T15:41:21Z","title":"Linear Contextual Bandits with Hybrid Payoff: Revisited","summary":" We study the Linear Contextual Bandit problem in the hybrid reward setting.\nIn this setting every arm's reward model contains arm-specific parameters in\naddition to parameters shared across the reward models of all the arms. We can\nreduce this setting to two closely related settings: (a) Shared - no arm-specific\nparameters, and (b) Disjoint - only arm-specific parameters, enabling\nthe application of two popular state-of-the-art algorithms - $\\texttt{LinUCB}$\nand $\\texttt{DisLinUCB}$ (Algorithm 1 in (Li et al. 2010)). When the arm\nfeatures are stochastic and satisfy a popular diversity condition, we provide\nnew regret analyses for both algorithms, significantly improving on the known\nregret guarantees of these algorithms. Our novel analysis critically exploits\nthe hybrid reward structure and the diversity condition. Moreover, we introduce\na new algorithm $\\texttt{HyLinUCB}$ that crucially modifies $\\texttt{LinUCB}$\n(using a new exploration coefficient) to account for sparsity in the hybrid\nsetting. Under the same diversity assumptions, we prove that\n$\\texttt{HyLinUCB}$ also incurs only $O(\\sqrt{T})$ regret for $T$ rounds. 
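For reference, a minimal sketch of the standard disjoint $\texttt{LinUCB}$ step that the hybrid-payoff abstract above builds on: per-arm ridge regression plus an exploration bonus. This is not $\texttt{HyLinUCB}$; the paper's modified exploration coefficient for the hybrid setting is not reproduced here, and the alpha value below is a free parameter.

```python
# Disjoint LinUCB sketch: per-arm ridge regression plus a UCB exploration bonus.
import numpy as np

class DisjointLinUCB:
    def __init__(self, n_arms, dim, alpha=1.0):
        self.alpha = alpha
        self.A = [np.eye(dim) for _ in range(n_arms)]    # X^T X + I per arm
        self.b = [np.zeros(dim) for _ in range(n_arms)]  # X^T r per arm

    def select(self, x):  # x: context feature vector
        scores = []
        for A, b in zip(self.A, self.b):
            A_inv = np.linalg.inv(A)
            theta = A_inv @ b
            bonus = self.alpha * np.sqrt(x @ A_inv @ x)
            scores.append(theta @ x + bonus)
        return int(np.argmax(scores))

    def update(self, arm, x, reward):
        self.A[arm] += np.outer(x, x)
        self.b[arm] += reward * x

# Toy usage.
bandit = DisjointLinUCB(n_arms=3, dim=5)
ctx = np.random.randn(5)
arm = bandit.select(ctx)
bandit.update(arm, ctx, reward=1.0)
```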
We\nperform extensive experiments on synthetic and real-world datasets\ndemonstrating strong empirical performance of $\\texttt{HyLinUCB}$. When the number of\narm-specific parameters is much larger than the number of shared parameters, we\nobserve that $\\texttt{DisLinUCB}$ incurs the lowest regret. In this case, the\nregret of $\\texttt{HyLinUCB}$ is the second best and extremely competitive with\n$\\texttt{DisLinUCB}$. In all other situations, including our real-world\ndataset, $\\texttt{HyLinUCB}$ has significantly lower regret than\n$\\texttt{LinUCB}$, $\\texttt{DisLinUCB}$ and other SOTA baselines we considered.\nWe also empirically observe that the regret of $\\texttt{HyLinUCB}$ grows much\nmore slowly with the number of arms compared to baselines, making it suitable even\nfor very large action spaces.\n","authors":["Nirjhar Das","Gaurav Sinha"],"pdf_url":"https://arxiv.org/pdf/2406.10131v2.pdf","comment":"Accepted at ECML PKDD 2024 as a Research Track Paper"},{"id":"http://arxiv.org/abs/2403.06023v2","updated":"2024-09-03T20:09:57Z","published":"2024-03-09T22:18:26Z","title":"Persian Slang Text Conversion to Formal and Deep Learning of Persian\n Short Texts on Social Media for Sentiment Classification","summary":" The lack of a suitable tool for the analysis of conversational texts in the\nPersian language has made various analyses of these texts, including Sentiment\nAnalysis, difficult. In this research, we make these texts easier for machines to\nunderstand by providing PSC, the Persian Slang Converter, a tool for converting\nconversational texts into formal ones, and by using state-of-the-art deep\nlearning methods together with PSC to improve sentiment learning on short\nPersian texts. More than 10 million unlabeled texts from various social networks\nand movie subtitles (as conversational texts) and about 10 million news texts\n(as formal texts) have been used for training unsupervised models and the formal\nimplementation of the tool. 60,000 texts from the comments of Instagram social\nnetwork users with positive, negative, and neutral labels are considered\nsupervised data for training the emotion classification model of short texts.\nUsing the formal tool, 57% of the words of the conversational corpus were\nconverted. Finally, by using the formalizer, FastText model, and a deep LSTM\nnetwork, an accuracy of 81.91 was obtained on the test data.\n","authors":["Mohsen Khazeni","Mohammad Heydari","Amir Albadvi"],"pdf_url":"https://arxiv.org/pdf/2403.06023v2.pdf","comment":"16 pages, 4 figures, 14 tables"},{"id":"http://arxiv.org/abs/2409.02270v1","updated":"2024-09-03T20:01:56Z","published":"2024-09-03T20:01:56Z","title":"Reinforcement Learning-enabled Satellite Constellation Reconfiguration\n and Retasking for Mission-Critical Applications","summary":" The development of satellite constellation applications is rapidly advancing\ndue to increasing user demands, reduced operational costs, and technological\nadvancements. However, a significant gap in the existing literature concerns\nreconfiguration and retasking issues within satellite constellations, which is\nthe primary focus of our research. In this work, we critically assess the\nimpact of satellite failures on constellation performance and the associated\ntask requirements. 
To facilitate this analysis, we introduce a system modeling\napproach for GPS satellite constellations, enabling an investigation into\nperformance dynamics and task distribution strategies, particularly in\nscenarios where satellite failures occur during mission-critical operations.\nAdditionally, we introduce reinforcement learning (RL) techniques, specifically\nQ-learning, Policy Gradient, Deep Q-Network (DQN), and Proximal Policy\nOptimization (PPO), for managing satellite constellations, addressing the\nchallenges posed by reconfiguration and retasking following satellite failures.\nOur results demonstrate that DQN and PPO achieve effective outcomes in terms of\naverage rewards, task completion rates, and response times.\n","authors":["Hassan El Alami","Danda B. Rawat"],"pdf_url":"https://arxiv.org/pdf/2409.02270v1.pdf","comment":"Accepted for publication in the IEEE Military Communications\n Conference (IEEE MILCOM 2024)"},{"id":"http://arxiv.org/abs/2310.15128v2","updated":"2024-09-03T19:55:22Z","published":"2023-10-23T17:32:38Z","title":"Projected Stochastic Gradient Descent with Quantum Annealed Binary\n Gradients","summary":" We present, QP-SBGD, a novel layer-wise stochastic optimiser tailored towards\ntraining neural networks with binary weights, known as binary neural networks\n(BNNs), on quantum hardware. BNNs reduce the computational requirements and\nenergy consumption of deep learning models with minimal loss in accuracy.\nHowever, training them in practice remains to be an open challenge. Most known\nBNN-optimisers either rely on projected updates or binarise weights\npost-training. Instead, QP-SBGD approximately maps the gradient onto binary\nvariables, by solving a quadratic constrained binary optimisation. Under\npractically reasonable assumptions, we show that this update rule converges\nwith a rate of $\\mathcal{O}(1 / \\sqrt{T})$. Moreover, we show how the\n$\\mathcal{NP}$-hard projection can be effectively executed on an adiabatic\nquantum annealer, harnessing recent advancements in quantum computation. We\nalso introduce a projected version of this update rule and prove that if a\nfixed point exists in the binary variable space, the modified updates will\nconverge to it. Last but not least, our algorithm is implemented layer-wise,\nmaking it suitable to train larger networks on resource-limited quantum\nhardware. Through extensive evaluations, we show that QP-SBGD outperforms or is\non par with competitive and well-established baselines such as BinaryConnect,\nsignSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as\nwell as binary graph neural networks.\n","authors":["Maximilian Krahn","Michele Sasdelli","Fengyi Yang","Vladislav Golyanik","Juho Kannala","Tat-Jun Chin","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2310.15128v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.05449v2","updated":"2024-09-03T08:01:47Z","published":"2023-12-09T03:33:14Z","title":"TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot\n Image Classification","summary":" Few-shot image classification aims to classify images from unseen novel\nclasses with few samples. Recent works demonstrate that deep local descriptors\nexhibit enhanced representational capabilities compared to image-level\nfeatures. However, most existing methods solely rely on either employing all\nlocal descriptors or directly utilizing partial descriptors, potentially\nresulting in the loss of crucial information. 
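The satellite constellation retasking abstract above lists tabular Q-learning among the RL methods it evaluates. A minimal sketch of that update rule follows; the toy state/action spaces and reward are placeholders, not the paper's constellation environment.

```python
# Tabular Q-learning update in the standard form referenced above.
import numpy as np

n_states, n_actions = 10, 4
Q = np.zeros((n_states, n_actions))
alpha, gamma = 0.1, 0.95  # learning rate, discount factor

def q_update(s, a, r, s_next):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    Q[s, a] += alpha * (r + gamma * Q[s_next].max() - Q[s, a])

# Toy usage: one random transition with a fixed reward.
rng = np.random.default_rng(0)
q_update(s=rng.integers(n_states), a=rng.integers(n_actions),
         r=1.0, s_next=rng.integers(n_states))
```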
Moreover, these methods primarily\nemphasize the selection of query descriptors while overlooking support\ndescriptors. In this paper, we propose a novel Task-Aware Adaptive Local\nDescriptors Selection Network (TALDS-Net), which exhibits the capacity for\nadaptive selection of task-aware support descriptors and query descriptors.\nSpecifically, we compare the similarity of each local support descriptor with\nother local support descriptors to obtain the optimal support descriptor subset\nand then compare the query descriptors with the optimal support subset to\nobtain discriminative query descriptors. Extensive experiments demonstrate that\nour TALDS-Net outperforms state-of-the-art methods on both general and\nfine-grained datasets.\n","authors":["Qian Qiao","Yu Xie","Ziyin Zeng","Fanzhang Li"],"pdf_url":"https://arxiv.org/pdf/2312.05449v2.pdf","comment":"4 pages, 1 figures, is accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2408.01690v2","updated":"2024-09-03T22:30:34Z","published":"2024-08-03T07:05:40Z","title":"IDNet: A Novel Dataset for Identity Document Analysis and Fraud\n Detection","summary":" Effective fraud detection and analysis of government-issued identity\ndocuments, such as passports, driver's licenses, and identity cards, are\nessential in thwarting identity theft and bolstering security on online\nplatforms. The training of accurate fraud detection and analysis tools depends\non the availability of extensive identity document datasets. However, current\npublicly available benchmark datasets for identity document analysis, including\nMIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a\nlimited number of samples, cover insufficient varieties of fraud patterns, and\nseldom include alterations in critical personal identifying fields like\nportrait images, limiting their utility in training models capable of detecting\nrealistic frauds while preserving privacy.\n In response to these shortcomings, our research introduces a new benchmark\ndataset, IDNet, designed to advance privacy-preserving fraud detection efforts.\nThe IDNet dataset comprises 837,060 images of synthetically generated identity\ndocuments, totaling approximately 490 gigabytes, categorized into 20 types from\n$10$ U.S. states and 10 European countries. We evaluate the utility and present\nuse cases of the dataset, illustrating how it can aid in training\nprivacy-preserving fraud detection methods, facilitating the generation of\ncamera and video capturing of identity documents, and testing schema\nunification and other identity document management functionalities.\n","authors":["Hong Guan","Yancheng Wang","Lulu Xie","Soham Nag","Rajeev Goel","Niranjan Erappa Narayana Swamy","Yingzhen Yang","Chaowei Xiao","Jonathan Prisby","Ross Maciejewski","Jia Zou"],"pdf_url":"https://arxiv.org/pdf/2408.01690v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2409.02266v1","updated":"2024-09-03T19:52:49Z","published":"2024-09-03T19:52:49Z","title":"LSTMSE-Net: Long Short Term Speech Enhancement Network for Audio-visual\n Speech Enhancement","summary":" In this paper, we propose long short term memory speech enhancement network\n(LSTMSE-Net), an audio-visual speech enhancement (AVSE) method. This innovative\nmethod leverages the complementary nature of visual and audio information to\nboost the quality of speech signals. Visual features are extracted with\nVisualFeatNet (VFN), and audio features are processed through an encoder and\ndecoder. 
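A rough sketch of the two-stage descriptor selection the TALDS-Net abstract above outlines: first keep support descriptors that agree most with the other support descriptors, then keep query descriptors closest to the retained support subset. Using cosine similarity and fixed top-k ratios is an assumption; the paper's selection is adaptive and task-aware rather than this fixed heuristic.

```python
# Sketch of support-then-query local descriptor selection.
import torch
import torch.nn.functional as F

def select_descriptors(support, query, keep_ratio=0.5):
    # support: (Ns, d), query: (Nq, d) local descriptors
    s = F.normalize(support, dim=1)
    q = F.normalize(query, dim=1)
    # Stage 1: score each support descriptor by mean similarity to the others.
    sim_ss = s @ s.t()
    support_score = (sim_ss.sum(dim=1) - 1.0) / (s.shape[0] - 1)
    k_s = max(1, int(keep_ratio * s.shape[0]))
    support_subset = s[support_score.topk(k_s).indices]
    # Stage 2: score query descriptors by their best match in the subset.
    query_score = (q @ support_subset.t()).max(dim=1).values
    k_q = max(1, int(keep_ratio * q.shape[0]))
    return support_subset, q[query_score.topk(k_q).indices]

# Toy usage.
sup, que = torch.randn(64, 128), torch.randn(49, 128)
print([t.shape for t in select_descriptors(sup, que)])
```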
The system scales and concatenates visual and audio features, then\nprocesses them through a separator network for optimized speech enhancement.\nThe architecture highlights advancements in leveraging multi-modal data and\ninterpolation techniques for robust AVSE challenge systems. The performance of\nLSTMSE-Net surpasses that of the baseline model from the COG-MHEAR AVSE\nChallenge 2024 by a margin of 0.06 in scale-invariant signal-to-distortion\nratio (SISDR), $0.03$ in short-time objective intelligibility (STOI), and\n$1.32$ in perceptual evaluation of speech quality (PESQ). The source code of\nthe proposed LSTMSE-Net is available at\n\\url{https://github.com/mtanveer1/AVSEC-3-Challenge}.\n","authors":["Arnav Jain","Jasmer Singh Sanjotra","Harshvardhan Choudhary","Krish Agrawal","Rupal Shah","Rohan Jha","M. Sajid","Amir Hussain","M. Tanveer"],"pdf_url":"https://arxiv.org/pdf/2409.02266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02108v1","updated":"2024-09-03T17:59:05Z","published":"2024-09-03T17:59:05Z","title":"Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection,\n Removal, and Generation in the Era of Deep Learning","summary":" Shadows are formed when light encounters obstacles, leading to areas of\ndiminished illumination. In computer vision, shadow detection, removal, and\ngeneration are crucial for enhancing scene understanding, refining image\nquality, ensuring visual consistency in video editing, and improving virtual\nenvironments. This paper presents a comprehensive survey of shadow detection,\nremoval, and generation in images and videos within the deep learning landscape\nover the past decade, covering tasks, deep models, datasets, and evaluation\nmetrics. Our key contributions include a comprehensive survey of shadow\nanalysis, standardization of experimental comparisons, exploration of the\nrelationships among model size, speed, and performance, a cross-dataset\ngeneralization study, identification of open issues and future directions, and\nprovision of publicly available resources to support further research.\n","authors":["Xiaowei Hu","Zhenghao Xing","Tianyu Wang","Chi-Wing Fu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02108v1.pdf","comment":"Publicly available results, trained models, and evaluation metrics at\n https://github.com/xw-hu/Unveiling-Deep-Shadows"},{"id":"http://arxiv.org/abs/2409.02101v1","updated":"2024-09-03T17:56:51Z","published":"2024-09-03T17:56:51Z","title":"Towards Real-World Adverse Weather Image Restoration: Enhancing\n Clearness and Semantics with Vision-Language Models","summary":" This paper addresses the limitations of adverse weather image restoration\napproaches trained on synthetic data when applied to real-world scenarios. We\nformulate a semi-supervised learning framework employing vision-language models\nto enhance restoration performance across diverse adverse weather conditions in\nreal-world settings. Our approach involves assessing image clearness and\nproviding semantics using vision-language models on real data, serving as\nsupervision signals for training restoration models. For clearness enhancement,\nwe use real-world data, utilizing a dual-step strategy with pseudo-labels\nassessed by vision-language models and weather prompt learning. 
For semantic\nenhancement, we integrate real-world data by adjusting weather conditions in\nvision-language model descriptions while preserving semantic meaning.\nAdditionally, we introduce an effective training strategy to bootstrap\nrestoration performance. Our approach achieves superior results in real-world\nadverse weather image restoration, demonstrated through qualitative and\nquantitative comparisons with state-of-the-art works.\n","authors":["Jiaqi Xu","Mengyang Wu","Xiaowei Hu","Chi-Wing Fu","Qi Dou","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02101v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02049v1","updated":"2024-09-03T16:53:34Z","published":"2024-09-03T16:53:34Z","title":"Low-Resolution Face Recognition via Adaptable Instance-Relation\n Distillation","summary":" Low-resolution face recognition is a challenging task due to the missing of\ninformative details. Recent approaches based on knowledge distillation have\nproven that high-resolution clues can well guide low-resolution face\nrecognition via proper knowledge transfer. However, due to the distribution\ndifference between training and testing faces, the learned models often suffer\nfrom poor adaptability. To address that, we split the knowledge transfer\nprocess into distillation and adaptation steps, and propose an adaptable\ninstance-relation distillation approach to facilitate low-resolution face\nrecognition. In the approach, the student distills knowledge from\nhigh-resolution teacher in both instance level and relation level, providing\nsufficient cross-resolution knowledge transfer. Then, the learned student can\nbe adaptable to recognize low-resolution faces with adaptive batch\nnormalization in inference. In this manner, the capability of recovering\nmissing details of familiar low-resolution faces can be effectively enhanced,\nleading to a better knowledge transfer. Extensive experiments on low-resolution\nface recognition clearly demonstrate the effectiveness and adaptability of our\napproach.\n","authors":["Ruixin Shi","Weijia Guo","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2409.02049v1.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2409.01761v1","updated":"2024-09-03T10:15:30Z","published":"2024-09-03T10:15:30Z","title":"PRoGS: Progressive Rendering of Gaussian Splats","summary":" Over the past year, 3D Gaussian Splatting (3DGS) has received significant\nattention for its ability to represent 3D scenes in a perceptually accurate\nmanner. However, it can require a substantial amount of storage since each\nsplat's individual data must be stored. While compression techniques offer a\npotential solution by reducing the memory footprint, they still necessitate\nretrieving the entire scene before any part of it can be rendered. In this\nwork, we introduce a novel approach for progressively rendering such scenes,\naiming to display visible content that closely approximates the final scene as\nearly as possible without loading the entire scene into memory. This approach\nbenefits both on-device rendering applications limited by memory constraints\nand streaming applications where minimal bandwidth usage is preferred. To\nachieve this, we approximate the contribution of each Gaussian to the final\nscene and construct an order of prioritization on their inclusion in the\nrendering process. 
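A hypothetical illustration of the prioritization idea in the progressive Gaussian-splat rendering abstract above: score each splat by an approximate contribution and stream splats in descending order. The scoring formula below (opacity times a projected-extent proxy) is an assumption for illustration, not the paper's contribution estimate.

```python
# Sketch: order splats by an approximate contribution so the most important
# ones can be rendered/streamed first.
import numpy as np

def prioritize_splats(opacities, scales):
    # opacities: (N,), scales: (N, 3) per-axis extents of each Gaussian
    contribution = opacities * scales.prod(axis=1) ** (2.0 / 3.0)  # area proxy
    return np.argsort(-contribution)  # indices, most important first

# Toy usage.
rng = np.random.default_rng(0)
order = prioritize_splats(rng.uniform(size=100), rng.uniform(0.01, 1.0, size=(100, 3)))
print(order[:10])
```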
Additionally, we demonstrate that our approach can be\ncombined with existing compression methods to progressively render (and stream)\n3DGS scenes, optimizing bandwidth usage by focusing on the most important\nsplats within a scene. Overall, our work establishes a foundation for making\nremotely hosted 3DGS content more quickly accessible to end-users in\nover-the-top consumption scenarios, with our results showing significant\nimprovements in quality across all metrics compared to existing methods.\n","authors":["Brent Zoomers","Maarten Wijnants","Ivan Molenaers","Joni Vanherck","Jeroen Put","Lode Jorissen","Nick Michiels"],"pdf_url":"https://arxiv.org/pdf/2409.01761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01710v1","updated":"2024-09-03T08:47:17Z","published":"2024-09-03T08:47:17Z","title":"Privacy-Preserving Multimedia Mobile Cloud Computing Using Protective\n Perturbation","summary":" Mobile cloud computing has been adopted in many multimedia applications,\nwhere the resource-constrained mobile device sends multimedia data (e.g.,\nimages) to remote cloud servers to request computation-intensive multimedia\nservices (e.g., image recognition). While significantly improving the\nperformance of the mobile applications, the cloud-based mechanism often causes\nprivacy concerns as the multimedia data and services are offloaded from the\ntrusted user device to untrusted cloud servers. Several recent studies have\nproposed perturbation-based privacy preserving mechanisms, which obfuscate the\noffloaded multimedia data to eliminate privacy exposures without affecting the\nfunctionality of the remote multimedia services. However, the existing privacy\nprotection approaches require the deployment of computation-intensive\nperturbation generation on the resource-constrained mobile devices. Also, the\nobfuscated images are typically not compliant with the standard image\ncompression algorithms and suffer from significant bandwidth consumption. In\nthis paper, we develop a novel privacy-preserving multimedia mobile cloud\ncomputing framework, namely $PMC^2$, to address the resource and bandwidth\nchallenges. $PMC^2$ employs secure confidential computing in the cloud to\ndeploy the perturbation generator, which addresses the resource challenge while\nmaintaining the privacy. Furthermore, we develop a neural compressor\nspecifically trained to compress the perturbed images in order to address the\nbandwidth challenge. We implement $PMC^2$ in an end-to-end mobile cloud\ncomputing system, based on which our evaluations demonstrate superior latency,\npower efficiency, and bandwidth consumption achieved by $PMC^2$ while\nmaintaining high accuracy in the target multimedia service.\n","authors":["Zhongze Tang","Mengmei Ye","Yao Liu","Sheng Wei"],"pdf_url":"https://arxiv.org/pdf/2409.01710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01534v1","updated":"2024-09-03T02:08:47Z","published":"2024-09-03T02:08:47Z","title":"Think Twice Before Recognizing: Large Multimodal Models for General\n Fine-grained Traffic Sign Recognition","summary":" We propose a new strategy called think twice before recognizing to improve\nfine-grained traffic sign recognition (TSR). Fine-grained TSR in the wild is\ndifficult due to the complex road conditions, and existing approaches\nparticularly struggle with cross-country TSR when data is lacking. Our strategy\nachieves effective fine-grained TSR by stimulating the multiple-thinking\ncapability of large multimodal models (LMM). 
We introduce context,\ncharacteristic, and differential descriptions to design multiple thinking\nprocesses for the LMM. The context descriptions with center coordinate prompt\noptimization help the LMM to locate the target traffic sign in the original\nroad images containing multiple traffic signs and filter irrelevant answers\nthrough the proposed prior traffic sign hypothesis. The characteristic\ndescription is based on few-shot in-context learning of template traffic signs,\nwhich decreases the cross-domain difference and enhances the fine-grained\nrecognition capability of the LMM. The differential descriptions of similar\ntraffic signs optimize the multimodal thinking capability of the LMM. The\nproposed method is independent of training data and requires only simple and\nuniform instructions. We conducted extensive experiments on three benchmark\ndatasets and two real-world datasets from different countries, and the proposed\nmethod achieves state-of-the-art TSR results on all five datasets.\n","authors":["Yaozong Gan","Guang Li","Ren Togo","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2409.01534v1.pdf","comment":null}]},"2024-09-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2405.15092v2","updated":"2024-09-02T22:40:20Z","published":"2024-05-23T22:38:58Z","title":"Dissociation of Faithful and Unfaithful Reasoning in LLMs","summary":" Large language models (LLMs) often improve their performance in downstream\ntasks when they generate Chain of Thought reasoning text before producing an\nanswer. We investigate how LLMs recover from errors in Chain of Thought.\nThrough analysis of error recovery behaviors, we find evidence for\nunfaithfulness in Chain of Thought, which occurs when models arrive at the\ncorrect answer despite invalid reasoning text. We identify factors that shift\nLLM recovery behavior: LLMs recover more frequently from obvious errors and in\ncontexts that provide more evidence for the correct answer. Critically, these\nfactors have divergent effects on faithful and unfaithful recoveries. Our\nresults indicate that there are distinct mechanisms driving faithful and\nunfaithful error recoveries. Selective targeting of these mechanisms may be\nable to drive down the rate of unfaithful reasoning and improve model\ninterpretability.\n","authors":["Evelyn Yee","Alice Li","Chenyu Tang","Yeon Ho Jung","Ramamohan Paturi","Leon Bergen"],"pdf_url":"https://arxiv.org/pdf/2405.15092v2.pdf","comment":"code published at\n https://github.com/CoTErrorRecovery/CoTErrorRecovery"},{"id":"http://arxiv.org/abs/2404.07981v2","updated":"2024-09-02T21:29:04Z","published":"2024-04-11T17:57:32Z","title":"Manipulating Large Language Models to Increase Product Visibility","summary":" Large language models (LLMs) are increasingly being integrated into search\nengines to provide natural language responses tailored to user queries.\nCustomers and end-users are also becoming more dependent on these models for\nquick and easy purchase decisions. In this work, we investigate whether\nrecommendations from LLMs can be manipulated to enhance a product's visibility.\nWe demonstrate that adding a strategic text sequence (STS) -- a carefully\ncrafted message -- to a product's information page can significantly increase\nits likelihood of being listed as the LLM's top recommendation. 
To understand\nthe impact of STS, we use a catalog of fictitious coffee machines and analyze\nits effect on two target products: one that seldom appears in the LLM's\nrecommendations and another that usually ranks second. We observe that the\nstrategic text sequence significantly enhances the visibility of both products\nby increasing their chances of appearing as the top recommendation. This\nability to manipulate LLM-generated search responses provides vendors with a\nconsiderable competitive advantage and has the potential to disrupt fair market\ncompetition. Just as search engine optimization (SEO) revolutionized how\nwebpages are customized to rank higher in search engine results, influencing\nLLM recommendations could profoundly impact content optimization for AI-driven\nsearch services. Code for our experiments is available at\nhttps://github.com/aounon/llm-rank-optimizer.\n","authors":["Aounon Kumar","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2404.07981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10999v2","updated":"2024-09-02T20:26:30Z","published":"2024-06-16T16:25:22Z","title":"Balancing Rigor and Utility: Mitigating Cognitive Biases in Large\n Language Models for Multiple-Choice Questions","summary":" This paper examines the role of cognitive biases in the decision-making\nprocesses of large language models (LLMs), challenging the conventional goal of\neliminating all biases. We show that certain cognitive biases when properly\nbalanced, can enhance decision-making efficiency through rational deviations\nand heuristic shortcuts. By introducing heuristic moderation and an abstention\noption, which allows LLMs to withhold responses when uncertain, we reduce error\nrates, improve decision accuracy, and optimize decision rates. Using the\nBalance Rigor and Utility (BRU) dataset, developed through expert\ncollaboration, our findings demonstrate that targeted inspection of cognitive\nbiases aligns LLM decisions more closely with human reasoning, enhancing\nreliability and suggesting strategies for future improvements. This approach\noffers a novel way to leverage cognitive biases to improve the practical\nutility of LLMs across various applications.\n","authors":["Liman Wang","Hanyang Zhong"],"pdf_url":"https://arxiv.org/pdf/2406.10999v2.pdf","comment":"This article is currently under review. All data will be open on\n GitHub once the review is complete.\n https://github.com/limanwang/Balancing-Rigor-and-Utility"},{"id":"http://arxiv.org/abs/2405.15077v4","updated":"2024-09-02T20:25:36Z","published":"2024-05-23T21:56:12Z","title":"Eliciting Informative Text Evaluations with Large Language Models","summary":" Peer prediction mechanisms motivate high-quality feedback with provable\nguarantees. However, current methods only apply to rather simple reports, like\nmultiple-choice or scalar numbers. We aim to broaden these techniques to the\nlarger domain of text-based reports, drawing on the recent developments in\nlarge language models. This vastly increases the applicability of peer\nprediction mechanisms as textual feedback is the norm in a large variety of\nfeedback channels: peer reviews, e-commerce customer reviews, and comments on\nsocial media.\n We introduce two mechanisms, the Generative Peer Prediction Mechanism (GPPM)\nand the Generative Synopsis Peer Prediction Mechanism (GSPPM). These mechanisms\nutilize LLMs as predictors, mapping from one agent's report to a prediction of\nher peer's report. 
Theoretically, we show that when the LLM prediction is\nsufficiently accurate, our mechanisms can incentivize high effort and\ntruth-telling as an (approximate) Bayesian Nash equilibrium. Empirically, we\nconfirm the efficacy of our mechanisms through experiments conducted on two\nreal datasets: the Yelp review dataset and the ICLR OpenReview dataset. We\nhighlight the result that, on the ICLR dataset, our mechanisms can\ndifferentiate three quality levels -- human-written reviews, GPT-4-generated\nreviews, and GPT-3.5-generated reviews -- in terms of expected scores.\nAdditionally, GSPPM penalizes LLM-generated reviews more effectively than GPPM.\n","authors":["Yuxuan Lu","Shengwei Xu","Yichi Zhang","Yuqing Kong","Grant Schoenebeck"],"pdf_url":"https://arxiv.org/pdf/2405.15077v4.pdf","comment":"Accepted by the Twenty-Fifth ACM Conference on Economics and\n Computation (EC'24)"},{"id":"http://arxiv.org/abs/2408.13295v2","updated":"2024-09-02T17:00:05Z","published":"2024-08-23T14:47:10Z","title":"Exploring Bias and Prediction Metrics to Characterise the Fairness of\n Machine Learning for Equity-Centered Public Health Decision-Making: A\n Narrative Review","summary":" Background: The rapid advancement of Machine Learning (ML) represents novel\nopportunities to enhance public health research, surveillance, and\ndecision-making. However, there is a lack of comprehensive understanding of\nalgorithmic bias, i.e., systematic errors in predicted population health outcomes\nresulting from the public health application of ML. The objective of this\nnarrative review is to explore the types of bias generated by ML and\nquantitative metrics to assess these biases.\n Methods: We performed searches on PubMed, MEDLINE, IEEE (Institute of\nElectrical and Electronics Engineers), ACM (Association for Computing\nMachinery) Digital Library, Science Direct, and Springer Nature. We used\nkeywords to identify studies describing types of bias and metrics to measure\nthese in the domain of ML and public and population health, published in English\nbetween 2008 and 2023, inclusive.\n Results: A total of 72 articles met the inclusion criteria. Our review\nidentified the commonly described types of bias and quantitative metrics to\nassess these biases from an equity perspective.\n Conclusion: The review will help formalize the evaluation framework for ML\non public health from an equity perspective.\n","authors":["Shaina Raza","Arash Shaban-Nejad","Elham Dolatabadi","Hiroshi Mamiya"],"pdf_url":"https://arxiv.org/pdf/2408.13295v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2404.16160v2","updated":"2024-09-02T16:33:29Z","published":"2024-04-24T19:30:18Z","title":"Domain-Specific Improvement on Psychotherapy Chatbot Using Assistant","summary":" Large language models (LLMs) have demonstrated impressive generalization\ncapabilities on specific tasks with human-written instruction data. However,\nthe limited quantity, diversity, and professional expertise of such instruction\ndata raise concerns about the performance of LLMs in psychotherapy tasks when\nprovided with domain-specific instructions. To address this, we first propose\nDomain-Specific Assistant Instructions based on AlexanderStreet therapy, and\nsecond, we use an adaptation fine-tuning method and a retrieval-augmented\ngeneration method to improve pre-trained LLMs. 
Through quantitative evaluation\nof linguistic quality using automatic and human evaluation, we observe that\npre-trained LLMs on Psychotherapy Assistant Instructions outperform\nstate-of-the-art LLM response baselines. Our Assistant-Instruction approach\noffers a half-annotation method to align pre-trained LLMs with instructions and\nprovides pre-trained LLMs with more psychotherapy knowledge.\n","authors":["Cheng Kang","Daniel Novak","Katerina Urbanova","Yuqing Cheng","Yong Hu"],"pdf_url":"https://arxiv.org/pdf/2404.16160v2.pdf","comment":"Accepted at ICASSP 2024 EIHRC Workshop"},{"id":"http://arxiv.org/abs/2404.14024v2","updated":"2024-09-02T16:20:49Z","published":"2024-04-22T09:40:07Z","title":"Exploring neural oscillations during speech perception via surrogate\n gradient spiking neural networks","summary":" Understanding cognitive processes in the brain demands sophisticated models\ncapable of replicating neural dynamics at large scales. We present a\nphysiologically inspired speech recognition architecture, compatible and\nscalable with deep learning frameworks, and demonstrate that end-to-end\ngradient descent training leads to the emergence of neural oscillations in the\ncentral spiking neural network. Significant cross-frequency couplings,\nindicative of these oscillations, are measured within and across network layers\nduring speech processing, whereas no such interactions are observed when\nhandling background noise inputs. Furthermore, our findings highlight the\ncrucial inhibitory role of feedback mechanisms, such as spike frequency\nadaptation and recurrent connections, in regulating and synchronising neural\nactivity to improve recognition performance. Overall, on top of developing our\nunderstanding of synchronisation phenomena notably observed in the human\nauditory pathway, our architecture exhibits dynamic and efficient information\nprocessing, with relevance to neuromorphic technology.\n","authors":["Alexandre Bittar","Philip N. Garner"],"pdf_url":"https://arxiv.org/pdf/2404.14024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13152v2","updated":"2024-09-02T15:42:03Z","published":"2024-06-19T02:00:51Z","title":"Analyzing Diversity in Healthcare LLM Research: A Scientometric\n Perspective","summary":" The deployment of large language models (LLMs) in healthcare has demonstrated\nsubstantial potential for enhancing clinical decision-making, administrative\nefficiency, and patient outcomes. However, the underrepresentation of diverse\ngroups in the development and application of these models can perpetuate\nbiases, leading to inequitable healthcare delivery. This paper presents a\ncomprehensive scientometric analysis of LLM research for healthcare, including\ndata from January 1, 2021, to July 1, 2024. By analyzing metadata from PubMed\nand Dimensions, including author affiliations, countries, and funding sources,\nwe assess the diversity of contributors to LLM research. Our findings highlight\nsignificant gender and geographic disparities, with a predominance of male\nauthors and contributions primarily from high-income countries (HICs). We\nintroduce a novel journal diversity index based on Gini diversity to measure\nthe inclusiveness of scientific publications. Our results underscore the\nnecessity for greater representation in order to ensure the equitable\napplication of LLMs in healthcare. 
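The healthcare scientometrics abstract above introduces a journal diversity index "based on Gini diversity". A common form of such a measure is the Gini-Simpson index, $1 - \sum_i p_i^2$; whether the paper uses exactly this formulation is an assumption, so the sketch below is illustrative only.

```python
# Gini-Simpson diversity: 1 - sum(p_i^2), higher means more diverse.
from collections import Counter

def gini_simpson(labels):
    counts = Counter(labels)
    total = sum(counts.values())
    return 1.0 - sum((c / total) ** 2 for c in counts.values())

# Toy usage: author country labels for one journal.
print(gini_simpson(["US", "US", "UK", "IN", "US", "BR"]))
```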
We propose actionable strategies to enhance\ndiversity and inclusivity in artificial intelligence research, with the\nultimate goal of fostering a more inclusive and equitable future in healthcare\ninnovation.\n","authors":["David Restrepo","Chenwei Wu","Constanza Vásquez-Venegas","João Matos","Jack Gallifant","Leo Anthony Celi","Danielle S. Bitterman","Luis Filipe Nakayama"],"pdf_url":"https://arxiv.org/pdf/2406.13152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02887v2","updated":"2024-09-02T15:41:34Z","published":"2024-05-05T10:52:09Z","title":"Sentiment Analysis Across Languages: Evaluation Before and After Machine\n Translation to English","summary":" People communicate in more than 7,000 languages around the world, with around\n780 languages spoken in India alone. Despite this linguistic diversity,\nresearch on Sentiment Analysis has predominantly focused on English text data,\nresulting in a disproportionate availability of sentiment resources for\nEnglish. This paper examines the performance of transformer models in Sentiment\nAnalysis tasks across multilingual datasets and text that has undergone machine\ntranslation. By comparing the effectiveness of these models in different\nlinguistic contexts, we gain insights into their performance variations and\npotential implications for sentiment analysis across diverse languages. We also\ndiscuss the shortcomings and potential for future work towards the end.\n","authors":["Aekansh Kathunia","Mohammad Kaif","Nalin Arora","N Narotam"],"pdf_url":"https://arxiv.org/pdf/2405.02887v2.pdf","comment":"6 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2310.12537v3","updated":"2024-09-02T12:36:06Z","published":"2023-10-19T07:39:00Z","title":"ExtractGPT: Exploring the Potential of Large Language Models for Product\n Attribute Value Extraction","summary":" In order to facilitate features such as faceted product search and product\ncomparison, e-commerce platforms require accurately structured product data,\nincluding precise attribute/value pairs. Vendors often times provide\nunstructured product descriptions consisting only of an offer title and a\ntextual description. Consequently, extracting attribute values from titles and\ndescriptions is vital for e-commerce platforms. State-of-the-art attribute\nvalue extraction methods based on pre-trained language models, such as BERT,\nface two drawbacks (i) the methods require significant amounts of task-specific\ntraining data and (ii) the fine-tuned models have problems with generalising to\nunseen attribute values that were not part of the training data. This paper\nexplores the potential of using large language models as a more training\ndata-efficient and more robust alternative to existing AVE methods. We propose\nprompt templates for describing the target attributes of the extraction to the\nLLM, covering both zero-shot and few-shot scenarios. In the zero-shot scenario,\ntextual and JSON-based target schema representations of the attributes are\ncompared. In the few-shot scenario, we investigate (i) the provision of example\nattribute values, (ii) the selection of in-context demonstrations, (iii)\nshuffled ensembling to prevent position bias, and (iv) fine-tuning the LLM. We\nevaluate the prompt templates in combination with hosted LLMs, such as GPT-3.5\nand GPT-4, and open-source LLMs which can be run locally. We compare the\nperformance of the LLMs to the PLM-based methods SU-OpenTag, AVEQA, and MAVEQA.\nThe highest average F1-score of 86% was achieved by GPT-4. 
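In the spirit of the attribute-value-extraction abstract above, a small illustration of a zero-shot prompt that describes the target attributes as a JSON schema and asks for JSON output. The wording below is an assumption, not the paper's exact template, and no particular LLM API is implied.

```python
# Illustrative zero-shot extraction prompt builder (no LLM call included).
import json

def build_extraction_prompt(title, description, attributes):
    schema = {attr: "string or null" for attr in attributes}
    return (
        "Extract the following attributes from the product offer. "
        "Return a JSON object matching this schema, using null when an "
        f"attribute is not mentioned:\n{json.dumps(schema, indent=2)}\n\n"
        f"Title: {title}\nDescription: {description}\nJSON:"
    )

print(build_extraction_prompt(
    title="Espresso Maker X200, 15 bar, stainless steel",
    description="Compact machine with 1.2 L water tank.",
    attributes=["brand", "pressure", "material", "tank capacity"],
))
```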
Llama-3-70B performs\nonly 3% worse than GPT-4, making it a competitive open-source alternative.\nGiven the same training data, this prompt/GPT-4 combination outperforms the\nbest PLM baseline by an average of 6% F1-score.\n","authors":["Alexander Brinkmann","Roee Shraga","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2310.12537v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20352v2","updated":"2024-09-02T12:07:54Z","published":"2023-10-31T10:47:33Z","title":"AMERICANO: Argument Generation with Discourse-driven Decomposition and\n Agent Interaction","summary":" Argument generation is a challenging task in natural language processing,\nwhich requires rigorous reasoning and proper content organization. Inspired by\nrecent chain-of-thought prompting that breaks down a complex task into\nintermediate steps, we propose Americano, a novel framework with agent\ninteraction for argument generation. Our approach decomposes the generation\nprocess into sequential actions grounded on argumentation theory, which first\nexecutes actions sequentially to generate argumentative discourse components,\nand then produces a final argument conditioned on the components. To further\nmimic the human writing process and improve the left-to-right generation\nparadigm of current autoregressive language models, we introduce an argument\nrefinement module which automatically evaluates and refines argument drafts\nbased on feedback received. We evaluate our framework on the task of\ncounterargument generation using a subset of Reddit/CMV dataset. The results\nshow that our method outperforms both end-to-end and chain-of-thought prompting\nmethods and can generate more coherent and persuasive arguments with diverse\nand rich contents.\n","authors":["Zhe Hu","Hou Pong Chan","Yu Yin"],"pdf_url":"https://arxiv.org/pdf/2310.20352v2.pdf","comment":"INLG 2024"},{"id":"http://arxiv.org/abs/2408.14438v3","updated":"2024-09-02T11:59:05Z","published":"2024-08-26T17:25:16Z","title":"Evaluating Large Language Models on Spatial Tasks: A Multi-Task\n Benchmarking Study","summary":" The advent of large language models such as ChatGPT, Gemini, and others has\nunderscored the importance of evaluating their diverse capabilities, ranging\nfrom natural language understanding to code generation. However, their\nperformance on spatial tasks has not been comprehensively assessed. This study\naddresses this gap by introducing a novel multi-task spatial evaluation\ndataset, designed to systematically explore and compare the performance of\nseveral advanced models on spatial tasks. The dataset encompasses twelve\ndistinct task types, including spatial understanding and path planning, each\nwith verified, accurate answers. We evaluated multiple models, including\nOpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase\ntesting approach. Initially, we conducted zero-shot testing, followed by\ncategorizing the dataset by difficulty and performing prompt tuning tests.\nResults indicate that gpt-4o achieved the highest overall accuracy in the first\nphase, with an average of 71.3%. Although moonshot-v1-8k slightly\nunderperformed overall, it surpassed gpt-4o in place name recognition tasks.\nThe study also highlights the impact of prompt strategies on model performance\nin specific tasks. 
For example, the Chain-of-Thought (COT) strategy increased\ngpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot\nstrategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to\n76.3%.\n","authors":["Liuchang Xu","Shuo Zhao","Qingming Lin","Luyao Chen","Qianqian Luo","Sensen Wu","Xinyue Ye","Hailin Feng","Zhenhong Du"],"pdf_url":"https://arxiv.org/pdf/2408.14438v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07448v2","updated":"2024-09-02T11:45:41Z","published":"2024-08-14T10:36:17Z","title":"LiveFC: A System for Live Fact-Checking of Audio Streams","summary":" The advances in the digital era have led to rapid dissemination of\ninformation. This has also aggravated the spread of misinformation and\ndisinformation. This has potentially serious consequences, such as civil\nunrest. While fact-checking aims to combat this, manual fact-checking is\ncumbersome and not scalable. While automated fact-checking approaches exist,\nthey do not operate in real-time and do not always account for spread of\nmisinformation through different modalities. This is particularly important as\nproactive fact-checking on live streams in real-time can help people be\ninformed of false narratives and prevent catastrophic consequences that may\ncause civil unrest. This is particularly relevant with the rapid dissemination\nof information through video on social media platforms or other streams like\npolitical rallies and debates. Hence, in this work we develop a platform named\nLiveFC, that can aid in fact-checking live audio streams in real-time. LiveFC\nhas a user-friendly interface that displays the claims detected along with\ntheir veracity and evidence for live streams with associated speakers for\nclaims from respective segments. The app can be accessed at\nhttp://livefc.factiverse.ai and a screen recording of the demo can be found at\nhttps://bit.ly/3WVAoIw.\n","authors":["Venktesh V","Vinay Setty"],"pdf_url":"https://arxiv.org/pdf/2408.07448v2.pdf","comment":"Under Review, 11 pages"},{"id":"http://arxiv.org/abs/2403.08564v3","updated":"2024-09-02T11:09:55Z","published":"2024-03-13T14:19:08Z","title":"Generalizing Fairness to Generative Language Models via Reformulation of\n Non-discrimination Criteria","summary":" Generative AI, such as large language models, has undergone rapid development\nwithin recent years. As these models become increasingly available to the\npublic, concerns arise about perpetuating and amplifying harmful biases in\napplications. Gender stereotypes can be harmful and limiting for the\nindividuals they target, whether they consist of misrepresentation or\ndiscrimination. Recognizing gender bias as a pervasive societal construct, this\npaper studies how to uncover and quantify the presence of gender biases in\ngenerative language models. In particular, we derive generative AI analogues of\nthree well-known non-discrimination criteria from classification, namely\nindependence, separation and sufficiency. To demonstrate these criteria in\naction, we design prompts for each of the criteria with a focus on occupational\ngender stereotype, specifically utilizing the medical test to introduce the\nground truth in the generative AI context. 
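The fairness abstract above builds on the three classical non-discrimination criteria from classification. As a point of reference, a minimal sketch of how the classification versions are usually measured (gaps near zero mean the criterion approximately holds); the paper's generative-AI analogues are prompt-based and are not reproduced here.

```python
# Classical criteria on binary predictions: independence, separation, sufficiency.
import numpy as np

def independence_gap(y_pred, group):          # P(yhat=1 | A) across groups
    return abs(y_pred[group == 0].mean() - y_pred[group == 1].mean())

def separation_gap(y_pred, y_true, group):    # P(yhat=1 | Y, A) across groups
    gaps = []
    for y in (0, 1):
        m = y_true == y
        gaps.append(abs(y_pred[m & (group == 0)].mean()
                        - y_pred[m & (group == 1)].mean()))
    return max(gaps)

def sufficiency_gap(y_pred, y_true, group):   # P(Y=1 | yhat, A) across groups
    gaps = []
    for p in (0, 1):
        m = y_pred == p
        gaps.append(abs(y_true[m & (group == 0)].mean()
                        - y_true[m & (group == 1)].mean()))
    return max(gaps)

# Toy usage with random binary data.
rng = np.random.default_rng(0)
y_true, y_pred, group = (rng.integers(0, 2, 1000) for _ in range(3))
print(independence_gap(y_pred, group), separation_gap(y_pred, y_true, group))
```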
Our results address the presence of\noccupational gender bias within such conversational language models.\n","authors":["Sara Sterlie","Nina Weng","Aasa Feragen"],"pdf_url":"https://arxiv.org/pdf/2403.08564v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05141v3","updated":"2024-09-02T10:55:30Z","published":"2024-08-09T15:53:55Z","title":"A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning","summary":" Retrieval-augmented generation (RAG) is a framework enabling large language\nmodels (LLMs) to enhance their accuracy and reduce hallucinations by\nintegrating external knowledge bases. In this paper, we introduce a hybrid RAG\nsystem enhanced through a comprehensive suite of optimizations that\nsignificantly improve retrieval quality, augment reasoning capabilities, and\nrefine numerical computation ability. We refined the text chunks and tables in\nweb pages, added attribute predictors to reduce hallucinations, conducted LLM\nKnowledge Extractor and Knowledge Graph Extractor, and finally built a\nreasoning strategy with all the references. We evaluated our system on the CRAG\ndataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and\nonline evaluations demonstrate that our system significantly enhances complex\nreasoning capabilities. In local evaluations, we have significantly improved\naccuracy and reduced error rates compared to the baseline model, achieving a\nnotable increase in scores. In the meanwhile, we have attained outstanding\nresults in online assessments, demonstrating the performance and generalization\ncapabilities of the proposed system. The source code for our system is released\nin \\url{https://gitlab.aicrowd.com/shizueyy/crag-new}.\n","authors":["Ye Yuan","Chengwu Liu","Jingyang Yuan","Gongbo Sun","Siqi Li","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05141v3.pdf","comment":"Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024"},{"id":"http://arxiv.org/abs/2406.03816v2","updated":"2024-09-02T09:48:18Z","published":"2024-06-06T07:40:00Z","title":"ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search","summary":" Recent methodologies in LLM self-training mostly rely on LLM generating\nresponses and filtering those with correct output answers as training data.\nThis approach often yields a low-quality fine-tuning training set (e.g.,\nincorrect plans or intermediate reasoning). In this paper, we develop a\nreinforced self-training approach, called ReST-MCTS*, based on integrating\nprocess reward guidance with tree search MCTS* for collecting higher-quality\nreasoning traces as well as per-step value to train policy and reward models.\nReST-MCTS* circumvents the per-step manual annotation typically used to train\nprocess rewards by tree-search-based reinforcement learning: Given oracle final\ncorrect answers, ReST-MCTS* is able to infer the correct process rewards by\nestimating the probability this step can help lead to the correct answer. These\ninferred rewards serve dual purposes: they act as value targets for further\nrefining the process reward model and also facilitate the selection of\nhigh-quality traces for policy model self-training. We first show that the\ntree-search policy in ReST-MCTS* achieves higher accuracy compared with prior\nLLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same\nsearch budget. 
We then show that by using traces searched by this tree-search\npolicy as training data, we can continuously enhance the three language models\nfor multiple iterations, and outperform other self-training algorithms such as\nReST$^\\text{EM}$ and Self-Rewarding LM.\n","authors":["Dan Zhang","Sining Zhoubian","Ziniu Hu","Yisong Yue","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2406.03816v2.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2406.16069v2","updated":"2024-09-02T09:13:51Z","published":"2024-06-23T10:36:35Z","title":"FastMem: Fast Memorization of Prompt Improves Context Awareness of Large\n Language Models","summary":" Large language models (LLMs) excel in generating coherent text, but they\noften struggle with context awareness, leading to inaccuracies in tasks\nrequiring faithful adherence to provided information. We introduce FastMem, a\nnovel method designed to enhance instruction fine-tuned LLMs' context awareness\nthrough fast memorization of the prompt. FastMem maximizes the likelihood of\nthe prompt before inference by fine-tuning only the last Feed-Forward Network\n(FFN) module. This targeted approach ensures efficient optimization without\noverfitting, significantly improving the model's ability to comprehend and\naccurately follow the context. Our experiments demonstrate substantial gains in\nreading comprehension, text summarization and adherence to output structures.\nFor instance, FastMem improves the accuracy of Llama 3-8B-Inst on the NQ-SWAP\ndataset from 59.1% to 71.6%, and reduces the output structure failure rate of\nQwen 1.5-4B-Chat from 34.9% to 25.5%. Extensive experimental results highlight\nFastMem's potential to offer a robust solution to enhance the reliability and\naccuracy of LLMs in various applications. Our code is available at:\nhttps://github.com/IAAR-Shanghai/FastMem\n","authors":["Junyi Zhu","Shuochen Liu","Yu Yu","Bo Tang","Yibo Yan","Zhiyu Li","Feiyu Xiong","Tong Xu","Matthew B. Blaschko"],"pdf_url":"https://arxiv.org/pdf/2406.16069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16837v3","updated":"2024-09-02T08:33:21Z","published":"2023-06-29T10:29:23Z","title":"A Formal Perspective on Byte-Pair Encoding","summary":" Byte-Pair Encoding (BPE) is a popular algorithm used for tokenizing data in\nNLP, despite being devised initially as a compression method. BPE appears to be\na greedy algorithm at face value, but the underlying optimization problem that\nBPE seeks to solve has not yet been laid down. We formalize BPE as a\ncombinatorial optimization problem. Via submodular functions, we prove that the\niterative greedy version is a\n$\\frac{1}{{\\sigma(\\boldsymbol{\\mu}^\\star)}}(1-e^{-{\\sigma(\\boldsymbol{\\mu}^\\star)}})$-approximation\nof an optimal merge sequence, where ${\\sigma(\\boldsymbol{\\mu}^\\star)}$ is the\ntotal backward curvature with respect to the optimal merge sequence\n$\\boldsymbol{\\mu}^\\star$. 
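As an aside to the Byte-Pair Encoding abstract above: the greedy merge procedure it formalizes can be illustrated with a short Python sketch. This is a naive O(NM) illustration under assumed names (`merge_pair`, `greedy_bpe`); it is not the authors' code nor the faster O(N log M) implementation they describe.

```python
from collections import Counter

def merge_pair(seq, pair):
    """Replace every adjacent occurrence of `pair` in `seq` with one merged symbol."""
    merged, i = [], 0
    while i < len(seq):
        if i + 1 < len(seq) and (seq[i], seq[i + 1]) == pair:
            merged.append(seq[i] + seq[i + 1])
            i += 2
        else:
            merged.append(seq[i])
            i += 1
    return merged

def greedy_bpe(sequence, num_merges):
    """Greedy BPE: at each step merge the currently most frequent adjacent pair."""
    merges = []
    for _ in range(num_merges):
        pairs = Counter(zip(sequence, sequence[1:]))
        if not pairs:
            break
        best = max(pairs, key=pairs.get)  # the greedy choice whose quality the paper bounds
        merges.append(best)
        sequence = merge_pair(sequence, best)
    return sequence, merges

# toy usage
seq, merges = greedy_bpe(list("abababcabc"), num_merges=3)
print(seq, merges)
```

Each iteration counts adjacent symbol pairs and merges the most frequent one; this is exactly the iterative greedy choice whose approximation factor the abstract states.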
Empirically the lower bound of the approximation is\n$\\approx 0.37$.\n We provide a faster implementation of BPE which improves the runtime\ncomplexity from $\\mathcal{O}\\left(N M\\right)$ to $\\mathcal{O}\\left(N \\log\nM\\right)$, where $N$ is the sequence length and $M$ is the merge count.\nFinally, we optimize the brute-force algorithm for optimal BPE using\nmemoization.\n","authors":["Vilém Zouhar","Clara Meister","Juan Luis Gastaldi","Li Du","Tim Vieira","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2306.16837v3.pdf","comment":"ACL 2023"},{"id":"http://arxiv.org/abs/2408.15366v2","updated":"2024-09-02T08:18:52Z","published":"2024-08-27T19:03:11Z","title":"Pitfalls and Outlooks in Using COMET","summary":" Since its introduction, the COMET metric has blazed a trail in the machine\ntranslation community, given its strong correlation with human judgements of\ntranslation quality. Its success stems from being a modified pre-trained\nmultilingual model finetuned for quality assessment. However, it being a\nmachine learning model also gives rise to a new set of pitfalls that may not be\nwidely known. We investigate these unexpected behaviours from three aspects: 1)\ntechnical: obsolete software versions and compute precision; 2) data: empty\ncontent, language mismatch, and translationese at test time as well as\ndistribution and domain biases in training; 3) usage and reporting:\nmulti-reference support and model referencing in the literature. All of these\nproblems imply that COMET scores is not comparable between papers or even\ntechnical setups and we put forward our perspective on fixing each issue.\nFurthermore, we release the SacreCOMET package that can generate a signature\nfor the software and model configuration as well as an appropriate citation.\nThe goal of this work is to help the community make more sound use of the COMET\nmetric.\n","authors":["Vilém Zouhar","Pinzhen Chen","Tsz Kin Lam","Nikita Moghe","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2408.15366v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16020v3","updated":"2024-09-02T07:54:54Z","published":"2024-06-23T05:40:26Z","title":"AudioBench: A Universal Benchmark for Audio Large Language Models","summary":" We introduce AudioBench, a universal benchmark designed to evaluate Audio\nLarge Language Models (AudioLLMs). It encompasses 8 distinct tasks and 26\ndatasets, among which, 7 are newly proposed datasets. The evaluation targets\nthree main aspects: speech understanding, audio scene understanding, and voice\nunderstanding (paralinguistic). Despite recent advancements, there lacks a\ncomprehensive benchmark for AudioLLMs on instruction following capabilities\nconditioned on audio signals. AudioBench addresses this gap by setting up\ndatasets as well as desired evaluation metrics. Besides, we also evaluated the\ncapabilities of five popular models and found that no single model excels\nconsistently across all tasks. We outline the research outlook for AudioLLMs\nand anticipate that our open-sourced evaluation toolkit, data, and leaderboard\nwill offer a robust testbed for future model developments.\n","authors":["Bin Wang","Xunlong Zou","Geyu Lin","Shuo Sun","Zhuohan Liu","Wenyu Zhang","Zhengyuan Liu","AiTi Aw","Nancy F. 
Chen"],"pdf_url":"https://arxiv.org/pdf/2406.16020v3.pdf","comment":"v3 - Abundent update on models and evaluation details; Code:\n https://github.com/AudioLLMs/AudioBench"},{"id":"http://arxiv.org/abs/2308.09067v3","updated":"2024-09-02T07:26:46Z","published":"2023-08-17T15:54:38Z","title":"Contrasting Linguistic Patterns in Human and LLM-Generated News Text","summary":" We conduct a quantitative analysis contrasting human-written English news\ntext with comparable large language model (LLM) output from six different LLMs\nthat cover three different families and four sizes in total. Our analysis spans\nseveral measurable linguistic dimensions, including morphological, syntactic,\npsychometric, and sociolinguistic aspects. The results reveal various\nmeasurable differences between human and AI-generated texts. Human texts\nexhibit more scattered sentence length distributions, more variety of\nvocabulary, a distinct use of dependency and constituent types, shorter\nconstituents, and more optimized dependency distances. Humans tend to exhibit\nstronger negative emotions (such as fear and disgust) and less joy compared to\ntext generated by LLMs, with the toxicity of these models increasing as their\nsize grows. LLM outputs use more numbers, symbols and auxiliaries (suggesting\nobjective language) than human texts, as well as more pronouns. The sexist bias\nprevalent in human text is also expressed by LLMs, and even magnified in all of\nthem but one. Differences between LLMs and humans are larger than between LLMs.\n","authors":["Alberto Muñoz-Ortiz","Carlos Gómez-Rodríguez","David Vilares"],"pdf_url":"https://arxiv.org/pdf/2308.09067v3.pdf","comment":"Published at Artificial Intelligence Review vol. 57, 265"},{"id":"http://arxiv.org/abs/2310.05191v2","updated":"2024-09-02T06:24:32Z","published":"2023-10-08T15:00:04Z","title":"LLM-as-a-tutor in EFL Writing Education: Focusing on Evaluation of\n Student-LLM Interaction","summary":" In the context of English as a Foreign Language (EFL) writing education,\nLLM-as-a-tutor can assist students by providing real-time feedback on their\nessays. However, challenges arise in assessing LLM-as-a-tutor due to differing\nstandards between educational and general use cases. To bridge this gap, we\nintegrate pedagogical principles to assess student-LLM interaction. First, we\nexplore how LLMs can function as English tutors, providing effective essay\nfeedback tailored to students. Second, we propose three metrics to evaluate\nLLM-as-a-tutor specifically designed for EFL writing education, emphasizing\npedagogical aspects. In this process, EFL experts evaluate the feedback from\nLLM-as-a-tutor regarding quality and characteristics. 
On the other hand, EFL\nlearners assess their learning outcomes from interaction with LLM-as-a-tutor.\nThis approach lays the groundwork for developing LLMs-as-a-tutor tailored to\nthe needs of EFL learners, advancing the effectiveness of writing education in\nthis context.\n","authors":["Jieun Han","Haneul Yoo","Junho Myung","Minsun Kim","Hyunseung Lim","Yoonsu Kim","Tak Yeon Lee","Hwajung Hong","Juho Kim","So-Yeon Ahn","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2310.05191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14033v2","updated":"2024-09-02T05:55:06Z","published":"2024-08-26T05:55:48Z","title":"MLR-Copilot: Autonomous Machine Learning Research based on Large\n Language Models Agents","summary":" Machine learning research, crucial for technological advancements and\ninnovation, often faces significant challenges due to its inherent complexity,\nslow pace of experimentation, and the necessity for specialized expertise.\nMotivated by this, we present a new systematic framework, autonomous Machine\nLearning Research with large language models (MLR-Copilot), designed to enhance\nmachine learning research productivity through the automatic generation and\nimplementation of research ideas using Large Language Model (LLM) agents. The\nframework consists of three phases: research idea generation, experiment\nimplementation, and implementation execution. First, existing research papers\nare used to generate hypotheses and experimental plans vis IdeaAgent powered by\nLLMs. Next, the implementation generation phase translates these plans into\nexecutables with ExperimentAgent. This phase leverages retrieved prototype code\nand optionally retrieves candidate models and data. Finally, the execution\nphase, also managed by ExperimentAgent, involves running experiments with\nmechanisms for human feedback and iterative debugging to enhance the likelihood\nof achieving executable research outcomes. We evaluate our framework on five\nmachine learning research tasks and the experimental results show the\nframework's potential to facilitate the research progress and innovations.\n","authors":["Ruochen Li","Teerth Patel","Qingyun Wang","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2408.14033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03024v3","updated":"2024-09-02T05:51:02Z","published":"2023-08-06T05:23:25Z","title":"Show Me the World in My Language: Establishing the First Baseline for\n Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually'' translating scene text from a\nsource language (e.g., Hindi) to a target language (e.g., English). Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe source scene text, such as font, size, and background. There are several\nchallenges associated with this task, such as translation with limited context,\ndeciding between translation and transliteration, accommodating varying text\nlengths within fixed spatial boundaries, and preserving the font and background\nstyles of the source scene text in the target language. To address this\nproblem, we make the following contributions: (i) We study visual translation\nas a standalone problem for the first time in the literature. (ii) We present a\ncascaded framework for visual translation that combines state-of-the-art\nmodules for scene text recognition, machine translation, and scene text\nsynthesis as a baseline for the task. 
(iii) We propose a set of task-specific\ndesign enhancements to design a variant of the baseline to obtain performance\nimprovements. (iv) Currently, the existing related literature lacks any\ncomprehensive performance evaluation for this novel task. To fill this gap, we\nintroduce several automatic and user-assisted evaluation metrics designed\nexplicitly for evaluating visual translation. Further, we evaluate presented\nbaselines for translating scene text between Hindi and English. Our experiments\ndemonstrate that although we can effectively perform visual translation over a\nlarge collection of scene text images, the presented baseline only partially\naddresses challenges posed by visual translation tasks. We firmly believe that\nthis new task and the limitations of existing models, as reported in this\npaper, should encourage further research in visual translation.\n","authors":["Shreyas Vaidya","Arvind Kumar Sharma","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v3.pdf","comment":"Accepted at ICPR 2024, Project Website:\n https://vl2g.github.io/projects/visTrans/"},{"id":"http://arxiv.org/abs/2403.06764v3","updated":"2024-09-02T05:48:54Z","published":"2024-03-11T14:35:32Z","title":"An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference\n Acceleration for Large Vision-Language Models","summary":" In this study, we identify the inefficient attention phenomena in Large\nVision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,\nQwenVL-Chat and Video-LLaVA. We find out that the attention computation over\nvisual tokens is of extreme inefficiency in the deep layers of popular LVLMs,\nsuggesting a need for a sparser approach compared to textual data handling. To\nthis end, we introduce FastV, a versatile plug-and-play method designed to\noptimize computational efficiency by learning adaptive attention patterns in\nearly layers and pruning visual tokens in subsequent ones. Our evaluations\ndemonstrate FastV's ability to dramatically reduce computational costs (e.g., a\n45 reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a\nwide range of image and video understanding tasks. The computational efficiency\nand performance trade-off of FastV are highly customizable and\npareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve\na lower budget than that of a 7B-parameter model, while still maintaining\nsuperior performance. We believe FastV has practical values for deployment of\nLVLMs in edge devices and commercial models. Code is released at\nhttps://github.com/pkunlp-icler/FastV.\n","authors":["Liang Chen","Haozhe Zhao","Tianyu Liu","Shuai Bai","Junyang Lin","Chang Zhou","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2403.06764v3.pdf","comment":"Accepted to ECCV 2024 (Oral), code is released at\n https://github.com/pkunlp-icler/FastV,"},{"id":"http://arxiv.org/abs/2406.10311v2","updated":"2024-09-02T03:37:35Z","published":"2024-06-14T06:47:40Z","title":"CHiSafetyBench: A Chinese Hierarchical Safety Benchmark for Large\n Language Models","summary":" With the profound development of large language models(LLMs), their safety\nconcerns have garnered increasing attention. However, there is a scarcity of\nChinese safety benchmarks for LLMs, and the existing safety taxonomies are\ninadequate, lacking comprehensive safety detection capabilities in authentic\nChinese scenarios. 
In this work, we introduce CHiSafetyBench, a dedicated\nsafety benchmark for evaluating LLMs' capabilities in identifying risky content\nand refusing answering risky questions in Chinese contexts. CHiSafetyBench\nincorporates a dataset that covers a hierarchical Chinese safety taxonomy\nconsisting of 5 risk areas and 31 categories. This dataset comprises two types\nof tasks: multiple-choice questions and question-answering, evaluating LLMs\nfrom the perspectives of risk content identification and the ability to refuse\nanswering risky questions respectively. Utilizing this benchmark, we validate\nthe feasibility of automatic evaluation as a substitute for human evaluation\nand conduct comprehensive automatic safety assessments on mainstream Chinese\nLLMs. Our experiments reveal the varying performance of different models across\nvarious safety domains, indicating that all models possess considerable\npotential for improvement in Chinese safety capabilities. Our dataset is\npublicly available at\nhttps://github.com/UnicomAI/UnicomBenchmark/tree/main/CHiSafetyBench.\n","authors":["Wenjing Zhang","Xuejiao Lei","Zhaoxiang Liu","Meijuan An","Bikun Yang","KaiKai Zhao","Kai Wang","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2406.10311v2.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.04818v2","updated":"2024-09-02T02:44:11Z","published":"2024-05-08T05:36:52Z","title":"ACORN: Aspect-wise Commonsense Reasoning Explanation Evaluation","summary":" Evaluating the quality of free-text explanations is a multifaceted,\nsubjective, and labor-intensive task. Large language models (LLMs) present an\nappealing alternative due to their potential for consistency, scalability, and\ncost-efficiency. In this work, we present ACORN, a new dataset of 3,500\nfree-text explanations and aspect-wise quality ratings, and use it to evaluate\nhow LLMs rate explanations. We observed that larger models outputted labels\nthat maintained or increased the inter-annotator agreement, suggesting that\nthey are within the expected variance between human raters. However, their\ncorrelation with majority-voted human ratings varied across different quality\naspects, indicating that they are not a complete replacement. In turn, using\nLLMs as a supplement to a smaller group of human raters in some cases improved\nthe correlation with the original majority labels. However, the effect was\nlimited to cases where human raters were scarce, and an additional human rater\nhad a more pronounced effect in all cases. Overall, we recommend against using\nLLMs as a complete replacement for human raters but encourage using them in\nconfigurations that end with targeted human involvement. Data available here:\nhttps://github.com/a-brassard/ACORN\n","authors":["Ana Brassard","Benjamin Heinzerling","Keito Kudo","Keisuke Sakaguchi","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2405.04818v2.pdf","comment":"18 pages, 7 figures, accepted to COLM 2024. Data available here:\n https://github.com/a-brassard/ACORN"},{"id":"http://arxiv.org/abs/2402.14154v3","updated":"2024-09-02T02:41:26Z","published":"2024-02-21T22:27:40Z","title":"MM-Soc: Benchmarking Multimodal Large Language Models in Social Media\n Platforms","summary":" Social media platforms are hubs for multimodal information exchange,\nencompassing text, images, and videos, making it challenging for machines to\ncomprehend the information or emotions associated with interactions in online\nspaces. 
Multimodal Large Language Models (MLLMs) have emerged as a promising\nsolution to these challenges, yet they struggle to accurately interpret human\nemotions and complex content such as misinformation. This paper introduces\nMM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of\nmultimodal social media content. MM-Soc compiles prominent multimodal datasets\nand incorporates a novel large-scale YouTube tagging dataset, targeting a range\nof tasks from misinformation detection, hate speech detection, and social\ncontext generation. Through our exhaustive evaluation on ten size-variants of\nfour open-source MLLMs, we have identified significant performance disparities,\nhighlighting the need for advancements in models' social understanding\ncapabilities. Our analysis reveals that, in a zero-shot setting, various types\nof MLLMs generally exhibit difficulties in handling social media tasks.\nHowever, MLLMs demonstrate performance improvements post fine-tuning,\nsuggesting potential pathways for improvement. Our code and data are available\nat https://github.com/claws-lab/MMSoc.git.\n","authors":["Yiqiao Jin","Minje Choi","Gaurav Verma","Jindong Wang","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.14154v3.pdf","comment":"In Proceedings of ACL 2024"},{"id":"http://arxiv.org/abs/2408.15879v2","updated":"2024-09-02T02:30:51Z","published":"2024-08-28T15:50:41Z","title":"Persuasion Games using Large Language Models","summary":" Large Language Models (LLMs) have emerged as formidable instruments capable\nof comprehending and producing human-like text. This paper explores the\npotential of LLMs, to shape user perspectives and subsequently influence their\ndecisions on particular tasks. This capability finds applications in diverse\ndomains such as Investment, Credit cards and Insurance, wherein they assist\nusers in selecting appropriate insurance policies, investment plans, Credit\ncards, Retail, as well as in Behavioral Change Support Systems (BCSS).\n We present a sophisticated multi-agent framework wherein a consortium of\nagents operate in collaborative manner. The primary agent engages directly with\nuser agents through persuasive dialogue, while the auxiliary agents perform\ntasks such as information retrieval, response analysis, development of\npersuasion strategies, and validation of facts. Empirical evidence from our\nexperiments demonstrates that this collaborative methodology significantly\nenhances the persuasive efficacy of the LLM. We continuously analyze the\nresistance of the user agent to persuasive efforts and counteract it by\nemploying a combination of rule-based and LLM-based resistance-persuasion\nmapping techniques.\n We employ simulated personas and generate conversations in insurance,\nbanking, and retail domains to evaluate the proficiency of large language\nmodels (LLMs) in recognizing, adjusting to, and influencing various personality\ntypes. Concurrently, we examine the resistance mechanisms employed by LLM\nsimulated personas. 
Persuasion is quantified via measurable surveys before and\nafter interaction, LLM-generated scores on conversation, and user decisions\n(purchase or non-purchase).\n","authors":["Ganesh Prasath Ramani","Shirish Karande","Santhosh V","Yash Bhatia"],"pdf_url":"https://arxiv.org/pdf/2408.15879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06458v2","updated":"2024-09-02T02:26:18Z","published":"2023-10-10T09:29:38Z","title":"Cultural Compass: Predicting Transfer Learning Success in Offensive\n Language Detection with Cultural Features","summary":" The increasing ubiquity of language technology necessitates a shift towards\nconsidering cultural diversity in the machine learning realm, particularly for\nsubjective tasks that rely heavily on cultural nuances, such as Offensive\nLanguage Detection (OLD). Current understanding underscores that these tasks\nare substantially influenced by cultural values, however, a notable gap exists\nin determining if cultural features can accurately predict the success of\ncross-cultural transfer learning for such subjective tasks. Addressing this,\nour study delves into the intersection of cultural features and transfer\nlearning effectiveness. The findings reveal that cultural value surveys indeed\npossess a predictive power for cross-cultural transfer learning success in OLD\ntasks and that it can be further improved using offensive word distance. Based\non these results, we advocate for the integration of cultural information into\ndatasets. Additionally, we recommend leveraging data sources rich in cultural\ninformation, such as surveys, to enhance cultural adaptability. Our research\nsignifies a step forward in the quest for more inclusive, culturally sensitive\nlanguage technologies.\n","authors":["Li Zhou","Antonia Karamolegkou","Wenyu Chen","Daniel Hershcovich"],"pdf_url":"https://arxiv.org/pdf/2310.06458v2.pdf","comment":"Findings of EMNLP 2023 (update)"},{"id":"http://arxiv.org/abs/2303.12816v4","updated":"2024-09-02T01:48:34Z","published":"2023-03-22T07:34:33Z","title":"From Wide to Deep: Dimension Lifting Network for Parameter-efficient\n Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) that maps entities and relations into vector\nrepresentations is essential for downstream applications. Conventional KGE\nmethods require high-dimensional representations to learn the complex structure\nof knowledge graph, but lead to oversized model parameters. Recent advances\nreduce parameters by low-dimensional entity representations, while developing\ntechniques (e.g., knowledge distillation or reinvented representation forms) to\ncompensate for reduced dimension. However, such operations introduce\ncomplicated computations and model designs that may not benefit large knowledge\ngraphs. To seek a simple strategy to improve the parameter efficiency of\nconventional KGE models, we take inspiration from that deeper neural networks\nrequire exponentially fewer parameters to achieve expressiveness comparable to\nwider networks for compositional structures. We view all entity representations\nas a single-layer embedding network, and conventional KGE methods that adopt\nhigh-dimensional entity representations equal widening the embedding network to\ngain expressiveness. 
To achieve parameter efficiency, we instead propose a\ndeeper embedding network for entity representations, i.e., a narrow entity\nembedding layer plus a multi-layer dimension lifting network (LiftNet).\nExperiments on three public datasets show that by integrating LiftNet, four\nconventional KGE methods with 16-dimensional representations achieve comparable\nlink prediction accuracy as original models that adopt 512-dimensional\nrepresentations, saving 68.4% to 96.9% parameters.\n","authors":["Borui Cai","Yong Xiang","Longxiang Gao","Di Wu","He Zhang","Jiong Jin","Tom Luan"],"pdf_url":"https://arxiv.org/pdf/2303.12816v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01497v1","updated":"2024-09-02T23:37:20Z","published":"2024-09-02T23:37:20Z","title":"DiversityMedQA: Assessing Demographic Biases in Medical Diagnosis using\n Large Language Models","summary":" As large language models (LLMs) gain traction in healthcare, concerns about\ntheir susceptibility to demographic biases are growing. We introduce\n{DiversityMedQA}, a novel benchmark designed to assess LLM responses to medical\nqueries across diverse patient demographics, such as gender and ethnicity. By\nperturbing questions from the MedQA dataset, which comprises medical board exam\nquestions, we created a benchmark that captures the nuanced differences in\nmedical diagnosis across varying patient profiles. Our findings reveal notable\ndiscrepancies in model performance when tested against these demographic\nvariations. Furthermore, to ensure the perturbations were accurate, we also\npropose a filtering strategy that validates each perturbation. By releasing\nDiversityMedQA, we provide a resource for evaluating and mitigating demographic\nbias in LLM medical diagnoses.\n","authors":["Rajat Rawat","Hudson McBride","Dhiyaan Nirmal","Rajarshi Ghosh","Jong Moon","Dhruv Alamuri","Sean O'Brien","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.01497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01495v1","updated":"2024-09-02T23:28:15Z","published":"2024-09-02T23:28:15Z","title":"The Compressor-Retriever Architecture for Language Model OS","summary":" Recent advancements in large language models (LLMs) have significantly\nenhanced their capacity to aggregate and process information across multiple\nmodalities, enabling them to perform a wide range of tasks such as multimodal\ndata querying, tool usage, web interactions, and handling long documents. These\ncapabilities pave the way for transforming LLMs from mere chatbots into\ngeneral-purpose agents capable of interacting with the real world. This paper\nexplores the concept of using a language model as the core component of an\noperating system (OS), effectively acting as a CPU that processes data stored\nin a context window, which functions as RAM. A key challenge in realizing such\nan LM OS is managing the life-long context and ensuring statefulness across\nsessions, a feature limited by the current session-based interaction paradigm\ndue to context window size limit. To address this, we introduce\ncompressor-retriever, a model-agnostic architecture designed for life-long\ncontext management. Unlike other long-context solutions such as\nretrieval-augmented generation, our approach exclusively uses the base model's\nforward function to compress and retrieve context, ensuring end-to-end\ndifferentiability. 
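To make the dimension-lifting idea from the knowledge-graph embedding abstract above more concrete (a narrow entity embedding table followed by a small lifting network), here is a minimal PyTorch sketch. The 16-to-512 dimensions echo the abstract; the hidden size, depth, activation, and class name are illustrative assumptions rather than the released LiftNet.

```python
import torch
import torch.nn as nn

class LiftedEntityEmbedding(nn.Module):
    """Narrow entity embedding lifted to a wider dimension by a small MLP (sketch)."""
    def __init__(self, num_entities, narrow_dim=16, wide_dim=512, hidden_dim=64):
        super().__init__()
        self.embed = nn.Embedding(num_entities, narrow_dim)  # parameter-efficient table
        self.lift = nn.Sequential(                           # dimension-lifting network
            nn.Linear(narrow_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, wide_dim),
        )

    def forward(self, entity_ids):
        return self.lift(self.embed(entity_ids))             # wide vectors for a KGE scorer

# usage: produce wide vectors for a downstream scoring function such as TransE
emb = LiftedEntityEmbedding(num_entities=10_000)
print(emb(torch.tensor([0, 1, 2])).shape)  # torch.Size([3, 512])
```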
Preliminary experiments demonstrate the effectiveness of\nthis architecture in in-context learning tasks, marking a step towards the\ndevelopment of a fully stateful LLM OS. Project repo available at:\nhttps://github.com/gblackout/LM-OS\n","authors":["Yuan Yang","Siheng Xiong","Ehsan Shareghi","Faramarz Fekri"],"pdf_url":"https://arxiv.org/pdf/2409.01495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01483v1","updated":"2024-09-02T22:35:03Z","published":"2024-09-02T22:35:03Z","title":"Revisiting SMoE Language Models by Evaluating Inefficiencies with Task\n Specific Expert Pruning","summary":" Sparse Mixture of Expert (SMoE) models have emerged as a scalable alternative\nto dense models in language modeling. These models use conditionally activated\nfeedforward subnetworks in transformer blocks, allowing for a separation\nbetween total model parameters and per-example computation. However, large\ntoken-routed SMoE models face a significant challenge: during inference, the\nentire model must be used for a sequence or a batch, resulting in high\nlatencies in a distributed setting that offsets the advantages of per-token\nsparse activation. Our research explores task-specific model pruning to inform\ndecisions about designing SMoE architectures, mainly modulating the choice of\nexpert counts in pretraining. We investigate whether such pruned models offer\nadvantages over smaller SMoE models trained from scratch, when evaluating and\ncomparing them individually on tasks. To that end, we introduce an adaptive\ntask-aware pruning technique UNCURL to reduce the number of experts per MoE\nlayer in an offline manner post-training. Our findings reveal a threshold\npruning factor for the reduction that depends on the number of experts used in\npretraining, above which, the reduction starts to degrade model performance.\nThese insights contribute to our understanding of model design choices when\npretraining with SMoE architectures, particularly useful when considering\ntask-specific inference optimization for later stages.\n","authors":["Soumajyoti Sarkar","Leonard Lausen","Volkan Cevher","Sheng Zha","Thomas Brox","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2409.01483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01482v1","updated":"2024-09-02T22:17:18Z","published":"2024-09-02T22:17:18Z","title":"Masked Mixers for Language Generation and Retrieval","summary":" Attention mechanisms that confer selective focus on a strict subset of input\nelements are nearly ubiquitous in language models today. We posit there to be\ndownside to the use of attention: most information present in the input is\nnecessarily lost. In support of this idea we observe poor input representation\naccuracy in transformers, but find more accurate representation in what we term\nmasked mixers which replace self-attention with masked convolutions. Applied to\nTinyStories the masked mixer learns causal language tasks more efficiently than\nearly transformer implementations and somewhat less efficiently than optimized,\ncurrent implementations. The most efficient learning algorithm observed for\nthis dataset is a transformer-masked mixer hybrid, suggesting that these models\nlearn in an orthogonal manner. We hypothesized that the information loss\nexhibited by transformers would be much more detrimental to retrieval than\ngeneration, and to test this we introduce an efficient training approach for\nretrieval models based on existing generative model embeddings. 
With this\nmethod, embeddings from masked mixers are found to result in far better\nsummary-to-story retrieval compared to embeddings from transformers.\n","authors":["Benjamin L. Badger"],"pdf_url":"https://arxiv.org/pdf/2409.01482v1.pdf","comment":"23 pages, 15 figures (11 primary, 4 supplementary)"},{"id":"http://arxiv.org/abs/2409.01466v1","updated":"2024-09-02T21:05:31Z","published":"2024-09-02T21:05:31Z","title":"PoliPrompt: A High-Performance Cost-Effective LLM-Based Text\n Classification Framework for Political Science","summary":" Recent advancements in large language models (LLMs) have opened new avenues\nfor enhancing text classification efficiency in political science, surpassing\ntraditional machine learning methods that often require extensive feature\nengineering, human labeling, and task-specific training. However, their\neffectiveness in achieving high classification accuracy remains questionable.\nThis paper introduces a three-stage in-context learning approach that leverages\nLLMs to improve classification accuracy while minimizing experimental costs.\nOur method incorporates automatic enhanced prompt generation, adaptive exemplar\nselection, and a consensus mechanism that resolves discrepancies between two\nweaker LLMs, refined by an advanced LLM. We validate our approach using\ndatasets from the BBC news reports, Kavanaugh Supreme Court confirmation, and\n2018 election campaign ads. The results show significant improvements in\nclassification F1 score (+0.36 for zero-shot classification) with manageable\neconomic costs (-78% compared with human labeling), demonstrating that our\nmethod effectively addresses the limitations of traditional machine learning\nwhile offering a scalable and reliable solution for text analysis in political\nscience.\n","authors":["Menglin Liu","Ge Shi"],"pdf_url":"https://arxiv.org/pdf/2409.01466v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02141v1","updated":"2024-09-02T19:39:24Z","published":"2024-09-02T19:39:24Z","title":"Efficient and Scalable Estimation of Tool Representations in Vector\n Space","summary":" Recent advancements in function calling and tool use have significantly\nenhanced the capabilities of large language models (LLMs) by enabling them to\ninteract with external information sources and execute complex tasks. However,\nthe limited context window of LLMs presents challenges when a large number of\ntools are available, necessitating efficient methods to manage prompt length\nand maintain accuracy. Existing approaches, such as fine-tuning LLMs or\nleveraging their reasoning capabilities, either require frequent retraining or\nincur significant latency overhead. A more efficient solution involves training\nsmaller models to retrieve the most relevant tools for a given query, although\nthis requires high quality, domain-specific data. To address those challenges,\nwe present a novel framework for generating synthetic data for tool retrieval\napplications and an efficient data-driven tool retrieval strategy using small\nencoder models. Empowered by LLMs, we create ToolBank, a new tool retrieval\ndataset that reflects real human user usages. For tool retrieval methodologies,\nwe propose novel approaches: (1) Tool2Vec: usage-driven tool embedding\ngeneration for tool retrieval, (2) ToolRefiner: a staged retrieval method that\niteratively improves the quality of retrieved tools, and (3) MLC: framing tool\nretrieval as a multi-label classification problem. 
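As an illustration of the embedding-based tool retrieval described in the abstract above (usage-driven tool embeddings queried by similarity), the following NumPy sketch ranks tools by cosine similarity to a query vector. The function name and the random stand-in embeddings are assumptions; a real system would produce the vectors with a small trained encoder.

```python
import numpy as np

def retrieve_tools(query_vec, tool_vecs, tool_names, k=3):
    """Return the k tools whose embeddings are most cosine-similar to the query embedding."""
    q = query_vec / np.linalg.norm(query_vec)
    t = tool_vecs / np.linalg.norm(tool_vecs, axis=1, keepdims=True)
    scores = t @ q                      # cosine similarity against every tool
    top = np.argsort(-scores)[:k]
    return [(tool_names[i], float(scores[i])) for i in top]

# toy usage with random stand-in embeddings
rng = np.random.default_rng(0)
tool_vecs = rng.normal(size=(5, 64))
query_vec = tool_vecs[2] + 0.1 * rng.normal(size=64)   # query close to tool 2
print(retrieve_tools(query_vec, tool_vecs, ["search", "calc", "weather", "email", "maps"]))
```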
With these new methods, we\nachieve improvements of up to 27.28 in Recall@K on the ToolBench dataset and\n30.5 in Recall@K on ToolBank. Additionally, we present further experimental\nresults to rigorously validate our methods. Our code is available at\n\\url{https://github.com/SqueezeAILab/Tool2Vec}\n","authors":["Suhong Moon","Siddharth Jha","Lutfi Eren Erdogan","Sehoon Kim","Woosang Lim","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2409.02141v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2207.12554v2","updated":"2024-09-02T22:49:21Z","published":"2022-07-25T22:17:19Z","title":"Inter-Frame Compression for Dynamic Point Cloud Geometry Coding","summary":" Efficient point cloud compression is essential for applications like virtual\nand mixed reality, autonomous driving, and cultural heritage. This paper\nproposes a deep learning-based inter-frame encoding scheme for dynamic point\ncloud geometry compression. We propose a lossy geometry compression scheme that\npredicts the latent representation of the current frame using the previous\nframe by employing a novel feature space inter-prediction network. The proposed\nnetwork utilizes sparse convolutions with hierarchical multiscale 3D feature\nlearning to encode the current frame using the previous frame. The proposed\nmethod introduces a novel predictor network for motion compensation in the\nfeature domain to map the latent representation of the previous frame to the\ncoordinates of the current frame to predict the current frame's feature\nembedding. The framework transmits the residual of the predicted features and\nthe actual features by compressing them using a learned probabilistic\nfactorized entropy model. At the receiver, the decoder hierarchically\nreconstructs the current frame by progressively rescaling the feature\nembedding. The proposed framework is compared to the state-of-the-art\nVideo-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud\nCompression (G-PCC) schemes standardized by the Moving Picture Experts Group\n(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta\nRate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against\nG-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame\nencoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based\ninter-frame encoding mode using HEVC. These significant performance gains are\ncross-checked and verified in the MPEG working group.\n","authors":["Anique Akhtar","Zhu Li","Geert Van der Auwera"],"pdf_url":"https://arxiv.org/pdf/2207.12554v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15920v2","updated":"2024-09-02T21:35:51Z","published":"2024-06-22T19:20:35Z","title":"SEDMamba: Enhancing Selective State Space Modelling with Bottleneck\n Mechanism and Fine-to-Coarse Temporal Fusion for Efficient Error Detection in\n Robot-Assisted Surgery","summary":" Automated detection of surgical errors can improve robotic-assisted surgery.\nDespite promising progress, existing methods still face challenges in capturing\nrich temporal context to establish long-term dependencies while maintaining\ncomputational efficiency. In this paper, we propose a novel hierarchical model\nnamed SEDMamba, which incorporates the selective state space model (SSM) into\nsurgical error detection, facilitating efficient long sequence modelling with\nlinear complexity. 
SEDMamba enhances selective SSM with a bottleneck mechanism\nand fine-to-coarse temporal fusion (FCTF) to detect and temporally localize\nsurgical errors in long videos. The bottleneck mechanism compresses and\nrestores features within their spatial dimension, thereby reducing\ncomputational complexity. FCTF utilizes multiple dilated 1D convolutional\nlayers to merge temporal information across diverse scale ranges, accommodating\nerrors of varying duration. Our work also contributes the first-of-its-kind,\nframe-level, in-vivo surgical error dataset to support error detection in real\nsurgical cases. Specifically, we deploy the clinically validated observational\nclinical human reliability assessment tool (OCHRA) to annotate the errors\nduring suturing tasks in an open-source radical prostatectomy dataset\n(SAR-RARP50). Experimental results demonstrate that our SEDMamba outperforms\nstate-of-the-art methods with at least 1.82% AUC and 3.80% AP performance gains\nwith significantly reduced computational complexity. The corresponding error\nannotations, code and models will be released at\nhttps://github.com/wzjialang/SEDMamba.\n","authors":["Jialang Xu","Nazir Sirajudeen","Matthew Boal","Nader Francis","Danail Stoyanov","Evangelos Mazomenos"],"pdf_url":"https://arxiv.org/pdf/2406.15920v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.17937v2","updated":"2024-09-02T20:58:43Z","published":"2024-03-26T17:59:58Z","title":"Efficient Video Object Segmentation via Modulated Cross-Attention Memory","summary":" Recently, transformer-based approaches have shown promising results for\nsemi-supervised video object segmentation. However, these approaches typically\nstruggle on long videos due to increased GPU memory demands, as they frequently\nexpand the memory bank every few frames. We propose a transformer-based\napproach, named MAVOS, that introduces an optimized and dynamic long-term\nmodulated cross-attention (MCA) memory to model temporal smoothness without\nrequiring frequent memory expansion. The proposed MCA effectively encodes both\nlocal and global features at various levels of granularity while efficiently\nmaintaining consistent speed regardless of the video length. Extensive\nexperiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017,\ndemonstrate the effectiveness of our proposed contributions leading to\nreal-time inference and markedly reduced memory demands without any degradation\nin segmentation accuracy on long videos. Compared to the best existing\ntransformer-based approach, our MAVOS increases the speed by 7.6x, while\nsignificantly reducing the GPU memory by 87% with comparable segmentation\nperformance on short and long video datasets. Notably on the LVOS dataset, our\nMAVOS achieves a J&F score of 63.3% while operating at 37 frames per second\n(FPS) on a single V100 GPU. 
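Returning to the SEDMamba abstract above, its fine-to-coarse temporal fusion is described as merging temporal information with multiple dilated 1D convolutions; a minimal PyTorch sketch of that general pattern follows. The channel count, kernel size, dilation rates, and summation-based fusion are illustrative assumptions, not the paper's exact module.

```python
import torch
import torch.nn as nn

class DilatedTemporalFusion(nn.Module):
    """Parallel dilated 1D convolutions over a feature sequence, summed so that
    short- and long-range temporal context mix (illustrative sketch)."""
    def __init__(self, channels=64, dilations=(1, 2, 4, 8)):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=3, padding=d, dilation=d)
            for d in dilations
        ])

    def forward(self, x):                      # x: (batch, channels, time)
        return sum(branch(x) for branch in self.branches)

# usage on a dummy feature sequence
fused = DilatedTemporalFusion()(torch.randn(1, 64, 250))
print(fused.shape)  # torch.Size([1, 64, 250])
```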
Our code and models will be publicly available at:\nhttps://github.com/Amshaker/MAVOS.\n","authors":["Abdelrahman Shaker","Syed Talal Wasim","Martin Danelljan","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17937v2.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2408.17095v2","updated":"2024-09-02T20:33:49Z","published":"2024-08-30T08:26:55Z","title":"RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation\n and Retrieval-Guidance","summary":" Diffusion-based models demonstrate impressive generation capabilities.\nHowever, they also have a massive number of parameters, resulting in enormous\nmodel sizes, thus making them unsuitable for deployment on resource-constraint\ndevices. Block-wise generation can be a promising alternative for designing\ncompact-sized (parameter-efficient) deep generative models since the model can\ngenerate one block at a time instead of generating the whole image at once.\nHowever, block-wise generation is also considerably challenging because\nensuring coherence across generated blocks can be non-trivial. To this end, we\ndesign a retrieval-augmented generation (RAG) approach and leverage the\ncorresponding blocks of the images retrieved by the RAG module to condition the\ntraining and generation stages of a block-wise denoising diffusion model. Our\nconditioning schemes ensure coherence across the different blocks during\ntraining and, consequently, during generation. While we showcase our approach\nusing the latent diffusion model (LDM) as the base model, it can be used with\nother variants of denoising diffusion models. We validate the solution of the\ncoherence problem through the proposed approach by reporting substantive\nexperiments to demonstrate our approach's effectiveness in compact model size\nand excellent generation quality.\n","authors":["Avideep Mukherjee","Soumya Banerjee","Piyush Rai","Vinay P. Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2408.17095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05303v4","updated":"2024-09-02T19:49:06Z","published":"2023-08-10T02:47:36Z","title":"Multi-Visual-Inertial System: Analysis, Calibration and Estimation","summary":" In this paper, we study state estimation of multi-visual-inertial systems\n(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary\nnumber of asynchronous inertial measurement units (IMUs) or gyroscopes and\nglobal and(or) rolling shutter cameras. We are especially interested in the\nfull calibration of the associated visual-inertial sensors, including the IMU\nor camera intrinsics and the IMU-IMU(or camera) spatiotemporal extrinsics as\nwell as the image readout time of rolling-shutter cameras (if used). To this\nend, we develop a new analytic combined IMU integration with intrinsics-termed\nACI3-to preintegrate IMU measurements, which is leveraged to fuse auxiliary\nIMUs and(or) gyroscopes alongside a base IMU. We model the multi-inertial\nmeasurements to include all the necessary inertial intrinsic and IMU-IMU\nspatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body\nconstraints to eliminate the necessity of auxiliary inertial poses and thus\nreducing computational complexity. By performing observability analysis of\nMVIS, we prove that the standard four unobservable directions remain - no\nmatter how many inertial sensors are used, and also identify, for the first\ntime, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary\ninertial intrinsics. 
In addition to the extensive simulations that validate our\nanalysis and algorithms, we have built our own MVIS sensor rig and collected\nover 25 real-world datasets to experimentally verify the proposed calibration\nagainst the state-of-the-art calibration method such as Kalibr. We show that\nthe proposed MVIS calibration is able to achieve competing accuracy with\nimproved convergence and repeatability, which is open sourced to better benefit\nthe community.\n","authors":["Yulin Yang","Patrick Geneva","Guoquan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.05303v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08486v2","updated":"2024-09-02T19:04:57Z","published":"2024-06-12T17:59:42Z","title":"On Evaluating Adversarial Robustness of Volumetric Medical Segmentation\n Models","summary":" Volumetric medical segmentation models have achieved significant success on\norgan and tumor-based segmentation tasks in recent years. However, their\nvulnerability to adversarial attacks remains largely unexplored, raising\nserious concerns regarding the real-world deployment of tools employing such\nmodels in the healthcare sector. This underscores the importance of\ninvestigating the robustness of existing models. In this context, our work aims\nto empirically examine the adversarial robustness across current volumetric\nsegmentation architectures, encompassing Convolutional, Transformer, and\nMamba-based models. We extend this investigation across four volumetric\nsegmentation datasets, evaluating robustness under both white box and black box\nadversarial attacks. Overall, we observe that while both pixel and\nfrequency-based attacks perform reasonably well under \\emph{white box} setting,\nthe latter performs significantly better under transfer-based black box\nattacks. Across our experiments, we observe transformer-based models show\nhigher robustness than convolution-based models with Mamba-based models being\nthe most vulnerable. Additionally, we show that large-scale training of\nvolumetric segmentation models improves the model's robustness against\nadversarial attacks. The code and robust models are available at\nhttps://github.com/HashmatShadab/Robustness-of-Volumetric-Medical-Segmentation-Models.\n","authors":["Hashmat Shadab Malik","Numan Saeed","Asif Hanif","Muzammal Naseer","Mohammad Yaqub","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2406.08486v2.pdf","comment":"Accepted at British Machine Vision Conference 2024"},{"id":"http://arxiv.org/abs/2408.13423v3","updated":"2024-09-02T18:02:03Z","published":"2024-08-24T01:33:28Z","title":"Training-free Long Video Generation with Chain of Diffusion Model\n Experts","summary":" Video generation models hold substantial potential in areas such as\nfilmmaking. However, current video diffusion models need high computational\ncosts and produce suboptimal results due to high complexity of video generation\ntask. In this paper, we propose \\textbf{ConFiner}, an efficient high-quality\nvideo generation framework that decouples video generation into easier\nsubtasks: structure \\textbf{con}trol and spatial-temporal re\\textbf{fine}ment.\nIt can generate high-quality videos with chain of off-the-shelf diffusion model\nexperts, each expert responsible for a decoupled subtask. During the\nrefinement, we introduce coordinated denoising, which can merge multiple\ndiffusion experts' capabilities into a single sampling. 
Furthermore, we design\nConFiner-Long framework, which can generate long coherent video with three\nconstraint strategies on ConFiner. Experimental results indicate that with only\n10\\% of the inference cost, our ConFiner surpasses representative models like\nLavie and Modelscope across all objective and subjective metrics. And\nConFiner-Long can generate high-quality and coherent videos with up to 600\nframes.\n","authors":["Wenhao Li","Yichao Cao","Xiu Su","Xi Lin","Shan You","Mingkai Zheng","Yi Chen","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.13423v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17346v2","updated":"2024-09-02T17:30:45Z","published":"2024-03-26T03:10:45Z","title":"TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos","summary":" We propose TRAM, a two-stage method to reconstruct a human's global\ntrajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover\nthe camera motion in the presence of dynamic humans and uses the scene\nbackground to derive the motion scale. Using the recovered camera as a\nmetric-scale reference frame, we introduce a video transformer model (VIMO) to\nregress the kinematic body motion of a human. By composing the two motions, we\nachieve accurate recovery of 3D humans in the world space, reducing global\nmotion errors by a large margin from prior work.\nhttps://yufu-wang.github.io/tram4d/\n","authors":["Yufu Wang","Ziyun Wang","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2403.17346v2.pdf","comment":"The project website: https://yufu-wang.github.io/tram4d/"},{"id":"http://arxiv.org/abs/2312.10108v2","updated":"2024-09-02T17:00:21Z","published":"2023-12-15T06:30:55Z","title":"Privacy-Aware Document Visual Question Answering","summary":" Document Visual Question Answering (DocVQA) has quickly grown into a central\ntask of document understanding. But despite the fact that documents contain\nsensitive or copyrighted information, none of the current DocVQA methods offers\nstrong privacy guarantees. In this work, we explore privacy in the domain of\nDocVQA for the first time, highlighting privacy issues in state of the art\nmulti-modal LLM models used for DocVQA, and explore possible solutions.\nSpecifically, we focus on invoice processing as a realistic document\nunderstanding scenario, and propose a large scale DocVQA dataset comprising\ninvoice documents and associated questions and answers. We employ a federated\nlearning scheme, that reflects the real-life distribution of documents in\ndifferent businesses, and we explore the use case where the data of the invoice\nprovider is the sensitive information to be protected. We demonstrate that\nnon-private models tend to memorise, a behaviour that can lead to exposing\nprivate information. We then evaluate baseline training schemes employing\nfederated learning and differential privacy in this multi-modal scenario, where\nthe sensitive information might be exposed through either or both of the two\ninput modalities: vision (document image) or language (OCR tokens). 
Finally, we\ndesign attacks exploiting the memorisation effect of the model, and demonstrate\ntheir effectiveness in probing a representative DocVQA models.\n","authors":["Rubèn Tito","Khanh Nguyen","Marlon Tobaben","Raouf Kerkouche","Mohamed Ali Souibgui","Kangsoo Jung","Joonas Jälkö","Vincent Poulain D'Andecy","Aurelie Joseph","Lei Kang","Ernest Valveny","Antti Honkela","Mario Fritz","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2312.10108v2.pdf","comment":"35 pages, 12 figures, accepted for publication at the 18th\n International Conference on Document Analysis and Recognition, ICDAR 2024"},{"id":"http://arxiv.org/abs/2408.16154v2","updated":"2024-09-02T16:58:16Z","published":"2024-08-28T22:14:44Z","title":"Does Data-Efficient Generalization Exacerbate Bias in Foundation Models?","summary":" Foundation models have emerged as robust models with label efficiency in\ndiverse domains. In medical imaging, these models contribute to the advancement\nof medical diagnoses due to the difficulty in obtaining labeled data. However,\nit is unclear whether using a large amount of unlabeled data, biased by the\npresence of sensitive attributes during pre-training, influences the fairness\nof the model. This research examines the bias in the Foundation model\n(RetFound) when it is applied to fine-tune the Brazilian Multilabel\nOphthalmological Dataset (BRSET), which has a different population than the\npre-training dataset. The model evaluation, in comparison with supervised\nlearning, shows that the Foundation Model has the potential to reduce the gap\nbetween the maximum AUC and minimum AUC evaluations across gender and age\ngroups. However, in a data-efficient generalization, the model increases the\nbias when the data amount decreases. These findings suggest that when deploying\na Foundation Model in real-life scenarios with limited data, the possibility of\nfairness issues should be considered.\n","authors":["Dilermando Queiroz","Anderson Carlos","Maíra Fatoretto","Luis Filipe Nakayama","André Anjos","Lilian Berton"],"pdf_url":"https://arxiv.org/pdf/2408.16154v2.pdf","comment":"Preprint of paper to be presented at Fairness and Ethics Towards\n Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during\n ECCV 2024"},{"id":"http://arxiv.org/abs/2408.17347v2","updated":"2024-09-02T16:08:32Z","published":"2024-08-30T15:22:13Z","title":"LSMS: Language-guided Scale-aware MedSegmentor for Medical Image\n Referring Segmentation","summary":" Conventional medical image segmentation methods have been found inadequate in\nfacilitating physicians with the identification of specific lesions for\ndiagnosis and treatment. Given the utility of text as an instructional format,\nwe introduce a novel task termed Medical Image Referring Segmentation (MIRS),\nwhich requires segmenting specified lesions in images based on the given\nlanguage expressions. Due to the varying object scales in medical images, MIRS\ndemands robust vision-language modeling and comprehensive multi-scale\ninteraction for precise localization and segmentation under linguistic\nguidance. However, existing medical image segmentation methods fall short in\nmeeting these demands, resulting in insufficient segmentation accuracy. 
In\nresponse, we propose an approach named Language-guided Scale-aware MedSegmentor\n(LSMS), incorporating two appealing designs: (1)~a Scale-aware Vision-Language\nAttention module that leverages diverse convolutional kernels to acquire rich\nvisual knowledge and interact closely with linguistic features, thereby\nenhancing lesion localization capability; (2)~a Full-Scale Decoder that\nglobally models multi-modal features across various scales, capturing\ncomplementary information between scales to accurately outline lesion\nboundaries. Addressing the lack of suitable datasets for MIRS, we constructed a\nvision-language medical dataset called Reference Hepatic Lesion Segmentation\n(RefHL-Seg). This dataset comprises 2,283 abdominal CT slices from 231 cases,\nwith corresponding textual annotations and segmentation masks for various liver\nlesions in images. We validated the performance of LSMS for MIRS and\nconventional medical image segmentation tasks across various datasets. Our LSMS\nconsistently outperforms on all datasets with lower computational costs. The\ncode and datasets will be released.\n","authors":["Shuyi Ouyang","Jinyang Zhang","Xiangye Lin","Xilai Wang","Qingqing Chen","Yen-Wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2408.17347v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.05873v7","updated":"2024-09-02T15:46:13Z","published":"2023-10-09T17:13:10Z","title":"Implicit Concept Removal of Diffusion Models","summary":" Text-to-image (T2I) diffusion models often inadvertently generate unwanted\nconcepts such as watermarks and unsafe images. These concepts, termed as the\n\"implicit concepts\", could be unintentionally learned during training and then\nbe generated uncontrollably during inference. Existing removal methods still\nstruggle to eliminate implicit concepts primarily due to their dependency on\nthe model's ability to recognize concepts it actually can not discern. To\naddress this, we utilize the intrinsic geometric characteristics of implicit\nconcepts and present the Geom-Erasing, a novel concept removal method based on\nthe geometric-driven control. Specifically, once an unwanted implicit concept\nis identified, we integrate the existence and geometric information of the\nconcept into the text prompts with the help of an accessible classifier or\ndetector model. Subsequently, the model is optimized to identify and\ndisentangle this information, which is then adopted as negative prompts during\ngeneration. Moreover, we introduce the Implicit Concept Dataset (ICD), a novel\nimage-text dataset imbued with three typical implicit concepts (i.e., QR codes,\nwatermarks, and text), reflecting real-life situations where implicit concepts\nare easily injected. Geom-Erasing effectively mitigates the generation of\nimplicit concepts, achieving the state-of-the-art results on the Inappropriate\nImage Prompts (I2P) and our challenging Implicit Concept Dataset (ICD)\nbenchmarks.\n","authors":["Zhili Liu","Kai Chen","Yifan Zhang","Jianhua Han","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung","James Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.05873v7.pdf","comment":"Accepted by ECCV2024. 
Project Page:\n https://kaichen1998.github.io/projects/geom-erasing/"},{"id":"http://arxiv.org/abs/2405.00145v2","updated":"2024-09-02T14:24:55Z","published":"2024-04-30T18:42:18Z","title":"GUing: A Mobile GUI Search Engine using a Vision-Language Model","summary":" App developers use the Graphical User Interface (GUI) of other apps as a\nsource of inspiration for designing and improving their own apps. Recent\nresearch has thus suggested retrieving relevant GUI designs that match a\ncertain text query from screenshot datasets acquired through crowdsourced or\nautomated exploration of GUIs. However, such text-to-GUI retrieval approaches\nonly leverage the textual information of the GUI elements, neglecting visual\ninformation such as icons or background images. In addition, retrieved\nscreenshots are not steered by app developers and often lack important app\nfeatures that require particular input data.\n To overcome these limitations, this paper proposes GUing, a GUI search engine\nbased on a vision-language model called GUIClip, which we trained specifically\nfor the problem of designing app GUIs. For this, we first collected from Google\nPlay app introduction images which usually display the most representative\nscreenshots and are often captioned (i.e.~labeled) by app vendors. Then, we\ndeveloped an automated pipeline to classify, crop, and extract the captions\nfrom these images. This resulted in a large dataset which we share with this\npaper: including 303k app screenshots, out of which 135k have captions. We used\nthis dataset to train a novel vision-language model, which is, to the best of\nour knowledge, the first of its kind in GUI retrieval. We evaluated our\napproach on various datasets from related work and in manual experiment. The\nresults demonstrate that our model outperforms previous approaches in\ntext-to-GUI retrieval achieving a Recall@10 of up to 0.69 and a HIT@10 of 0.91.\nWe also explored the performance of GUIClip for other GUI tasks including GUI\nclassification and sketch-to-GUI retrieval with encouraging results.\n","authors":["Jialiang Wei","Anne-Lise Courbis","Thomas Lambolais","Binbin Xu","Pierre Louis Bernard","Gérard Dray","Walid Maalej"],"pdf_url":"https://arxiv.org/pdf/2405.00145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15365v4","updated":"2024-09-02T14:07:08Z","published":"2024-01-27T09:54:16Z","title":"An open dataset for oracle bone script recognition and decipherment","summary":" Oracle bone script, one of the earliest known forms of ancient Chinese\nwriting, presents invaluable research materials for scholars studying the\nhumanities and geography of the Shang Dynasty, dating back 3,000 years. The\nimmense historical and cultural significance of these writings cannot be\noverstated. However, the passage of time has obscured much of their meaning,\npresenting a significant challenge in deciphering these ancient texts. With the\nadvent of Artificial Intelligence (AI), employing AI to assist in deciphering\nOracle Bone Characters (OBCs) has become a feasible option. Yet, progress in\nthis area has been hindered by a lack of high-quality datasets. To address this\nissue, this paper details the creation of the HUST-OBC dataset. This dataset\nencompasses 77,064 images of 1,588 individual deciphered characters and 62,989\nimages of 9,411 undeciphered characters, with a total of 140,053 images,\ncompiled from diverse sources. The hope is that this dataset could inspire and\nassist future research in deciphering those unknown OBCs. 
All the codes and\ndatasets are available at https://github.com/Yuliang-Liu/Open-Oracle.\n","authors":["Pengjie Wang","Kaile Zhang","Xinyu Wang","Shengwei Han","Yongge Liu","Jinpeng Wan","Haisu Guan","Zhebin Kuang","Lianwen Jin","Xiang Bai","Yuliang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.15365v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05867v2","updated":"2024-09-02T13:39:30Z","published":"2024-08-11T21:59:34Z","title":"SABER-6D: Shape Representation Based Implicit Object Pose Estimation","summary":" In this paper, we propose a novel encoder-decoder architecture, named SABER,\nto learn the 6D pose of the object in the embedding space by learning shape\nrepresentation at a given pose. This model enables us to learn pose by\nperforming shape representation at a target pose from RGB image input. We\nperform shape representation as an auxiliary task which helps us in learning\nrotations space for an object based on 2D images. An image encoder predicts the\nrotation in the embedding space and the DeepSDF based decoder learns to\nrepresent the object's shape at the given pose. As our approach is shape based,\nthe pipeline is suitable for any type of object irrespective of the symmetry.\nMoreover, we need only a CAD model of the objects to train SABER. Our pipeline\nis synthetic data based and can also handle symmetric objects without symmetry\nlabels and, thus, no additional labeled training data is needed. The\nexperimental evaluation shows that our method achieves close to benchmark\nresults for both symmetric objects and asymmetric objects on Occlusion-LineMOD,\nand T-LESS datasets.\n","authors":["Shishir Reddy Vutukur","Mengkejiergeli Ba","Benjamin Busam","Matthias Kayser","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.05867v2.pdf","comment":"ECCV 2024 R6D workshop"},{"id":"http://arxiv.org/abs/2408.16481v2","updated":"2024-09-02T13:12:23Z","published":"2024-08-29T12:16:55Z","title":"A Deep-Learning-Based Label-free No-Reference Image Quality Assessment\n Metric: Application in Sodium MRI Denoising","summary":" New multinuclear MRI techniques, such as sodium MRI, generally suffer from\nlow image quality due to an inherently low signal. Postprocessing methods, such\nas image denoising, have been developed for image enhancement. However, the\nassessment of these enhanced images is challenging especially considering when\nthere is a lack of high resolution and high signal images as reference, such as\nin sodium MRI. No-reference Image Quality Assessment (NR-IQA) metrics are\napproaches to solve this problem. Existing learning-based NR-IQA metrics rely\non labels derived from subjective human opinions or metrics like\nSignal-to-Noise Ratio (SNR), which are either time-consuming or lack accurate\nground truths, resulting in unreliable assessment. We note that deep learning\n(DL) models have a unique characteristic in that they are specialized to a\ncharacteristic training set, meaning that deviations between the input testing\ndata from the training data will reduce prediction accuracy. Therefore, we\npropose a novel DL-based NR-IQA metric, the Model Specialization Metric (MSM),\nwhich does not depend on ground-truth images or labels. MSM measures the\ndifference between the input image and the model's prediction for evaluating\nthe quality of the input image. 
Experiments conducted on both simulated\ndistorted proton T1-weighted MR images and denoised sodium MR images\ndemonstrate that MSM exhibits a superior evaluation performance on various\nsimulated noises and distortions. MSM also has a substantial agreement with the\nexpert evaluations, achieving an averaged Cohen's Kappa coefficient of 0.6528,\noutperforming the existing NR-IQA metrics.\n","authors":["Shuaiyu Yuan","Tristan Whitmarsh","Dimitri A Kessler","Otso Arponen","Mary A McLean","Gabrielle Baxter","Frank Riemer","Aneurin J Kennerley","William J Brackenbury","Fiona J Gilbert","Joshua D Kaggie"],"pdf_url":"https://arxiv.org/pdf/2408.16481v2.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.14966v2","updated":"2024-09-02T12:55:04Z","published":"2024-04-23T12:20:27Z","title":"Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State\n Space Model","summary":" Existing Transformer-based models for point cloud analysis suffer from\nquadratic complexity, leading to compromised point cloud resolution and\ninformation loss. In contrast, the newly proposed Mamba model, based on state\nspace models (SSM), outperforms Transformer in multiple areas with only linear\ncomplexity. However, the straightforward adoption of Mamba does not achieve\nsatisfactory performance on point cloud tasks. In this work, we present\nMamba3D, a state space model tailored for point cloud learning to enhance local\nfeature extraction, achieving superior performance, high efficiency, and\nscalability potential. Specifically, we propose a simple yet effective Local\nNorm Pooling (LNP) block to extract local geometric features. Additionally, to\nobtain better global features, we introduce a bidirectional SSM (bi-SSM) with\nboth a token forward SSM and a novel backward SSM that operates on the feature\nchannel. Extensive experimental results show that Mamba3D surpasses\nTransformer-based counterparts and concurrent works in multiple tasks, with or\nwithout pre-training. Notably, Mamba3D achieves multiple SoTA, including an\noverall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1%\n(with single-modal pre-training) on the ModelNet40 classification task, with\nonly linear complexity. Our code and weights are available at\nhttps://github.com/xhanxu/Mamba3D.\n","authors":["Xu Han","Yuan Tang","Zhaoxuan Wang","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2404.14966v2.pdf","comment":"ACM MM 2024. Code and weights are available at\n https://github.com/xhanxu/Mamba3D"},{"id":"http://arxiv.org/abs/2408.16845v2","updated":"2024-09-02T10:33:48Z","published":"2024-08-29T18:21:50Z","title":"Enabling Local Editing in Diffusion Models by Joint and Individual\n Component Analysis","summary":" Recent advances in Diffusion Models (DMs) have led to significant progress in\nvisual synthesis and editing tasks, establishing them as a strong competitor to\nGenerative Adversarial Networks (GANs). However, the latent space of DMs is not\nas well understood as that of GANs. Recent research has focused on unsupervised\nsemantic discovery in the latent space of DMs by leveraging the bottleneck\nlayer of the denoising network, which has been shown to exhibit properties of a\nsemantic latent space. However, these approaches are limited to discovering\nglobal attributes. In this paper we address, the challenge of local image\nmanipulation in DMs and introduce an unsupervised method to factorize the\nlatent semantics learned by the denoising network of pre-trained DMs. 
Given an\narbitrary image and defined regions of interest, we utilize the Jacobian of the\ndenoising network to establish a relation between the regions of interest and\ntheir corresponding subspaces in the latent space. Furthermore, we disentangle\nthe joint and individual components of these subspaces to identify latent\ndirections that enable local image manipulation. Once discovered, these\ndirections can be applied to different images to produce semantically\nconsistent edits, making our method suitable for practical applications.\nExperimental results on various datasets demonstrate that our method can\nproduce semantic edits that are more localized and have better fidelity\ncompared to the state-of-the-art.\n","authors":["Theodoros Kouzelis","Manos Plitsis","Mihalis A. Nicolaou","Yannis Panagakis"],"pdf_url":"https://arxiv.org/pdf/2408.16845v2.pdf","comment":"Accepted at BMVC2024"},{"id":"http://arxiv.org/abs/2302.03531v2","updated":"2024-09-02T10:24:46Z","published":"2023-02-07T15:23:52Z","title":"Structured Generative Models for Scene Understanding","summary":" This position paper argues for the use of \\emph{structured generative models}\n(SGMs) for the understanding of static scenes. This requires the reconstruction\nof a 3D scene from an input image (or a set of multi-view images), whereby the\ncontents of the image(s) are causally explained in terms of models of\ninstantiated objects, each with their own type, shape, appearance and pose,\nalong with global variables like scene lighting and camera parameters. This\napproach also requires scene models which account for the co-occurrences and\ninter-relationships of objects in a scene. The SGM approach has the merits that\nit is compositional and generative, which lead to interpretability and\neditability. \\\\\\\\ To pursue the SGM agenda, we need models for objects and\nscenes, and approaches to carry out inference. We first review models for\nobjects, which include ``things'' (object categories that have a well defined\nshape), and ``stuff'' (categories which have amorphous spatial extent). We then\nmove on to review \\emph{scene models} which describe the inter-relationships of\nobjects. Perhaps the most challenging problem for SGMs is \\emph{inference} of\nthe objects, lighting and camera parameters, and scene inter-relationships from\ninput consisting of a single or multiple images. We conclude with a discussion\nof issues that need addressing to advance the SGM agenda.\n","authors":["Christopher K. I. Williams"],"pdf_url":"https://arxiv.org/pdf/2302.03531v2.pdf","comment":"32 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.06198v2","updated":"2024-09-02T09:28:09Z","published":"2024-05-10T02:26:35Z","title":"MAPL: Memory Augmentation and Pseudo-Labeling for Semi-Supervised\n Anomaly Detection","summary":" Large unlabeled data and difficult-to-identify anomalies are the urgent\nissues that need to be overcome in most industrial scenes. To address this\nissue, a new methodology for detecting surface defects in industrial settings\nis introduced, referred to as Memory Augmentation and Pseudo-Labeling (MAPL).\nThe methodology first introduces an anomaly simulation strategy, which\nsignificantly improves the model's ability to recognize rare or unknown\nanomaly types by generating simulated anomaly samples.
To cope with the\nproblem of the lack of labeling of anomalous simulated samples, a\npseudo-labeler method based on a one-classifier ensemble was employed in this\nstudy, which enhances the robustness of the model in the case of limited\nlabeling data by automatically selecting key pseudo-labeling hyperparameters.\nMeanwhile, a memory-enhanced learning mechanism is introduced to effectively\npredict abnormal regions by analyzing the difference between the input samples\nand the normal samples in the memory pool. An end-to-end learning framework is\nemployed by MAPL to identify the abnormal regions directly from the input data,\nwhich optimizes the efficiency and real-time performance of detection. By\nconducting extensive trials on the recently developed BHAD dataset (including\nMVTec AD [1], Visa [2], and MDPP [3]), MAPL achieves an average image-level\nAUROC score of 86.2%, demonstrating a 5.1% enhancement compared to the\noriginal MemSeg [4] model. The source code is available at\nhttps://github.com/jzc777/MAPL.\n","authors":["Junzhuo Chen"],"pdf_url":"https://arxiv.org/pdf/2405.06198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11593v2","updated":"2024-09-02T09:04:51Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines.
The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v2.pdf","comment":"Accepted by NCMMSC2024"},{"id":"http://arxiv.org/abs/2406.01154v3","updated":"2024-09-02T08:52:19Z","published":"2024-06-03T09:49:54Z","title":"UniUSNet: A Promptable Framework for Universal Ultrasound Disease\n Prediction and Tissue Segmentation","summary":" Ultrasound is widely used in clinical practice due to its affordability,\nportability, and safety. However, current AI research often overlooks combined\ndisease prediction and tissue segmentation. We propose UniUSNet, a universal\nframework for ultrasound image classification and segmentation. This model\nhandles various ultrasound types, anatomical positions, and input formats,\nexcelling in both segmentation and classification tasks. Trained on a\ncomprehensive dataset with over 9.7K annotations from 7 distinct anatomical\npositions, our model matches state-of-the-art performance and surpasses\nsingle-dataset and ablated models. Zero-shot and fine-tuning experiments show\nstrong generalization and adaptability with minimal fine-tuning. We plan to\nexpand our dataset and refine the prompting mechanism, with model weights and\ncode available at (https://github.com/Zehui-Lin/UniUSNet).\n","authors":["Zehui Lin","Zhuoneng Zhang","Xindi Hu","Zhifan Gao","Xin Yang","Yue Sun","Dong Ni","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2406.01154v3.pdf","comment":"Accepted to BIBM 2024"},{"id":"http://arxiv.org/abs/2408.04914v2","updated":"2024-09-02T08:45:47Z","published":"2024-08-09T07:46:01Z","title":"GuidedNet: Semi-Supervised Multi-Organ Segmentation via Labeled Data\n Guide Unlabeled Data","summary":" Semi-supervised multi-organ medical image segmentation aids physicians in\nimproving disease diagnosis and treatment planning and reduces the time and\neffort required for organ annotation. Existing state-of-the-art methods train\nthe labeled data with ground truths and train the unlabeled data with\npseudo-labels. However, the two training flows are separate, which does not\nreflect the interrelationship between labeled and unlabeled data. To address\nthis issue, we propose a semi-supervised multi-organ segmentation method called\nGuidedNet, which leverages the knowledge from labeled data to guide the\ntraining of unlabeled data. The primary goals of this study are to improve the\nquality of pseudo-labels for unlabeled data and to enhance the network's\nlearning capability for both small and complex organs. A key concept is that\nvoxel features from labeled and unlabeled data that are close to each other in\nthe feature space are more likely to belong to the same class. On this basis, a\n3D Consistent Gaussian Mixture Model (3D-CGMM) is designed to leverage the\nfeature distributions from labeled data to rectify the generated\npseudo-labels. Furthermore, we introduce a Knowledge Transfer Cross Pseudo\nSupervision (KT-CPS) strategy, which leverages the prior knowledge obtained\nfrom the labeled data to guide the training of the unlabeled data, thereby\nimproving the segmentation accuracy for both small and complex organs.\nExtensive experiments on two public datasets, FLARE22 and AMOS, demonstrated\nthat GuidedNet is capable of achieving state-of-the-art performance.
The source\ncode with our proposed model are available at\nhttps://github.com/kimjisoo12/GuidedNet.\n","authors":["Haochen Zhao","Hui Meng","Deqian Yang","Xiaozheng Xie","Xiaoze Wu","Qingfeng Li","Jianwei Niu"],"pdf_url":"https://arxiv.org/pdf/2408.04914v2.pdf","comment":"Accepted by ACM MM2024, 10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.14491v2","updated":"2024-09-02T08:40:11Z","published":"2024-07-19T17:44:33Z","title":"PD-APE: A Parallel Decoding Framework with Adaptive Position Encoding\n for 3D Visual Grounding","summary":" 3D visual grounding aims to identify objects in 3D point cloud scenes that\nmatch specific natural language descriptions. This requires the model to not\nonly focus on the target object itself but also to consider the surrounding\nenvironment to determine whether the descriptions are met. Most previous works\nattempt to accomplish both tasks within the same module, which can easily lead\nto a distraction of attention. To this end, we propose PD-APE, a dual-branch\ndecoding framework that separately decodes target object attributes and\nsurrounding layouts. Specifically, in the target object branch, the decoder\nprocesses text tokens that describe features of the target object (e.g.,\ncategory and color), guiding the queries to pay attention to the target object\nitself. In the surrounding branch, the queries align with other text tokens\nthat carry surrounding environment information, making the attention maps\naccurately capture the layout described in the text. Benefiting from the\nproposed dual-branch design, the queries are allowed to focus on points\nrelevant to each branch's specific objective. Moreover, we design an adaptive\nposition encoding method for each branch respectively. In the target object\nbranch, the position encoding relies on the relative positions between seed\npoints and predicted 3D boxes. In the surrounding branch, the attention map is\nadditionally guided by the confidence between visual and text features,\nenabling the queries to focus on points that have valuable layout information.\nExtensive experiments demonstrate that we surpass the state-of-the-art on two\nwidely adopted 3D visual grounding datasets, ScanRefer and Nr3D.\n","authors":["Chenshu Hou","Liang Peng","Xiaopei Wu","Xiaofei He","Wenxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2407.14491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11092v3","updated":"2024-09-02T08:37:29Z","published":"2023-10-17T09:21:29Z","title":"DORec: Decomposed Object Reconstruction and Segmentation Utilizing 2D\n Self-Supervised Features","summary":" Recovering 3D geometry and textures of individual objects is crucial for many\nrobotics applications, such as manipulation, pose estimation, and autonomous\ndriving. However, decomposing a target object from a complex background is\nchallenging. Most existing approaches rely on costly manual labels to acquire\nobject instance perception. Recent advancements in 2D self-supervised learning\noffer new prospects for identifying objects of interest, yet leveraging such\nnoisy 2D features for clean decomposition remains difficult. In this paper, we\npropose a Decomposed Object Reconstruction (DORec) network based on neural\nimplicit representations. Our key idea is to use 2D self-supervised features to\ncreate two levels of masks for supervision: a binary mask for foreground\nregions and a K-cluster mask for semantically similar regions. These\ncomplementary masks result in robust decomposition. 
Experimental results on\ndifferent datasets show DORec's superiority in segmenting and reconstructing\ndiverse foreground objects from varied backgrounds enabling downstream tasks\nsuch as pose estimation.\n","authors":["Jun Wu","Sicheng Li","Sihui Ji","Yifei Yang","Yue Wang","Rong Xiong","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2310.11092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03517v3","updated":"2024-09-02T08:30:37Z","published":"2023-12-06T14:24:26Z","title":"FRDiff : Feature Reuse for Universal Training-free Acceleration of\n Diffusion Models","summary":" The substantial computational costs of diffusion models, especially due to\nthe repeated denoising steps necessary for high-quality image generation,\npresent a major obstacle to their widespread adoption. While several studies\nhave attempted to address this issue by reducing the number of score function\nevaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased\nnumber of denoising iterations misses the opportunity to update fine details,\nresulting in noticeable quality degradation. In our work, we introduce an\nadvanced acceleration technique that leverages the temporal redundancy inherent\nin diffusion models. Reusing feature maps with high temporal similarity opens\nup a new opportunity to save computation resources without compromising output\nquality. To realize the practical benefits of this intuition, we conduct an\nextensive analysis and propose a novel method, FRDiff. FRDiff is designed to\nharness the advantages of both reduced NFE and feature reuse, achieving a\nPareto frontier that balances fidelity and latency trade-offs in various\ngenerative tasks.\n","authors":["Junhyuk So","Jungwon Lee","Eunhyeok Park"],"pdf_url":"https://arxiv.org/pdf/2312.03517v3.pdf","comment":"Accepted at ECCV 2024. Code :\n https://github.com/ECoLab-POSTECH/FRDiff"},{"id":"http://arxiv.org/abs/2406.05677v2","updated":"2024-09-02T07:32:55Z","published":"2024-06-09T07:22:50Z","title":"Evolution-aware VAriance (EVA) Coreset Selection for Medical Image\n Classification","summary":" In the medical field, managing high-dimensional massive medical imaging data\nand performing reliable medical analysis from it is a critical challenge,\nespecially in resource-limited environments such as remote medical facilities\nand mobile devices. This necessitates effective dataset compression techniques\nto reduce storage, transmission, and computational cost. However, existing\ncoreset selection methods are primarily designed for natural image datasets,\nand exhibit doubtful effectiveness when applied to medical image datasets due\nto challenges such as intra-class variation and inter-class similarity. In this\npaper, we propose a novel coreset selection strategy termed as Evolution-aware\nVAriance (EVA), which captures the evolutionary process of model training\nthrough a dual-window approach and reflects the fluctuation of sample\nimportance more precisely through variance measurement. Extensive experiments\non medical image datasets demonstrate the effectiveness of our strategy over\nprevious SOTA methods, especially at high compression rates. EVA achieves\n98.27% accuracy with only 10% training data, compared to 97.20% for the full\ntraining set. 
None of the compared baseline methods can exceed Random at 5%\nselection rate, while EVA outperforms Random by 5.61%, showcasing its potential\nfor efficient medical image analysis.\n","authors":["Yuxin Hong","Xiao Zhang","Xin Zhang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.05677v2.pdf","comment":"Accepted by ACM Multimedia 2024 (oral), see:\n https://openreview.net/forum?id=m1qrB9KSYD"},{"id":"http://arxiv.org/abs/2405.15434v3","updated":"2024-09-02T07:18:16Z","published":"2024-05-24T11:02:55Z","title":"Biometrics and Behavior Analysis for Detecting Distractions in\n e-Learning","summary":" In this article, we explore computer vision approaches to detect abnormal\nhead pose during e-learning sessions and we introduce a study on the effects of\nmobile phone usage during these sessions. We utilize behavioral data collected\nfrom 120 learners monitored while participating in a MOOC learning sessions.\nOur study focuses on the influence of phone-usage events on behavior and\nphysiological responses, specifically attention, heart rate, and meditation,\nbefore, during, and after phone usage. Additionally, we propose an approach for\nestimating head pose events using images taken by the webcam during the MOOC\nlearning sessions to detect phone-usage events. Our hypothesis suggests that\nhead posture undergoes significant changes when learners interact with a mobile\nphone, contrasting with the typical behavior seen when learners face a computer\nduring e-learning sessions. We propose an approach designed to detect\ndeviations in head posture from the average observed during a learner's\nsession, operating as a semi-supervised method. This system flags events\nindicating alterations in head posture for subsequent human review and\nselection of mobile phone usage occurrences with a sensitivity over 90%.\n","authors":["Álvaro Becerra","Javier Irigoyen","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez","Mutlu Cukurova"],"pdf_url":"https://arxiv.org/pdf/2405.15434v3.pdf","comment":"Published in IEEE Intl. Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2405.20091v4","updated":"2024-09-02T07:15:02Z","published":"2024-05-30T14:27:40Z","title":"VAAD: Visual Attention Analysis Dashboard applied to e-Learning","summary":" In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v4.pdf","comment":"Published in IEEE Intl. 
Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2312.16039v2","updated":"2024-09-02T06:47:08Z","published":"2023-12-26T12:56:31Z","title":"Dual-scale Enhanced and Cross-generative Consistency Learning for\n Semi-supervised Medical Image Segmentation","summary":" Medical image segmentation plays a crucial role in computer-aided diagnosis.\nHowever, existing methods heavily rely on fully supervised training, which\nrequires a large amount of labeled data with time-consuming pixel-wise\nannotations. Moreover, accurately segmenting lesions poses challenges due to\nvariations in shape, size, and location. To address these issues, we propose a\nnovel Dual-scale Enhanced and Cross-generative consistency learning framework\nfor semi-supervised medical image Segmentation (DEC-Seg). First, we propose a\nCross-level Feature Aggregation (CFA) module that integrates cross-level\nadjacent layers to enhance the feature representation ability across different\nresolutions. To address scale variation, we present a scale-enhanced\nconsistency constraint, which ensures consistency in the segmentation maps\ngenerated from the same input image at different scales. This constraint helps\nhandle variations in lesion sizes and improves the robustness of the model.\nFurthermore, we propose a cross-generative consistency scheme, in which the\noriginal and perturbed images can be reconstructed using cross-segmentation\nmaps. This consistency constraint allows us to mine effective feature\nrepresentations and boost the segmentation performance. To further exploit the\nscale information, we propose a Dual-scale Complementary Fusion (DCF) module\nthat integrates features from two scale-specific decoders operating at\ndifferent scales to help produce more accurate segmentation maps. Extensive\nexperimental results on multiple medical segmentation tasks (polyp, skin\nlesion, and brain glioma) demonstrate the effectiveness of our DEC-Seg against\nother state-of-the-art semi-supervised segmentation approaches. The\nimplementation code will be released at https://github.com/taozh2017/DECSeg.\n","authors":["Yunqi Gu","Tao Zhou","Yizhe Zhang","Yi Zhou","Kelei He","Chen Gong","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2312.16039v2.pdf","comment":"12 pages 10 figures"},{"id":"http://arxiv.org/abs/2312.10692v2","updated":"2024-09-02T06:37:04Z","published":"2023-12-17T11:59:14Z","title":"Pedestrian Attribute Recognition via CLIP based Prompt Vision-Language\n Fusion","summary":" Existing pedestrian attribute recognition (PAR) algorithms adopt pre-trained\nCNN (e.g., ResNet) as their backbone network for visual feature learning, which\nmight obtain sub-optimal results due to the insufficient employment of the\nrelations between pedestrian images and attribute labels. In this paper, we\nformulate PAR as a vision-language fusion problem and fully exploit the\nrelations between pedestrian images and attribute labels. Specifically, the\nattribute phrases are first expanded into sentences, and then the pre-trained\nvision-language model CLIP is adopted as our backbone for feature embedding of\nvisual images and attribute descriptions. The contrastive learning objective\nconnects the vision and language modalities well in the CLIP-based feature\nspace, and the Transformer layers used in CLIP can capture the long-range\nrelations between pixels. Then, a multi-modal Transformer is adopted to fuse\nthe dual features effectively and feed-forward network is used to predict\nattributes. 
To optimize our network efficiently, we propose the region-aware\nprompt tuning technique to adjust very few parameters (i.e., only the prompt\nvectors and classification heads) and fix both the pre-trained VL model and\nmulti-modal Transformer. Our proposed PAR algorithm only adjusts 0.75% of the\nlearnable parameters compared with the fine-tuning strategy. It also achieves\nnew state-of-the-art performance on both standard and zero-shot settings for\nPAR, including RAPv1, RAPv2, WIDER, PA100K, and PETA-ZS, RAP-ZS datasets. The\nsource code and pre-trained models will be released on\nhttps://github.com/Event-AHU/OpenPAR.\n","authors":["Xiao Wang","Jiandong Jin","Chenglong Li","Jin Tang","Cheng Zhang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2312.10692v2.pdf","comment":"Accepted by IEEE TCSVT 2024, Camera Ready Version"},{"id":"http://arxiv.org/abs/2408.17011v2","updated":"2024-09-02T06:31:48Z","published":"2024-08-30T04:51:19Z","title":"Disease Classification and Impact of Pretrained Deep Convolution Neural\n Networks on Diverse Medical Imaging Datasets across Imaging Modalities","summary":" Imaging techniques such as Chest X-rays, whole slide images, and optical\ncoherence tomography serve as the initial screening and detection for a wide\nvariety of medical pulmonary and ophthalmic conditions, respectively. This paper\ninvestigates the intricacies of using pretrained deep convolutional neural\nnetworks with transfer learning across diverse medical imaging datasets with\nvarying modalities for binary and multiclass classification. We conducted a\ncomprehensive performance analysis with ten network architectures and model\nfamilies each with pretraining and random initialization. Our findings showed\nthat the use of pretrained models as fixed feature extractors yields poor\nperformance irrespective of the datasets. In contrast, histopathology microscopy\nwhole slide images have better performance. It is also found that deeper and\nmore complex architectures did not necessarily result in the best performance.\nThis observation implies that the improvements on ImageNet are not parallel to\nthe medical imaging tasks. Within a medical domain, the performance of the\nnetwork architectures varies within model families with shifts in datasets.\nThis indicates that the performance of models within a specific modality may\nnot be conclusive for another modality within the same domain. This study\nprovides a deeper understanding of the applications of deep learning techniques\nin medical imaging and highlights the impact of pretrained networks across\ndifferent medical imaging datasets under five different experimental settings.\n","authors":["Jutika Borah","Kumaresh Sarmah","Hidam Kumarjit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.17011v2.pdf","comment":"15 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.17064v2","updated":"2024-09-02T06:25:09Z","published":"2024-08-30T07:49:35Z","title":"Instant Adversarial Purification with Adversarial Consistency\n Distillation","summary":" Neural networks, despite their remarkable performance in widespread\napplications, including image classification, are also known to be vulnerable\nto subtle adversarial noise. Although some diffusion-based purification methods\nhave been proposed, for example, DiffPure, those methods are time-consuming. In\nthis paper, we propose One Step Control Purification (OSCP), a diffusion-based\npurification model that can purify the adversarial image in one Neural Function\nEvaluation (NFE) in diffusion models.
We use Latent Consistency Model (LCM) and\nControlNet for our one-step purification. OSCP is computationally friendly and\ntime efficient compared to other diffusion-based purification methods; we\nachieve defense success rate of 74.19\\% on ImageNet, only requiring 0.1s for\neach purification. Moreover, there is a fundamental incongruence between\nconsistency distillation and adversarial perturbation. To address this\nontological dissonance, we propose Gaussian Adversarial Noise Distillation\n(GAND), a novel consistency distillation framework that facilitates a more\nnuanced reconciliation of the latent space dynamics, effectively bridging the\nnatural and adversarial manifolds. Our experiments show that the GAND does not\nneed a Full Fine Tune (FFT); PEFT, e.g., LoRA is sufficient.\n","authors":["Chun Tong Lei","Hon Ming Yam","Zhongliang Guo","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.17064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03024v3","updated":"2024-09-02T05:51:02Z","published":"2023-08-06T05:23:25Z","title":"Show Me the World in My Language: Establishing the First Baseline for\n Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually'' translating scene text from a\nsource language (e.g., Hindi) to a target language (e.g., English). Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe source scene text, such as font, size, and background. There are several\nchallenges associated with this task, such as translation with limited context,\ndeciding between translation and transliteration, accommodating varying text\nlengths within fixed spatial boundaries, and preserving the font and background\nstyles of the source scene text in the target language. To address this\nproblem, we make the following contributions: (i) We study visual translation\nas a standalone problem for the first time in the literature. (ii) We present a\ncascaded framework for visual translation that combines state-of-the-art\nmodules for scene text recognition, machine translation, and scene text\nsynthesis as a baseline for the task. (iii) We propose a set of task-specific\ndesign enhancements to design a variant of the baseline to obtain performance\nimprovements. (iv) Currently, the existing related literature lacks any\ncomprehensive performance evaluation for this novel task. To fill this gap, we\nintroduce several automatic and user-assisted evaluation metrics designed\nexplicitly for evaluating visual translation. Further, we evaluate presented\nbaselines for translating scene text between Hindi and English. Our experiments\ndemonstrate that although we can effectively perform visual translation over a\nlarge collection of scene text images, the presented baseline only partially\naddresses challenges posed by visual translation tasks. 
We firmly believe that\nthis new task and the limitations of existing models, as reported in this\npaper, should encourage further research in visual translation.\n","authors":["Shreyas Vaidya","Arvind Kumar Sharma","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v3.pdf","comment":"Accepted at ICPR 2024, Project Website:\n https://vl2g.github.io/projects/visTrans/"},{"id":"http://arxiv.org/abs/2403.06764v3","updated":"2024-09-02T05:48:54Z","published":"2024-03-11T14:35:32Z","title":"An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference\n Acceleration for Large Vision-Language Models","summary":" In this study, we identify the inefficient attention phenomena in Large\nVision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,\nQwenVL-Chat and Video-LLaVA. We find out that the attention computation over\nvisual tokens is of extreme inefficiency in the deep layers of popular LVLMs,\nsuggesting a need for a sparser approach compared to textual data handling. To\nthis end, we introduce FastV, a versatile plug-and-play method designed to\noptimize computational efficiency by learning adaptive attention patterns in\nearly layers and pruning visual tokens in subsequent ones. Our evaluations\ndemonstrate FastV's ability to dramatically reduce computational costs (e.g., a\n45 reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a\nwide range of image and video understanding tasks. The computational efficiency\nand performance trade-off of FastV are highly customizable and\npareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve\na lower budget than that of a 7B-parameter model, while still maintaining\nsuperior performance. We believe FastV has practical values for deployment of\nLVLMs in edge devices and commercial models. Code is released at\nhttps://github.com/pkunlp-icler/FastV.\n","authors":["Liang Chen","Haozhe Zhao","Tianyu Liu","Shuai Bai","Junyang Lin","Chang Zhou","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2403.06764v3.pdf","comment":"Accepted to ECCV 2024 (Oral), code is released at\n https://github.com/pkunlp-icler/FastV,"},{"id":"http://arxiv.org/abs/2408.11402v2","updated":"2024-09-02T05:46:43Z","published":"2024-08-21T08:01:00Z","title":"Video Diffusion Models are Strong Video Inpainter","summary":" Propagation-based video inpainting using optical flow at the pixel or feature\nlevel has recently garnered significant attention. However, it has limitations\nsuch as the inaccuracy of optical flow prediction and the propagation of noise\nover time. These issues result in non-uniform noise and time consistency\nproblems throughout the video, which are particularly pronounced when the\nremoved area is large and involves substantial movement. To address these\nissues, we propose a novel First Frame Filling Video Diffusion Inpainting model\n(FFF-VDI). We design FFF-VDI inspired by the capabilities of pre-trained\nimage-to-video diffusion models that can transform the first frame image into a\nhighly natural video. To apply this to the video inpainting task, we propagate\nthe noise latent information of future frames to fill the masked areas of the\nfirst frame's noise latent code. Next, we fine-tune the pre-trained\nimage-to-video diffusion model to generate the inpainted video. The proposed\nmodel addresses the limitations of existing methods that rely on optical flow\nquality, producing much more natural and temporally consistent videos. 
This\nproposed approach is the first to effectively integrate image-to-video\ndiffusion models into video inpainting tasks. Through various comparative\nexperiments, we demonstrate that the proposed model can robustly handle diverse\ninpainting types with high quality.\n","authors":["Minhyeok Lee","Suhwan Cho","Chajin Shin","Jungho Lee","Sunghun Yang","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2408.11402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10901v2","updated":"2024-09-02T05:25:06Z","published":"2024-08-20T14:43:53Z","title":"A Grey-box Attack against Latent Diffusion Model-based Image Editing by\n Posterior Collapse","summary":" Recent advancements in generative AI, particularly Latent Diffusion Models\n(LDMs), have revolutionized image synthesis and manipulation. However, these\ngenerative techniques raises concerns about data misappropriation and\nintellectual property infringement. Adversarial attacks on machine learning\nmodels have been extensively studied, and a well-established body of research\nhas extended these techniques as a benign metric to prevent the underlying\nmisuse of generative AI. Current approaches to safeguarding images from\nmanipulation by LDMs are limited by their reliance on model-specific knowledge\nand their inability to significantly degrade semantic quality of generated\nimages. In response to these shortcomings, we propose the Posterior Collapse\nAttack (PCA) based on the observation that VAEs suffer from posterior collapse\nduring training. Our method minimizes dependence on the white-box information\nof target models to get rid of the implicit reliance on model-specific\nknowledge. By accessing merely a small amount of LDM parameters, in specific\nmerely the VAE encoder of LDMs, our method causes a substantial semantic\ncollapse in generation quality, particularly in perceptual consistency, and\ndemonstrates strong transferability across various model architectures.\nExperimental results show that PCA achieves superior perturbation effects on\nimage generation of LDMs with lower runtime and VRAM. Our method outperforms\nexisting techniques, offering a more robust and generalizable solution that is\nhelpful in alleviating the socio-technical challenges posed by the rapidly\nevolving landscape of generative AI.\n","authors":["Zhongliang Guo","Lei Fang","Jingyu Lin","Yifei Qian","Shuai Zhao","Zeyu Wang","Junhao Dong","Cunjian Chen","Ognjen Arandjelović","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.10901v2.pdf","comment":"21 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2303.03856v3","updated":"2024-09-02T03:56:06Z","published":"2023-03-07T12:48:02Z","title":"Event Voxel Set Transformer for Spatiotemporal Representation Learning\n on Event Streams","summary":" Event cameras are neuromorphic vision sensors that record a scene as sparse\nand asynchronous event streams. Most event-based methods project events into\ndense frames and process them using conventional vision models, resulting in\nhigh computational complexity. A recent trend is to develop point-based\nnetworks that achieve efficient event processing by learning sparse\nrepresentations. However, existing works may lack robust local information\naggregators and effective feature interaction operations, thus limiting their\nmodeling capabilities. To this end, we propose an attention-aware model named\nEvent Voxel Set Transformer (EVSTr) for efficient spatiotemporal representation\nlearning on event streams. 
It first converts the event stream into voxel sets\nand then hierarchically aggregates voxel features to obtain robust\nrepresentations. The core of EVSTr is an event voxel transformer encoder that\nconsists of two well-designed components, including the Multi-Scale Neighbor\nEmbedding Layer (MNEL) for local information aggregation and the Voxel\nSelf-Attention Layer (VSAL) for global feature interaction. Enabling the\nnetwork to incorporate a long-range temporal structure, we introduce a segment\nmodeling strategy (S$^{2}$TM) to learn motion patterns from a sequence of\nsegmented voxel sets. The proposed model is evaluated on two recognition tasks,\nincluding object classification and action recognition. To provide a convincing\nmodel evaluation, we present a new event-based action recognition dataset\n(NeuroHAR) recorded in challenging scenarios. Comprehensive experiments show\nthat EVSTr achieves state-of-the-art performance while maintaining low model\ncomplexity.\n","authors":["Bochen Xie","Yongjian Deng","Zhanpeng Shao","Qingsong Xu","Youfu Li"],"pdf_url":"https://arxiv.org/pdf/2303.03856v3.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2402.14154v3","updated":"2024-09-02T02:41:26Z","published":"2024-02-21T22:27:40Z","title":"MM-Soc: Benchmarking Multimodal Large Language Models in Social Media\n Platforms","summary":" Social media platforms are hubs for multimodal information exchange,\nencompassing text, images, and videos, making it challenging for machines to\ncomprehend the information or emotions associated with interactions in online\nspaces. Multimodal Large Language Models (MLLMs) have emerged as a promising\nsolution to these challenges, yet they struggle to accurately interpret human\nemotions and complex content such as misinformation. This paper introduces\nMM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of\nmultimodal social media content. MM-Soc compiles prominent multimodal datasets\nand incorporates a novel large-scale YouTube tagging dataset, targeting a range\nof tasks from misinformation detection, hate speech detection, and social\ncontext generation. Through our exhaustive evaluation on ten size-variants of\nfour open-source MLLMs, we have identified significant performance disparities,\nhighlighting the need for advancements in models' social understanding\ncapabilities. Our analysis reveals that, in a zero-shot setting, various types\nof MLLMs generally exhibit difficulties in handling social media tasks.\nHowever, MLLMs demonstrate performance improvements post fine-tuning,\nsuggesting potential pathways for improvement. Our code and data are available\nat https://github.com/claws-lab/MMSoc.git.\n","authors":["Yiqiao Jin","Minje Choi","Gaurav Verma","Jindong Wang","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.14154v3.pdf","comment":"In Proceedings of ACL 2024"},{"id":"http://arxiv.org/abs/2408.15063v3","updated":"2024-09-02T02:32:45Z","published":"2024-08-27T13:47:31Z","title":"Adapting Segment Anything Model to Multi-modal Salient Object Detection\n with Semantic Feature Fusion Guidance","summary":" Although most existing multi-modal salient object detection (SOD) methods\ndemonstrate effectiveness through training models from scratch, the limited\nmulti-modal data hinders these methods from reaching optimality. 
In this paper,\nwe propose a novel framework to explore and exploit the powerful feature\nrepresentation and zero-shot generalization ability of the pre-trained Segment\nAnything Model (SAM) for multi-modal SOD. Despite serving as a recent vision\nfundamental model, driving the class-agnostic SAM to comprehend and detect\nsalient objects accurately is non-trivial, especially in challenging scenes. To\nthis end, we develop \\underline{SAM} with se\\underline{m}antic\nf\\underline{e}ature fu\\underline{s}ion guidanc\\underline{e} (Sammese), which\nincorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to\nmulti-modal SOD tasks. However, it is difficult for SAM trained on single-modal\ndata to directly mine the complementary benefits of multi-modal inputs and\ncomprehensively utilize them to achieve accurate saliency prediction. To\naddress these issues, we first design a multi-modal complementary fusion module\nto extract robust multi-modal semantic features by integrating information from\nvisible and thermal or depth image pairs. Then, we feed the extracted\nmulti-modal semantic features into both the SAM image encoder and mask decoder\nfor fine-tuning and prompting, respectively. Specifically, in the image\nencoder, a multi-modal adapter is proposed to adapt the single-modal SAM to\nmulti-modal information. In the mask decoder, a semantic-geometric prompt\ngeneration strategy is proposed to produce corresponding embeddings with\nvarious saliency cues. Extensive experiments on both RGB-D and RGB-T SOD\nbenchmarks show the effectiveness of the proposed framework. The code will be\navailable at \\url{https://github.com/Angknpng/Sammese}.\n","authors":["Kunpeng Wang","Danying Lin","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15063v3.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.02112v2","updated":"2024-09-02T02:20:05Z","published":"2024-02-03T10:35:42Z","title":"S-NeRF++: Autonomous Driving Simulation via Neural Reconstruction and\n Generation","summary":" Autonomous driving simulation system plays a crucial role in enhancing\nself-driving data and simulating complex and rare traffic scenarios, ensuring\nnavigation safety. However, traditional simulation systems, which often heavily\nrely on manual modeling and 2D image editing, struggled with scaling to\nextensive scenes and generating realistic simulation data. In this study, we\npresent S-NeRF++, an innovative autonomous driving simulation system based on\nneural reconstruction. Trained on widely-used self-driving datasets such as\nnuScenes and Waymo, S-NeRF++ can generate a large number of realistic street\nscenes and foreground objects with high rendering quality as well as offering\nconsiderable flexibility in manipulation and simulation. Specifically, S-NeRF++\nis an enhanced neural radiance field for synthesizing large-scale scenes and\nmoving vehicles, with improved scene parameterization and camera pose learning.\nThe system effectively utilizes noisy and sparse LiDAR data to refine training\nand address depth outliers, ensuring high-quality reconstruction and novel-view\nrendering. It also provides a diverse foreground asset bank by reconstructing\nand generating different foreground vehicles to support comprehensive scenario\ncreation.Moreover, we have developed an advanced foreground-background fusion\npipeline that skillfully integrates illumination and shadow effects, further\nenhancing the realism of our simulations. 
With the high-quality simulated data\nprovided by our S-NeRF++, we found the perception methods enjoy performance\nboosts on several autonomous driving downstream tasks, further demonstrating\nour proposed simulator's effectiveness.\n","authors":["Yurui Chen","Junge Zhang","Ziyang Xie","Wenye Li","Feihu Zhang","Jiachen Lu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07704v2","updated":"2024-09-02T02:08:22Z","published":"2023-09-14T13:29:41Z","title":"NutritionVerse: Empirical Study of Various Dietary Intake Estimation\n Approaches","summary":" Accurate dietary intake estimation is critical for informing policies and\nprograms to support healthy eating, as malnutrition has been directly linked to\ndecreased quality of life. However self-reporting methods such as food diaries\nsuffer from substantial bias. Other conventional dietary assessment techniques\nand emerging alternative approaches such as mobile applications incur high time\ncosts and may necessitate trained personnel. Recent work has focused on using\ncomputer vision and machine learning to automatically estimate dietary intake\nfrom food images, but the lack of comprehensive datasets with diverse\nviewpoints, modalities and food annotations hinders the accuracy and realism of\nsuch methods. To address this limitation, we introduce NutritionVerse-Synth,\nthe first large-scale dataset of 84,984 photorealistic synthetic 2D food images\nwith associated dietary information and multimodal annotations (including depth\nimages, instance masks, and semantic masks). Additionally, we collect a real\nimage dataset, NutritionVerse-Real, containing 889 images of 251 dishes to\nevaluate realism. Leveraging these novel datasets, we develop and benchmark\nNutritionVerse, an empirical study of various dietary intake estimation\napproaches, including indirect segmentation-based and direct prediction\nnetworks. We further fine-tune models pretrained on synthetic data with real\nimages to provide insights into the fusion of synthetic and real data. Finally,\nwe release both datasets (NutritionVerse-Synth, NutritionVerse-Real) on\nhttps://www.kaggle.com/nutritionverse/datasets as part of an open initiative to\naccelerate machine learning for dietary sensing.\n","authors":["Chi-en Amy Tai","Matthew Keller","Saeejith Nair","Yuhao Chen","Yifan Wu","Olivia Markham","Krish Parmar","Pengcheng Xi","Heather Keller","Sharon Kirkpatrick","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2309.07704v2.pdf","comment":"Corrections made to Tables 6, 7, and 8, and corrections made to\n Experiments Part C. Additional clarification made in Section 4"},{"id":"http://arxiv.org/abs/2403.10814v2","updated":"2024-09-02T00:54:47Z","published":"2024-03-16T05:21:42Z","title":"DarkGS: Learning Neural Illumination and 3D Gaussians Relighting for\n Robotic Exploration in the Dark","summary":" Humans have the remarkable ability to construct consistent mental models of\nan environment, even under limited or varying levels of illumination. We wish\nto endow robots with this same capability. In this paper, we tackle the\nchallenge of constructing a photorealistic scene representation under poorly\nilluminated conditions and with a moving light source. We approach the task of\nmodeling illumination as a learning problem, and utilize the developed\nillumination model to aid in scene reconstruction. 
We introduce an innovative\nframework that uses a data-driven approach, Neural Light Simulators (NeLiS), to\nmodel and calibrate the camera-light system. Furthermore, we present DarkGS, a\nmethod that applies NeLiS to create a relightable 3D Gaussian scene model\ncapable of real-time, photorealistic rendering from novel viewpoints. We show\nthe applicability and robustness of our proposed simulator and system in a\nvariety of real-world environments.\n","authors":["Tianyi Zhang","Kaining Huang","Weiming Zhi","Matthew Johnson-Roberson"],"pdf_url":"https://arxiv.org/pdf/2403.10814v2.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.12606v2","updated":"2024-09-02T00:52:01Z","published":"2024-08-08T05:04:13Z","title":"Towards Non-invasive and Personalized Management of Breast Cancer\n Patients from Multiparametric MRI via A Large Mixture-of-Modality-Experts\n Model","summary":" Breast magnetic resonance imaging (MRI) is the imaging technique with the\nhighest sensitivity for detecting breast cancer and is routinely used for women\nat high risk. Despite the comprehensive multiparametric protocol of breast MRI,\nexisting artificial intelligence-based studies predominantly rely on single\nsequences and have limited validation. Here we report a large\nmixture-of-modality-experts model (MOME) that integrates multiparametric MRI\ninformation within a unified structure, offering a noninvasive method for\npersonalized breast cancer management. We have curated the largest\nmultiparametric breast MRI dataset, involving 5,205 patients from three\nhospitals in the north, southeast, and southwest of China, for the development\nand extensive evaluation of our model. MOME demonstrated accurate and robust\nidentification of breast cancer. It achieved comparable performance for\nmalignancy recognition to that of four senior radiologists and significantly\noutperformed a junior radiologist, with 0.913 AUROC, 0.948 AUPRC, 0.905 F1\nscore, and 0.723 MCC. Our findings suggest that MOME could reduce the need for\nbiopsies in BI-RADS 4 patients with a ratio of 7.3%, classify triple-negative\nbreast cancer with an AUROC of 0.709, and predict pathological complete\nresponse to neoadjuvant chemotherapy with an AUROC of 0.694. The model further\nsupports scalable and interpretable inference, adapting to missing modalities\nand providing decision explanations by highlighting lesions and measuring\nmodality contributions. MOME exemplifies a discriminative, robust, scalable,\nand interpretable multimodal model, paving the way for noninvasive,\npersonalized management of breast cancer patients based on multiparametric\nbreast imaging data.\n","authors":["Luyang Luo","Mingxiang Wu","Mei Li","Yi Xin","Qiong Wang","Varut Vardhanabhuti","Winnie CW Chu","Zhenhui Li","Juan Zhou","Pranav Rajpurkar","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12606v2.pdf","comment":"27 pages, 8 figures, 10 tables"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2404.07981v2","updated":"2024-09-02T21:29:04Z","published":"2024-04-11T17:57:32Z","title":"Manipulating Large Language Models to Increase Product Visibility","summary":" Large language models (LLMs) are increasingly being integrated into search\nengines to provide natural language responses tailored to user queries.\nCustomers and end-users are also becoming more dependent on these models for\nquick and easy purchase decisions. 
In this work, we investigate whether\nrecommendations from LLMs can be manipulated to enhance a product's visibility.\nWe demonstrate that adding a strategic text sequence (STS) -- a carefully\ncrafted message -- to a product's information page can significantly increase\nits likelihood of being listed as the LLM's top recommendation. To understand\nthe impact of STS, we use a catalog of fictitious coffee machines and analyze\nits effect on two target products: one that seldom appears in the LLM's\nrecommendations and another that usually ranks second. We observe that the\nstrategic text sequence significantly enhances the visibility of both products\nby increasing their chances of appearing as the top recommendation. This\nability to manipulate LLM-generated search responses provides vendors with a\nconsiderable competitive advantage and has the potential to disrupt fair market\ncompetition. Just as search engine optimization (SEO) revolutionized how\nwebpages are customized to rank higher in search engine results, influencing\nLLM recommendations could profoundly impact content optimization for AI-driven\nsearch services. Code for our experiments is available at\nhttps://github.com/aounon/llm-rank-optimizer.\n","authors":["Aounon Kumar","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2404.07981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02935v2","updated":"2024-09-02T20:06:21Z","published":"2022-12-06T12:45:15Z","title":"A multi-language toolkit for supporting automated checking of research\n outputs","summary":" This article presents the automatic checking of research outputs package\nacro, which assists researchers and data governance teams by automatically\napplying best-practice principles-based statistical disclosure control (SDC)\ntechniques on-the-fly as researchers conduct their analyses. acro distinguishes\nbetween: research output that is safe to publish; output that requires further\nanalysis; and output that cannot be published because it creates substantial\nrisk of disclosing private data. This is achieved through the use of a\nlightweight Python wrapper that sits over well-known analysis tools that\nproduce outputs such as tables, plots, and statistical models. This adds\nfunctionality to (i) identify potentially disclosive outputs against a range of\ncommonly used disclosure tests; (ii) apply disclosure mitigation strategies\nwhere required; (iii) report reasons for applying SDC; and (iv) produce simple\nsummary documents trusted research environment staff can use to streamline\ntheir workflow. The major analytical programming languages used by researchers\nare supported: Python, R, and Stata. The acro code and documentation are\navailable under an MIT license at https://github.com/AI-SDC/ACRO\n","authors":["Richard J. Preen","Maha Albashir","Simon Davy","Jim Smith"],"pdf_url":"https://arxiv.org/pdf/2212.02935v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01304v3","updated":"2024-09-02T18:53:22Z","published":"2023-11-02T15:18:00Z","title":"VM-Rec: A Variational Mapping Approach for Cold-start User\n Recommendation","summary":" The cold-start problem is a common challenge for most recommender systems.\nThe practical application of most cold-start methods is hindered by the\ndeficiency in auxiliary content information for users. Moreover, most methods\nnecessitate simultaneous updates to the extensive parameters of recommender\nmodels, leading to significant training costs, particularly in large-scale\nindustrial scenarios. 
We observe that the model can generate expressive\nembeddings for warm users with relatively more interactions. Initially, these\nusers were cold-start users, and after transitioning to warm users, they\nexhibit clustering patterns in their embeddings with consistent initial\ninteractions. Based on this motivation, we propose a Variational Mapping\napproach for cold-start user Recommendation (VM-Rec), mapping from few initial\ninteractions to expressive embeddings for cold-start users. Specifically, we\nencode the initial interactions into a latent representation, where each\ndimension disentangledly signifies the degree of association with each warm\nuser. Subsequently, we utilize this latent representation as the parameters for\nthe mapping function, mapping (decoding) it into an expressive embedding, which\ncan be integrated into a pre-trained recommender model directly. Our method is\nevaluated on three datasets using the same base model, demonstrating superior\nperformance compared to other popular cold-start methods.\n","authors":["Linan Zheng","Jiale Chen","Pengsheng Liu","Guangfa Zhang","Jinyun Fang"],"pdf_url":"https://arxiv.org/pdf/2311.01304v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05141v3","updated":"2024-09-02T10:55:30Z","published":"2024-08-09T15:53:55Z","title":"A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning","summary":" Retrieval-augmented generation (RAG) is a framework enabling large language\nmodels (LLMs) to enhance their accuracy and reduce hallucinations by\nintegrating external knowledge bases. In this paper, we introduce a hybrid RAG\nsystem enhanced through a comprehensive suite of optimizations that\nsignificantly improve retrieval quality, augment reasoning capabilities, and\nrefine numerical computation ability. We refined the text chunks and tables in\nweb pages, added attribute predictors to reduce hallucinations, built an LLM\nKnowledge Extractor and a Knowledge Graph Extractor, and finally devised a\nreasoning strategy that uses all the references. We evaluated our system on the CRAG\ndataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and\nonline evaluations demonstrate that our system significantly enhances complex\nreasoning capabilities. In local evaluations, we significantly improved\naccuracy and reduced error rates compared to the baseline model, achieving a\nnotable increase in scores. Meanwhile, we attained outstanding\nresults in online assessments, demonstrating the performance and generalization\ncapabilities of the proposed system. The source code for our system is released\nat \url{https://gitlab.aicrowd.com/shizueyy/crag-new}.\n","authors":["Ye Yuan","Chengwu Liu","Jingyang Yuan","Gongbo Sun","Siqi Li","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05141v3.pdf","comment":"Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024"},{"id":"http://arxiv.org/abs/2312.12162v2","updated":"2024-09-02T07:58:20Z","published":"2023-12-19T13:51:48Z","title":"PEPT: Expert Finding Meets Personalized Pre-training","summary":" Finding experts is essential in Community Question Answering (CQA) platforms\nas it enables the effective routing of questions to potential users who can\nprovide relevant answers. The key is to learn personalized expert\nrepresentations based on their historically answered questions, and to accurately\nmatch them with target questions. 
There have been some preliminary works\nexploring the usability of PLMs in expert finding, such as pre-training expert\nor question representations. However, these models usually learn pure text\nrepresentations of experts from histories, disregarding personalized and\nfine-grained expert modeling. For alleviating this, we present a personalized\npre-training and fine-tuning paradigm, which could effectively learn expert\ninterest and expertise simultaneously. Specifically, in our pre-training\nframework, we integrate historical answered questions of one expert with one\ntarget question, and regard it as a candidate aware expert-level input unit.\nThen, we fuse expert IDs into the pre-training for guiding the model to model\npersonalized expert representations, which can help capture the unique\ncharacteristics and expertise of each individual expert. Additionally, in our\npre-training task, we design: 1) a question-level masked language model task to\nlearn the relatedness between histories, enabling the modeling of\nquestion-level expert interest; 2) a vote-oriented task to capture\nquestion-level expert expertise by predicting the vote score the expert would\nreceive. Through our pre-training framework and tasks, our approach could\nholistically learn expert representations including interests and expertise.\nOur method has been extensively evaluated on six real-world CQA datasets, and\nthe experimental results consistently demonstrate the superiority of our\napproach over competitive baseline methods.\n","authors":["Qiyao Peng","Hongyan Xu","Yinghui Wang","Hongtao Liu","Cuiying Huo","Wenjun Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01445v1","updated":"2024-09-02T20:00:49Z","published":"2024-09-02T20:00:49Z","title":"Sync from the Sea: Retrieving Alignable Videos from Large-Scale Datasets","summary":" Temporal video alignment aims to synchronize the key events like object\ninteractions or action phase transitions in two videos. Such methods could\nbenefit various video editing, processing, and understanding tasks. However,\nexisting approaches operate under the restrictive assumption that a suitable\nvideo pair for alignment is given, significantly limiting their broader\napplicability. To address this, we re-pose temporal alignment as a search\nproblem and introduce the task of Alignable Video Retrieval (AVR). Given a\nquery video, our approach can identify well-alignable videos from a large\ncollection of clips and temporally synchronize them to the query. To achieve\nthis, we make three key contributions: 1) we introduce DRAQ, a video\nalignability indicator to identify and re-rank the best alignable video from a\nset of candidates; 2) we propose an effective and generalizable frame-level\nvideo feature design to improve the alignment performance of several\noff-the-shelf feature representations, and 3) we propose a novel benchmark and\nevaluation protocol for AVR using cycle-consistency metrics. 
Our experiments on\n3 datasets, including large-scale Kinetics700, demonstrate the effectiveness of\nour approach in identifying alignable video pairs from diverse datasets.\nProject Page: https://daveishan.github.io/avr-webpage/.\n","authors":["Ishan Rajendrakumar Dave","Fabian Caba Heilbron","Mubarak Shah","Simon Jenni"],"pdf_url":"https://arxiv.org/pdf/2409.01445v1.pdf","comment":"ECCV 2024 Oral"},{"id":"http://arxiv.org/abs/2409.01357v1","updated":"2024-09-02T16:19:13Z","published":"2024-09-02T16:19:13Z","title":"Know When to Fuse: Investigating Non-English Hybrid Retrieval in the\n Legal Domain","summary":" Hybrid search has emerged as an effective strategy to offset the limitations\nof different matching paradigms, especially in out-of-domain contexts where\nnotable improvements in retrieval quality have been observed. However, existing\nresearch predominantly focuses on a limited set of retrieval methods, evaluated\nin pairs on domain-general datasets exclusively in English. In this work, we\nstudy the efficacy of hybrid search across a variety of prominent retrieval\nmodels within the unexplored field of law in the French language, assessing\nboth zero-shot and in-domain scenarios. Our findings reveal that in a zero-shot\ncontext, fusing different domain-general models consistently enhances\nperformance compared to using a standalone model, regardless of the fusion\nmethod. Surprisingly, when models are trained in-domain, we find that fusion\ngenerally diminishes performance relative to using the best single system,\nunless fusing scores with carefully tuned weights. These novel insights, among\nothers, expand the applicability of prior findings across a new field and\nlanguage, and contribute to a deeper understanding of hybrid search in\nnon-English specialized domains.\n","authors":["Antoine Louis","Gijs van Dijck","Gerasimos Spanakis"],"pdf_url":"https://arxiv.org/pdf/2409.01357v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.01192v1","updated":"2024-09-02T11:58:56Z","published":"2024-09-02T11:58:56Z","title":"SSD4Rec: A Structured State Space Duality Model for Efficient Sequential\n Recommendation","summary":" Sequential recommendation methods are crucial in modern recommender systems\nfor their remarkable capability to understand a user's changing interests based\non past interactions. However, a significant challenge faced by current methods\n(e.g., RNN- or Transformer-based models) is to effectively and efficiently\ncapture users' preferences by modeling long behavior sequences, which impedes\ntheir various applications like short video platforms where user interactions\nare numerous. Recently, an emerging architecture named Mamba, built on state\nspace models (SSM) with efficient hardware-aware designs, has showcased the\ntremendous potential for sequence modeling, presenting a compelling avenue for\naddressing the challenge effectively. Inspired by this, we propose a novel\ngeneric and efficient sequential recommendation backbone, SSD4Rec, which\nexplores the seamless adaptation of Mamba for sequential recommendations.\nSpecifically, SSD4Rec marks the variable- and long-length item sequences with\nsequence registers and processes the item representations with bidirectional\nStructured State Space Duality (SSD) blocks. This not only allows for\nhardware-aware matrix multiplication but also empowers outstanding capabilities\nin variable-length and long-range sequence modeling. 
Extensive evaluations on\nfour benchmark datasets demonstrate that the proposed model achieves\nstate-of-the-art performance while maintaining near-linear scalability with\nuser sequence length. Our code is publicly available at\nhttps://github.com/ZhangYifeng1995/SSD4Rec.\n","authors":["Haohao Qu","Yifeng Zhang","Liangbo Ning","Wenqi Fan","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2409.01192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01152v1","updated":"2024-09-02T10:37:53Z","published":"2024-09-02T10:37:53Z","title":"Real World Conversational Entity Linking Requires More Than Zeroshots","summary":" Entity linking (EL) in conversations faces notable challenges in practical\napplications, primarily due to the scarcity of entity-annotated conversational\ndatasets and sparse knowledge bases (KB) containing domain-specific, long-tail\nentities. We designed targeted evaluation scenarios to measure the efficacy of\nEL models under resource constraints. Our evaluation employs two KBs: Fandom,\nexemplifying real-world EL complexities, and the widely used Wikipedia. First,\nwe assess EL models' ability to generalize to a new unfamiliar KB using Fandom\nand a novel zero-shot conversational entity linking dataset that we curated\nbased on Reddit discussions on Fandom entities. We then evaluate the\nadaptability of EL models to conversational settings without prior training.\nOur results indicate that current zero-shot EL models falter when introduced to\nnew, domain-specific KBs without prior training, significantly dropping in\nperformance. Our findings reveal that previous evaluation approaches fall short\nof capturing real-world complexities for zero-shot EL, highlighting the\nnecessity for new approaches to design and assess conversational EL models to\nadapt to limited resources. The evaluation setup and the dataset proposed in\nthis research are made publicly available.\n","authors":["Mohanna Hoveyda","Arjen P. de Vries","Maarten de Rijke","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2409.01152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01140v1","updated":"2024-09-02T10:20:35Z","published":"2024-09-02T10:20:35Z","title":"LLM-PQA: LLM-enhanced Prediction Query Answering","summary":" The advent of Large Language Models (LLMs) provides an opportunity to change\nthe way queries are processed, moving beyond the constraints of conventional\nSQL-based database systems. However, using an LLM to answer a prediction query\nis still challenging, since an external ML model has to be employed and\ninference has to be performed in order to provide an answer. This paper\nintroduces LLM-PQA, a novel tool that addresses prediction queries formulated\nin natural language. LLM-PQA is the first to combine the capabilities of LLMs\nand retrieval-augmented mechanism for the needs of prediction queries by\nintegrating data lakes and model zoos. This integration provides users with\naccess to a vast spectrum of heterogeneous data and diverse ML models,\nfacilitating dynamic prediction query answering. 
In addition, LLM-PQA can\ndynamically train models on demand, based on specific query requirements,\nensuring reliable and relevant results even when no pre-trained model in the\nmodel zoo is available for the task.\n","authors":["Ziyu Li","Wenjie Zhao","Asterios Katsifodimos","Rihan Hai"],"pdf_url":"https://arxiv.org/pdf/2409.01140v1.pdf","comment":"This paper is accepted as a demo at CIKM 2024"},{"id":"http://arxiv.org/abs/2409.01082v1","updated":"2024-09-02T09:10:47Z","published":"2024-09-02T09:10:47Z","title":"Evidential Transformers for Improved Image Retrieval","summary":" We introduce the Evidential Transformer, an uncertainty-driven transformer\nmodel for improved and robust image retrieval. In this paper, we make several\ncontributions to content-based image retrieval (CBIR). We incorporate\nprobabilistic methods into image retrieval, achieving robust and reliable\nresults, with evidential classification surpassing traditional training based\non multiclass classification as a baseline for deep metric learning.\nFurthermore, we improve the state-of-the-art retrieval results on several\ndatasets by leveraging the Global Context Vision Transformer (GC ViT)\narchitecture. Our experimental results consistently demonstrate the reliability\nof our approach, setting a new benchmark in CBIR in all test settings on the\nStanford Online Products (SOP) and CUB-200-2011 datasets.\n","authors":["Danilo Dordevic","Suryansh Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.01082v1.pdf","comment":"6 pages, 6 figures, To be presented at the 3rd Workshop on\n Uncertainty Quantification for Computer Vision, at the ECCV 2024 conference\n in Milan, Italy"},{"id":"http://arxiv.org/abs/2409.01012v1","updated":"2024-09-02T07:44:48Z","published":"2024-09-02T07:44:48Z","title":"Improved Diversity-Promoting Collaborative Metric Learning for\n Recommendation","summary":" Collaborative Metric Learning (CML) has recently emerged as a popular method\nin recommendation systems (RS), closing the gap between metric learning and\ncollaborative filtering. Following the convention of RS, existing practices\nexploit a unique user representation in their model design. This paper focuses on\na challenging scenario where a user has multiple categories of interests. Under\nthis setting, the unique user representation might induce preference bias,\nespecially when the item category distribution is imbalanced. To address this\nissue, we propose a novel method called \textit{Diversity-Promoting\nCollaborative Metric Learning} (DPCML), with the hope of considering the\ncommonly ignored minority interest of the user. The key idea behind DPCML is to\nintroduce a set of multiple representations for each user in the system where\nusers' preference toward an item is aggregated by taking the minimum item-user\ndistance among their embedding set. Specifically, we instantiate two effective\nassignment strategies to explore a proper quantity of vectors for each user.\nMeanwhile, a \textit{Diversity Control Regularization Scheme} (DCRS) is\ndeveloped to accommodate the multi-vector representation strategy better.\nTheoretically, we show that DPCML could induce a smaller generalization error\nthan traditional CML. Furthermore, we notice that CML-based approaches usually\nrequire \textit{negative sampling} to reduce the heavy computational burden\ncaused by the pairwise objective therein. 
In this paper, we reveal the\nfundamental limitation of the widely adopted hard-aware sampling from the\nOne-Way Partial AUC (OPAUC) perspective and then develop an effective sampling\nalternative for the CML-based paradigm. Finally, comprehensive experiments over\na range of benchmark datasets speak to the efficacy of DPCML. Code is\navailable at \url{https://github.com/statusrank/LibCML}.\n","authors":["Shilong Bao","Qianqian Xu","Zhiyong Yang","Yuan He","Xiaochun Cao","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2409.01012v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.15292"},{"id":"http://arxiv.org/abs/2409.00890v1","updated":"2024-09-02T01:54:33Z","published":"2024-09-02T01:54:33Z","title":"Towards Investigating Biases in Spoken Conversational Search","summary":" Voice-based systems like Amazon Alexa, Google Assistant, and Apple Siri,\nalong with the growing popularity of OpenAI's ChatGPT and Microsoft's Copilot,\nserve diverse populations, including visually impaired and low-literacy\ncommunities. This reflects a shift in user expectations from traditional search\nto more interactive question-answering models. However, presenting information\neffectively in voice-only channels remains challenging due to their linear\nnature. This limitation can impact the presentation of complex queries\ninvolving controversial topics with multiple perspectives. Failing to present\ndiverse viewpoints may perpetuate or introduce biases and affect user\nattitudes. Balancing information load and addressing biases is crucial in\ndesigning a fair and effective voice-based system. To address this, we (i)\nreview how biases and user attitude changes have been studied in screen-based\nweb search, (ii) address challenges in studying these changes in voice-based\nsettings like SCS, (iii) outline research questions, and (iv) propose an\nexperimental setup with variables, data, and instruments to explore biases in a\nvoice-based setting like Spoken Conversational Search.\n","authors":["Sachin Pathiyan Cherumanal","Falk Scholer","Johanne R. Trippas","Damiano Spina"],"pdf_url":"https://arxiv.org/pdf/2409.00890v1.pdf","comment":"Accepted Late-Breaking Results at ACM ICMI Companion 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.14211v2","updated":"2024-09-02T23:41:47Z","published":"2024-07-19T11:17:42Z","title":"Advanced Predictive Modeling for Enhanced Mortality Prediction in ICU\n Stroke Patients Using Clinical Data","summary":" Background: Stroke is the second-leading cause of disability and death among\nadults. Approximately 17 million people suffer from a stroke annually, with\nabout 85% being ischemic strokes. Predicting mortality of ischemic stroke\npatients in the intensive care unit (ICU) is crucial for optimizing treatment\nstrategies, allocating resources, and improving survival rates. Methods: We\nacquired data on ICU ischemic stroke patients from the MIMIC-IV database, including\ndiagnoses, vital signs, laboratory tests, medications, procedures, treatments,\nand clinical notes. Stroke patients were randomly divided into training (70%,\nn=2441), test (15%, n=523), and validation (15%, n=523) sets. To address data\nimbalances, we applied the Synthetic Minority Over-sampling Technique (SMOTE). We\nselected 30 features for model development, significantly reducing the number of\nfeatures from the 1095 used in the best previous study. We developed a deep learning model to\nassess mortality risk and implemented several baseline machine learning models\nfor comparison. 
Results: The XGB-DL model, combining XGBoost for feature selection\nand deep learning, effectively minimized false positives. The model's AUROC\nimproved from 0.865 (95% CI: 0.821 - 0.905) on the first day to 0.903 (95% CI:\n0.868 - 0.936) by the fourth day, using data from 3,646 ICU mortality patients in\nthe MIMIC-IV database, with an AUROC of 0.945 (95% CI: 0.944 - 0.947) during training.\nAlthough other ML models also performed well in terms of AUROC, we chose Deep\nLearning for its higher specificity. Conclusions: Through enhanced feature\nselection and data cleaning, the proposed model demonstrates a 13% AUROC\nimprovement compared to existing models while reducing the number of features from 1095\nin previous studies to 30.\n","authors":["Armin Abdollahi","Negin Ashrafi","Maryam Pishgar"],"pdf_url":"https://arxiv.org/pdf/2407.14211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04845v2","updated":"2024-09-02T22:43:33Z","published":"2024-02-07T13:44:47Z","title":"AlphaFold Meets Flow Matching for Generating Protein Ensembles","summary":" The biological functions of proteins often depend on dynamic structural\nensembles. In this work, we develop a flow-based generative modeling approach\nfor learning and sampling the conformational landscapes of proteins. We\nrepurpose highly accurate single-state predictors such as AlphaFold and ESMFold\nand fine-tune them under a custom flow matching framework to obtain\nsequence-conditioned generative models of protein structure called AlphaFlow and\nESMFlow. When trained and evaluated on the PDB, our method provides a superior\ncombination of precision and diversity compared to AlphaFold with MSA\nsubsampling. When further trained on ensembles from all-atom MD, our method\naccurately captures conformational flexibility, positional distributions, and\nhigher-order ensemble observables for unseen proteins. Moreover, our method can\ndiversify a static PDB structure with faster wall-clock convergence to certain\nequilibrium properties than replicate MD trajectories, demonstrating its\npotential as a proxy for expensive physics-based simulations. Code is available\nat https://github.com/bjing2016/alphaflow.\n","authors":["Bowen Jing","Bonnie Berger","Tommi Jaakkola"],"pdf_url":"https://arxiv.org/pdf/2402.04845v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.14099v3","updated":"2024-09-02T21:40:41Z","published":"2024-05-23T02:01:05Z","title":"Automatic Differentiation is Essential in Training Neural Networks for\n Solving Differential Equations","summary":" Neural network-based approaches have recently shown significant promise in\nsolving partial differential equations (PDEs) in science and engineering,\nespecially in scenarios featuring complex domains or incorporation of empirical\ndata. One advantage of the neural network methods for PDEs lies in their\nautomatic differentiation (AD), which necessitates only the sample points\nthemselves, unlike traditional finite difference (FD) approximations that\nrequire nearby local points to compute derivatives. In this paper, we\nquantitatively demonstrate the advantage of AD in training neural networks. The\nconcept of truncated entropy is introduced to characterize the training\nproperty. 
Specifically, through comprehensive experimental and theoretical\nanalyses conducted on random feature models and two-layer neural networks, we\ndiscover that the defined truncated entropy serves as a reliable metric for\nquantifying the residual loss of random feature models and the training speed\nof neural networks for both AD and FD methods. Our experimental and theoretical\nanalyses demonstrate that, from a training perspective, AD outperforms FD in\nsolving PDEs.\n","authors":["Chuqi Chen","Yahong Yang","Yang Xiang","Wenrui Hao"],"pdf_url":"https://arxiv.org/pdf/2405.14099v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07820v3","updated":"2024-09-02T21:17:39Z","published":"2023-01-18T23:16:53Z","title":"On the limits of neural network explainability via descrambling","summary":" We characterize the exact solutions to neural network descrambling--a\nmathematical model for explaining the fully connected layers of trained neural\nnetworks (NNs). By reformulating the problem to the minimization of the\nBrockett function arising in graph matching and complexity theory we show that\nthe principal components of the hidden layer preactivations can be\ncharacterized as the optimal explainers or descramblers for the layer weights,\nleading to descrambled weight matrices. We show that in typical deep learning\ncontexts these descramblers take diverse and interesting forms including (1)\nmatching largest principal components with the lowest frequency modes of the\nFourier basis for isotropic hidden data, (2) discovering the semantic\ndevelopment in two-layer linear NNs for signal recovery problems, and (3)\nexplaining CNNs by optimally permuting the neurons. Our numerical experiments\nindicate that the eigendecompositions of the hidden layer data--now understood\nas the descramblers--can also reveal the layer's underlying transformation.\nThese results illustrate that the SVD is more directly related to the\nexplainability of NNs than previously thought and offers a promising avenue for\ndiscovering interpretable motifs for the hidden action of NNs, especially in\ncontexts of operator learning or physics-informed NNs, where the input/output\ndata has limited human readability.\n","authors":["Shashank Sule","Richard G. Spencer","Wojciech Czaja"],"pdf_url":"https://arxiv.org/pdf/2301.07820v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07529v3","updated":"2024-09-02T20:42:08Z","published":"2024-06-11T17:55:25Z","title":"MAP: Low-compute Model Merging with Amortized Pareto Fronts via\n Quadratic Approximation","summary":" Model merging has emerged as an effective approach to combine multiple\nsingle-task models, fine-tuned from the same pre-trained model, into a\nmultitask model. This process typically involves computing a weighted average\nof the model parameters without any additional training. Existing model-merging\nmethods focus on enhancing average task accuracy. However, interference and\nconflicts between the objectives of different tasks can lead to trade-offs\nduring model merging. In real-world applications, a set of solutions with\nvarious trade-offs can be more informative, helping practitioners make\ndecisions based on diverse preferences. In this paper, we introduce a novel\nlow-compute algorithm, Model Merging with Amortized Pareto Front (MAP). MAP\nidentifies a Pareto set of scaling coefficients for merging multiple models to\nreflect the trade-offs. 
The core component of MAP is approximating the\nevaluation metrics of the various tasks using a quadratic approximation\nsurrogate model derived from a pre-selected set of scaling coefficients,\nenabling amortized inference. Experimental results on vision and natural\nlanguage processing tasks show that MAP can accurately identify the Pareto\nfront. To further reduce the required computation of MAP, we propose (1) a\nBayesian adaptive sampling algorithm and (2) a nested merging scheme with\nmultiple stages.\n","authors":["Lu Li","Tianyu Zhang","Zhiqi Bu","Suyuchen Wang","Huan He","Jie Fu","Yonghui Wu","Jiang Bian","Yong Chen","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2406.07529v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17095v2","updated":"2024-09-02T20:33:49Z","published":"2024-08-30T08:26:55Z","title":"RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation\n and Retrieval-Guidance","summary":" Diffusion-based models demonstrate impressive generation capabilities.\nHowever, they also have a massive number of parameters, resulting in enormous\nmodel sizes, thus making them unsuitable for deployment on resource-constraint\ndevices. Block-wise generation can be a promising alternative for designing\ncompact-sized (parameter-efficient) deep generative models since the model can\ngenerate one block at a time instead of generating the whole image at once.\nHowever, block-wise generation is also considerably challenging because\nensuring coherence across generated blocks can be non-trivial. To this end, we\ndesign a retrieval-augmented generation (RAG) approach and leverage the\ncorresponding blocks of the images retrieved by the RAG module to condition the\ntraining and generation stages of a block-wise denoising diffusion model. Our\nconditioning schemes ensure coherence across the different blocks during\ntraining and, consequently, during generation. While we showcase our approach\nusing the latent diffusion model (LDM) as the base model, it can be used with\nother variants of denoising diffusion models. We validate the solution of the\ncoherence problem through the proposed approach by reporting substantive\nexperiments to demonstrate our approach's effectiveness in compact model size\nand excellent generation quality.\n","authors":["Avideep Mukherjee","Soumya Banerjee","Piyush Rai","Vinay P. Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2408.17095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19289v4","updated":"2024-09-02T20:21:29Z","published":"2024-03-28T10:19:36Z","title":"Uplift Modeling Under Limited Supervision","summary":" Estimating causal effects in e-commerce tends to involve costly treatment\nassignments which can be impractical in large-scale settings. Leveraging\nmachine learning to predict such treatment effects without actual intervention\nis a standard practice to diminish the risk. 
However, existing methods for\ntreatment effect prediction tend to rely on training sets of substantial size,\nwhich are built from real experiments and are thus inherently risky to create.\nIn this work we propose a graph neural network to diminish the required\ntraining set size, relying on graphs that are common in e-commerce data.\nSpecifically, we view the problem as node regression with a restricted number\nof labeled instances, develop a two-model neural architecture akin to previous\ncausal effect estimators, and test varying message-passing layers for encoding.\nFurthermore, as an extra step, we combine the model with an acquisition\nfunction to guide the creation of the training set in settings with extremely\nlow experimental budget. The framework is flexible since each step can be used\nseparately with other models or treatment policies. The experiments on real\nlarge-scale networks indicate a clear advantage of our methodology over the\nstate of the art, which in many cases performs close to random, underlining the\nneed for models that can generalize with limited supervision to reduce\nexperimental risks.\n","authors":["George Panagopoulos","Daniele Malitesta","Fragkiskos D. Malliaros","Jun Pang"],"pdf_url":"https://arxiv.org/pdf/2403.19289v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04118v2","updated":"2024-09-02T18:03:26Z","published":"2024-03-07T00:20:11Z","title":"Globally Stable Neural Imitation Policies","summary":" Imitation learning presents an effective approach to alleviate the\nresource-intensive and time-consuming nature of policy learning from scratch in\nthe solution space. Even though the resulting policy can mimic expert\ndemonstrations reliably, it often lacks predictability in unexplored regions of\nthe state-space, giving rise to significant safety concerns in the face of\nperturbations. To address these challenges, we introduce the Stable Neural\nDynamical System (SNDS), an imitation learning regime which produces a policy\nwith formal stability guarantees. We deploy a neural policy architecture that\nfacilitates the representation of stability based on Lyapunov theorem, and\njointly train the policy and its corresponding Lyapunov candidate to ensure\nglobal stability. We validate our approach by conducting extensive experiments\nin simulation and successfully deploying the trained policies on a real-world\nmanipulator arm. The experimental results demonstrate that our method overcomes\nthe instability, accuracy, and computational intensity problems associated with\nprevious imitation learning methods, making our method a promising solution for\nstable policy learning in complex planning scenarios.\n","authors":["Amin Abyaneh","Mariana Sosa Guzmán","Hsiu-Chin Lin"],"pdf_url":"https://arxiv.org/pdf/2403.04118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21009v2","updated":"2024-09-02T18:01:44Z","published":"2024-07-30T17:55:36Z","title":"AI-Assisted Generation of Difficult Math Questions","summary":" Current LLM training positions mathematical reasoning as a core capability.\nWith publicly available sources fully tapped, there is unmet demand for diverse\nand challenging math questions. Relying solely on human experts is both\ntime-consuming and costly, while LLM-generated questions often lack the\nrequisite diversity and difficulty. We present a design framework that combines\nthe strengths of LLMs with a human-in-the-loop approach to generate a diverse\narray of challenging math questions. 
We leverage the metacognition skills\n[Didolkar et al., 2024] of a strong LLM to extract core \"skills\" from existing\nmath datasets. These skills serve as the basis for generating novel and\ndifficult questions by prompting the LLM with random pairs of core skills. The\nuse of two different skills within each question makes finding such questions\nan \"out of distribution\" task for both LLMs and humans. Our pipeline employs\nLLMs to iteratively generate and refine questions and solutions through\nmultiturn prompting. Human annotators then verify and further refine the\nquestions, with their efficiency enhanced via further LLM interactions.\nApplying this pipeline on skills extracted from the MATH dataset [Hendrycks et\nal., 2021] resulted in MATH$^2$ - a dataset of higher-quality math questions,\nas evidenced by: (a) lower performance of all models on MATH$^2$ than on MATH,\nand (b) higher performance on MATH when using MATH$^2$ questions as in-context\nexamples. Although focused on mathematics, our methodology seems applicable to\nother domains requiring structured reasoning, and potentially as a component of\nscalable oversight. Also of interest is a striking relationship observed\nbetween models' performance on the new dataset: the success rate on MATH$^2$ is\nthe square of the success rate on MATH, suggesting that successfully solving a question in\nMATH$^2$ requires a nontrivial combination of two distinct math skills.\n","authors":["Vedant Shah","Dingli Yu","Kaifeng Lyu","Simon Park","Nan Rosemary Ke","Michael Mozer","Yoshua Bengio","Sanjeev Arora","Anirudh Goyal"],"pdf_url":"https://arxiv.org/pdf/2407.21009v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00872v2","updated":"2024-09-02T17:41:24Z","published":"2024-08-01T18:46:05Z","title":"Online Detection of Anomalies in Temporal Knowledge Graphs with\n Interpretability","summary":" Temporal knowledge graphs (TKGs) are valuable resources for capturing\nevolving relationships among entities, yet they are often plagued by noise,\nnecessitating robust anomaly detection mechanisms. Existing dynamic graph\nanomaly detection approaches struggle to capture the rich semantics introduced\nby node and edge categories within TKGs, while TKG embedding methods lack\ninterpretability, undermining the credibility of anomaly detection. Moreover,\nthese methods falter in adapting to pattern changes and semantic drifts\nresulting from knowledge updates. To tackle these challenges, we introduce\nAnoT, an efficient TKG summarization method tailored for interpretable online\nanomaly detection in TKGs. AnoT begins by summarizing a TKG into a novel rule\ngraph, enabling flexible inference of complex patterns in TKGs. When new\nknowledge emerges, AnoT maps it onto a node in the rule graph and traverses the\nrule graph recursively to derive the anomaly score of the knowledge. The\ntraversal yields reachable nodes that furnish interpretable evidence for the\nvalidity or anomalousness of the new knowledge. Overall, AnoT embodies a\ndetector-updater-monitor architecture, encompassing a detector for offline TKG\nsummarization and online scoring, an updater for real-time rule graph updates\nbased on emerging knowledge, and a monitor for estimating the approximation\nerror of the rule graph. Experimental results on four real-world datasets\ndemonstrate that AnoT surpasses existing methods significantly in terms of\naccuracy and interpretability. 
All of the raw datasets and the implementation\nof AnoT are provided in https://github.com/zjs123/ANoT.\n","authors":["Jiasheng Zhang","Rex Ying","Jie Shao"],"pdf_url":"https://arxiv.org/pdf/2408.00872v2.pdf","comment":"26 pages, 10 figures. Accepted by SIGMOD 2025"},{"id":"http://arxiv.org/abs/2407.04268v3","updated":"2024-09-02T17:13:22Z","published":"2024-07-05T05:45:34Z","title":"NeuFair: Neural Network Fairness Repair with Dropout","summary":" This paper investigates neuron dropout as a post-processing bias mitigation\nfor deep neural networks (DNNs). Neural-driven software solutions are\nincreasingly applied in socially critical domains with significant fairness\nimplications. While neural networks are exceptionally good at finding\nstatistical patterns from data, they may encode and amplify existing biases\nfrom the historical data. Existing bias mitigation algorithms often require\nmodifying the input dataset or the learning algorithms. We posit that the\nprevalent dropout methods that prevent over-fitting during training by randomly\ndropping neurons may be an effective and less intrusive approach to improve the\nfairness of pre-trained DNNs. However, finding the ideal set of neurons to drop\nis a combinatorial problem. We propose NeuFair, a family of post-processing\nrandomized algorithms that mitigate unfairness in pre-trained DNNs via dropouts\nduring inference after training. Our randomized search is guided by an\nobjective to minimize discrimination while maintaining the model's utility. We\nshow that our design of randomized algorithms is effective and efficient in\nimproving fairness (up to 69%) with minimal or no model performance\ndegradation. We provide intuitive explanations of these phenomena and carefully\nexamine the influence of various hyperparameters of search algorithms on the\nresults. Finally, we empirically and conceptually compare NeuFair to different\nstate-of-the-art bias mitigators.\n","authors":["Vishnu Asutosh Dasu","Ashish Kumar","Saeid Tizpaz-Niari","Gang Tan"],"pdf_url":"https://arxiv.org/pdf/2407.04268v3.pdf","comment":"Paper accepted at ACM ISSTA 2024"},{"id":"http://arxiv.org/abs/2312.10108v2","updated":"2024-09-02T17:00:21Z","published":"2023-12-15T06:30:55Z","title":"Privacy-Aware Document Visual Question Answering","summary":" Document Visual Question Answering (DocVQA) has quickly grown into a central\ntask of document understanding. But despite the fact that documents contain\nsensitive or copyrighted information, none of the current DocVQA methods offers\nstrong privacy guarantees. In this work, we explore privacy in the domain of\nDocVQA for the first time, highlighting privacy issues in state of the art\nmulti-modal LLM models used for DocVQA, and explore possible solutions.\nSpecifically, we focus on invoice processing as a realistic document\nunderstanding scenario, and propose a large scale DocVQA dataset comprising\ninvoice documents and associated questions and answers. We employ a federated\nlearning scheme, that reflects the real-life distribution of documents in\ndifferent businesses, and we explore the use case where the data of the invoice\nprovider is the sensitive information to be protected. We demonstrate that\nnon-private models tend to memorise, a behaviour that can lead to exposing\nprivate information. 
We then evaluate baseline training schemes employing\nfederated learning and differential privacy in this multi-modal scenario, where\nthe sensitive information might be exposed through either or both of the two\ninput modalities: vision (document image) or language (OCR tokens). Finally, we\ndesign attacks exploiting the memorisation effect of the model, and demonstrate\ntheir effectiveness in probing a representative DocVQA models.\n","authors":["Rubèn Tito","Khanh Nguyen","Marlon Tobaben","Raouf Kerkouche","Mohamed Ali Souibgui","Kangsoo Jung","Joonas Jälkö","Vincent Poulain D'Andecy","Aurelie Joseph","Lei Kang","Ernest Valveny","Antti Honkela","Mario Fritz","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2312.10108v2.pdf","comment":"35 pages, 12 figures, accepted for publication at the 18th\n International Conference on Document Analysis and Recognition, ICDAR 2024"},{"id":"http://arxiv.org/abs/2408.13295v2","updated":"2024-09-02T17:00:05Z","published":"2024-08-23T14:47:10Z","title":"Exploring Bias and Prediction Metrics to Characterise the Fairness of\n Machine Learning for Equity-Centered Public Health Decision-Making: A\n Narrative Review","summary":" Background: The rapid advancement of Machine Learning (ML) represents novel\nopportunities to enhance public health research, surveillance, and\ndecision-making. However, there is a lack of comprehensive understanding of\nalgorithmic bias, systematic errors in predicted population health outcomes,\nresulting from the public health application of ML. The objective of this\nnarrative review is to explore the types of bias generated by ML and\nquantitative metrics to assess these biases.\n Methods : We performed search on PubMed, MEDLINE, IEEE (Institute of\nElectrical and Electronics Engineers), ACM (Association for Computing\nMachinery) Digital Library, Science Direct, and Springer Nature. We used\nkeywords to identify studies describing types of bias and metrics to measure\nthese in the domain of ML and public and population health published in English\nbetween 2008 and 2023, inclusive.\n Results: A total of 72 articles met the inclusion criteria. Our review\nidentified the commonly described types of bias and quantitative metrics to\nassess these biases from an equity perspective.\n Conclusion : The review will help formalize the evaluation framework for ML\non public health from an equity perspective.\n","authors":["Shaina Raza","Arash Shaban-Nejad","Elham Dolatabadi","Hiroshi Mamiya"],"pdf_url":"https://arxiv.org/pdf/2408.13295v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.16154v2","updated":"2024-09-02T16:58:16Z","published":"2024-08-28T22:14:44Z","title":"Does Data-Efficient Generalization Exacerbate Bias in Foundation Models?","summary":" Foundation models have emerged as robust models with label efficiency in\ndiverse domains. In medical imaging, these models contribute to the advancement\nof medical diagnoses due to the difficulty in obtaining labeled data. However,\nit is unclear whether using a large amount of unlabeled data, biased by the\npresence of sensitive attributes during pre-training, influences the fairness\nof the model. This research examines the bias in the Foundation model\n(RetFound) when it is applied to fine-tune the Brazilian Multilabel\nOphthalmological Dataset (BRSET), which has a different population than the\npre-training dataset. 
The model evaluation, in comparison with supervised\nlearning, shows that the Foundation Model has the potential to reduce the gap\nbetween the maximum AUC and minimum AUC evaluations across gender and age\ngroups. However, in data-efficient generalization, the model increases its\nbias when the amount of data decreases. These findings suggest that when deploying\na Foundation Model in real-life scenarios with limited data, the possibility of\nfairness issues should be considered.\n","authors":["Dilermando Queiroz","Anderson Carlos","Maíra Fatoretto","Luis Filipe Nakayama","André Anjos","Lilian Berton"],"pdf_url":"https://arxiv.org/pdf/2408.16154v2.pdf","comment":"Preprint of paper to be presented at Fairness and Ethics Towards\n Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during\n ECCV 2024"},{"id":"http://arxiv.org/abs/2408.09493v2","updated":"2024-09-02T16:19:25Z","published":"2024-08-18T14:16:55Z","title":"Ancestral Reinforcement Learning: Unifying Zeroth-Order Optimization and\n Genetic Algorithms for Reinforcement Learning","summary":" Reinforcement Learning (RL) offers a fundamental framework for discovering\noptimal action strategies through interactions within unknown environments.\nRecent advances have shown that the performance and applicability of RL can\nbe significantly enhanced by exploiting a population of agents in various ways.\nZeroth-Order Optimization (ZOO) leverages an agent population to estimate the\ngradient of the objective function, enabling robust policy refinement even in\nnon-differentiable scenarios. As another application, Genetic Algorithms (GA)\nboost the exploration of policy landscapes by mutational generation of policy\ndiversity in an agent population and its refinement by selection. A natural\nquestion is whether the agent population can have the best of both worlds. In this work, we propose Ancestral Reinforcement Learning\n(ARL), which synergistically combines the robust gradient estimation of ZOO\nwith the exploratory power of GA. The key idea in ARL is that each agent within\na population infers the gradient by exploiting the history of its ancestors, i.e.,\nthe ancestor population in the past, while maintaining the diversity of\npolicies in the current population as in GA. We also theoretically reveal that\nthe populational search in ARL implicitly induces the KL-regularization of the\nobjective function, resulting in enhanced exploration. Our results extend\nthe applicability of populational algorithms for RL.\n","authors":["So Nakashima","Tetsuya J. Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2408.09493v2.pdf","comment":"16pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.17708v4","updated":"2024-09-02T15:23:52Z","published":"2023-03-30T21:00:38Z","title":"Analysis of Failures and Risks in Deep Learning Model Converters: A Case\n Study in the ONNX Ecosystem","summary":" Software engineers develop, fine-tune, and deploy deep learning (DL) models\nusing a variety of development frameworks and runtime environments. DL model\nconverters move models between frameworks and to runtime environments.\nConversion errors compromise model quality and disrupt deployment. However, the\nfailure characteristics of DL model converters are unknown, adding risk when\nusing DL interoperability technologies.\n This paper analyzes failures in DL model converters. 
We survey software\nengineers about DL interoperability tools, use cases, and pain points (N=92).\nThen, we characterize failures in model converters associated with the main\ninteroperability tool, ONNX (N=200 issues in PyTorch and TensorFlow). Finally,\nwe formulate and test two hypotheses about structural causes for the failures\nwe studied. We find that the node conversion stage of a model converter\naccounts for ~75% of the defects and 33% of reported failure are related to\nsemantically incorrect models. The cause of semantically incorrect models is\nelusive, but models with behaviour inconsistencies share operator sequences.\nOur results motivate future research on making DL interoperability software\nsimpler to maintain, extend, and validate. Research into behavioural tolerances\nand architectural coverage metrics could be fruitful.\n","authors":["Purvish Jajal","Wenxin Jiang","Arav Tewari","Erik Kocinare","Joseph Woo","Anusha Sarraf","Yung-Hsiang Lu","George K. Thiruvathukal","James C. Davis"],"pdf_url":"https://arxiv.org/pdf/2303.17708v4.pdf","comment":"[ISSTA'24] Proceedings of the 33rd ACM SIGSOFT International\n Symposium on Software Testing and Analysis (ISSTA) 2024"},{"id":"http://arxiv.org/abs/2405.15444v3","updated":"2024-09-02T15:09:05Z","published":"2024-05-24T11:20:41Z","title":"HyperInterval: Hypernetwork approach to training weight interval regions\n in continual learning","summary":" Recently, a new Continual Learning (CL) paradigm was presented to control\ncatastrophic forgetting, called Interval Continual Learning (InterContiNet),\nwhich relies on enforcing interval constraints on the neural network parameter\nspace. Unfortunately, InterContiNet training is challenging due to the high\ndimensionality of the weight space, making intervals difficult to manage. To\naddress this issue, we introduce \\our{} \\footnote{The source code is available\nat https://github.com/gmum/HyperInterval}, a technique that employs interval\narithmetic within the embedding space and utilizes a hypernetwork to map these\nintervals to the target network parameter space. We train interval embeddings\nfor consecutive tasks and train a hypernetwork to transform these embeddings\ninto weights of the target network. An embedding for a given task is trained\nalong with the hypernetwork, preserving the response of the target network for\nthe previous task embeddings. Interval arithmetic works with a more manageable,\nlower-dimensional embedding space rather than directly preparing intervals in a\nhigh-dimensional weight space. Our model allows faster and more efficient\ntraining. Furthermore, \\our{} maintains the guarantee of not forgetting. At the\nend of training, we can choose one universal embedding to produce a single\nnetwork dedicated to all tasks. In such a framework, hypernetwork is used only\nfor training and, finally, we can utilize one set of weights. 
\\our{} obtains\nsignificantly better results than InterContiNet and gives SOTA results on\nseveral benchmarks.\n","authors":["Patryk Krukowski","Anna Bielawska","Kamil Książek","Paweł Wawrzyński","Paweł Batorski","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2405.15444v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09146v5","updated":"2024-09-02T14:38:01Z","published":"2024-02-14T12:55:28Z","title":"ResQuNNs:Towards Enabling Deep Learning in Quantum Convolution Neural\n Networks","summary":" In this paper, we present a novel framework for enhancing the performance of\nQuanvolutional Neural Networks (QuNNs) by introducing trainable quanvolutional\nlayers and addressing the critical challenges associated with them. Traditional\nquanvolutional layers, although beneficial for feature extraction, have largely\nbeen static, offering limited adaptability. Unlike state-of-the-art, our\nresearch overcomes this limitation by enabling training within these layers,\nsignificantly increasing the flexibility and potential of QuNNs. However, the\nintroduction of multiple trainable quanvolutional layers induces complexities\nin gradient-based optimization, primarily due to the difficulty in accessing\ngradients across these layers. To resolve this, we propose a novel\narchitecture, Residual Quanvolutional Neural Networks (ResQuNNs), leveraging\nthe concept of residual learning, which facilitates the flow of gradients by\nadding skip connections between layers. By inserting residual blocks between\nquanvolutional layers, we ensure enhanced gradient access throughout the\nnetwork, leading to improved training performance. Moreover, we provide\nempirical evidence on the strategic placement of these residual blocks within\nQuNNs. Through extensive experimentation, we identify an efficient\nconfiguration of residual blocks, which enables gradients across all the layers\nin the network that eventually results in efficient training. Our findings\nsuggest that the precise location of residual blocks plays a crucial role in\nmaximizing the performance gains in QuNNs. Our results mark a substantial step\nforward in the evolution of quantum deep learning, offering new avenues for\nboth theoretical development and practical quantum computing applications.\n","authors":["Muhammad Kashif","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2402.09146v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04896v2","updated":"2024-09-02T13:55:25Z","published":"2024-06-07T12:43:17Z","title":"Stabilizing Extreme Q-learning by Maclaurin Expansion","summary":" In offline reinforcement learning, in-sample learning methods have been\nwidely used to prevent performance degradation caused by evaluating\nout-of-distribution actions from the dataset. Extreme Q-learning (XQL) employs\na loss function based on the assumption that Bellman error follows a Gumbel\ndistribution, enabling it to model the soft optimal value function in an\nin-sample manner. It has demonstrated strong performance in both offline and\nonline reinforcement learning settings. However, issues remain, such as the\ninstability caused by the exponential term in the loss function and the risk of\nthe error distribution deviating from the Gumbel distribution. Therefore, we\npropose Maclaurin Expanded Extreme Q-learning to enhance stability. In this\nmethod, applying Maclaurin expansion to the loss function in XQL enhances\nstability against large errors. 
This approach involves adjusting the modeled\nvalue function between the value function under the behavior policy and the\nsoft optimal value function, thus achieving a trade-off between stability and\noptimality depending on the order of expansion. It also enables adjustment of\nthe error distribution assumption from a normal distribution to a Gumbel\ndistribution. Our method significantly stabilizes learning in online RL tasks\nfrom DM Control, where XQL was previously unstable. Additionally, it improves\nperformance in several offline RL tasks from D4RL.\n","authors":["Motoki Omura","Takayuki Osa","Yusuke Mukuta","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2406.04896v2.pdf","comment":"Accepted at RLC 2024: The first Reinforcement Learning Conference"},{"id":"http://arxiv.org/abs/2408.16772v2","updated":"2024-09-02T13:19:40Z","published":"2024-08-14T17:19:56Z","title":"An Effective Information Theoretic Framework for Channel Pruning","summary":" Channel pruning is a promising method for accelerating and compressing\nconvolutional neural networks. However, current pruning algorithms still leave\ntwo problems unsolved: how to assign layer-wise pruning ratios properly, and\nhow to discard the least important channels with a convincing criterion. In this\npaper, we present a novel channel pruning approach via information theory and\ninterpretability of neural networks. Specifically, we regard information\nentropy as the expected amount of information for convolutional layers. In\naddition, if we view a matrix as a system of linear equations, a higher-rank\nmatrix means there exist more solutions to it, which indicates more\nuncertainty. From the point of view of information theory, the rank can also\ndescribe the amount of information. In a neural network, considering the rank\nand entropy as two information indicators of convolutional layers, we propose a\nfusion function to reach a compromise between them, where the fusion results are\ndefined as ``information concentration''. When pre-defining layer-wise pruning\nratios, we employ the information concentration as a reference instead of\nheuristic and engineering tuning to provide a more interpretable solution.\nMoreover, we leverage Shapley values, which are a potent tool in the\ninterpretability of neural networks, to evaluate the channel contributions and\ndiscard the least important channels for model compression while maintaining\nits performance. Extensive experiments demonstrate the effectiveness and\npromising performance of our method. For example, our method improves the\naccuracy by 0.21% when reducing 45.5% FLOPs and removing 40.3% parameters for\nResNet-56 on CIFAR-10. Moreover, our method incurs losses in Top-1/Top-5\naccuracies of 0.43%/0.11% when reducing 41.6% FLOPs and removing 35.0% parameters\nfor ResNet-50 on ImageNet.\n","authors":["Yihao Chen","Zefang Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14966v2","updated":"2024-09-02T12:55:04Z","published":"2024-04-23T12:20:27Z","title":"Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State\n Space Model","summary":" Existing Transformer-based models for point cloud analysis suffer from\nquadratic complexity, leading to compromised point cloud resolution and\ninformation loss. In contrast, the newly proposed Mamba model, based on state\nspace models (SSM), outperforms Transformers in multiple areas with only linear\ncomplexity. 
However, the straightforward adoption of Mamba does not achieve\nsatisfactory performance on point cloud tasks. In this work, we present\nMamba3D, a state space model tailored for point cloud learning to enhance local\nfeature extraction, achieving superior performance, high efficiency, and\nscalability potential. Specifically, we propose a simple yet effective Local\nNorm Pooling (LNP) block to extract local geometric features. Additionally, to\nobtain better global features, we introduce a bidirectional SSM (bi-SSM) with\nboth a token forward SSM and a novel backward SSM that operates on the feature\nchannel. Extensive experimental results show that Mamba3D surpasses\nTransformer-based counterparts and concurrent works in multiple tasks, with or\nwithout pre-training. Notably, Mamba3D achieves multiple SoTA, including an\noverall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1%\n(with single-modal pre-training) on the ModelNet40 classification task, with\nonly linear complexity. Our code and weights are available at\nhttps://github.com/xhanxu/Mamba3D.\n","authors":["Xu Han","Yuan Tang","Zhaoxuan Wang","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2404.14966v2.pdf","comment":"ACM MM 2024. Code and weights are available at\n https://github.com/xhanxu/Mamba3D"},{"id":"http://arxiv.org/abs/2208.14153v6","updated":"2024-09-02T12:44:58Z","published":"2022-08-30T11:12:59Z","title":"Identifying Weight-Variant Latent Causal Models","summary":" The task of causal representation learning aims to uncover latent\nhigher-level causal representations that affect lower-level observations.\nIdentifying true latent causal representations from observed data, while\nallowing instantaneous causal relations among latent variables, remains a\nchallenge, however. To this end, we start from the analysis of three intrinsic\nproperties in identifying latent space from observations: transitivity,\npermutation indeterminacy, and scaling indeterminacy. We find that transitivity\nacts as a key role in impeding the identifiability of latent causal\nrepresentations. To address the unidentifiable issue due to transitivity, we\nintroduce a novel identifiability condition where the underlying latent causal\nmodel satisfies a linear-Gaussian model, in which the causal coefficients and\nthe distribution of Gaussian noise are modulated by an additional observed\nvariable. Under some mild assumptions, we can show that the latent causal\nrepresentations can be identified up to trivial permutation and scaling.\nFurthermore, based on this theoretical result, we propose a novel method,\ntermed Structural caUsAl Variational autoEncoder, which directly learns latent\ncausal representations and causal relationships among them, together with the\nmapping from the latent causal variables to the observed ones. We show that the\nproposed method learns the true parameters asymptotically. 
Experimental results\non synthetic and real data demonstrate the identifiability and consistency\nresults and the efficacy of the proposed method in learning latent causal\nrepresentations.\n","authors":["Yuhang Liu","Zhen Zhang","Dong Gong","Mingming Gong","Biwei Huang","Anton van den Hengel","Kun Zhang","Javen Qinfeng Shi"],"pdf_url":"https://arxiv.org/pdf/2208.14153v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12801v2","updated":"2024-09-02T12:37:27Z","published":"2024-01-11T00:45:33Z","title":"Deep Learning-based Target-To-User Association in Integrated Sensing and\n Communication Systems","summary":" In Integrated Sensing and Communication (ISAC) systems, matching the radar\ntargets with communication user equipments (UEs) is functional to several\ncommunication tasks, such as proactive handover and beam prediction. In this\npaper, we consider a radar-assisted communication system where a base station\n(BS) is equipped with a multiple-input-multiple-output (MIMO) radar that has a\ndouble aim: (i) associate vehicular radar targets to vehicular equipments (VEs)\nin the communication beamspace and (ii) predict the beamforming vector for each\nVE from radar data. The proposed target-to-user (T2U) association consists of\ntwo stages. First, vehicular radar targets are detected from range-angle\nimages, and, for each, a beamforming vector is estimated. Then, the inferred\nper-target beamforming vectors are matched with the ones utilized at the BS for\ncommunication to perform target-to-user (T2U) association. Joint multi-target\ndetection and beam inference is obtained by modifying the you only look once\n(YOLO) model, which is trained over simulated range-angle radar images.\nSimulation results over different urban vehicular mobility scenarios show that\nthe proposed T2U method provides a probability of correct association that\nincreases with the size of the BS antenna array, highlighting the respective\nincrease of the separability of the VEs in the beamspace. Moreover, we show\nthat the modified YOLO architecture can effectively perform both beam\nprediction and radar target detection, with similar performance in mean average\nprecision on the latter over different antenna array sizes.\n","authors":["Lorenzo Cazzella","Marouan Mizmizi","Dario Tagliaferri","Damiano Badini","Matteo Matteucci","Umberto Spagnolini"],"pdf_url":"https://arxiv.org/pdf/2401.12801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00040v2","updated":"2024-09-02T12:00:23Z","published":"2024-07-31T14:06:18Z","title":"Barlow Twins Deep Neural Network for Advanced 1D Drug-Target Interaction\n Prediction","summary":" Accurate prediction of drug-target interactions is critical for advancing\ndrug discovery. By reducing time and cost, machine learning and deep learning\ncan accelerate this laborious discovery process. In a novel approach,\nBarlowDTI, we utilise the powerful Barlow Twins architecture for\nfeature-extraction while considering the structure of the target protein. Our\nmethod achieves state-of-the-art predictive performance against multiple\nestablished benchmarks using only one-dimensional input. The use of gradient\nboosting machine as the underlying predictor ensures fast and efficient\npredictions without the need for substantial computational resources. We also\ninvestigate how the model reaches its decision based on individual training\nsamples. 
By comparing co-crystal structures, we find that BarlowDTI effectively\nexploits catalytically active and stabilising residues, highlighting the\nmodel's ability to generalise from one-dimensional input data. In addition, we\nfurther benchmark new baselines against existing methods. Together, these\ninnovations improve the efficiency and effectiveness of drug-target interaction\npredictions, providing robust tools for accelerating drug development and\ndeepening the understanding of molecular interactions. Therefore, we provide an\neasy-to-use web interface that can be freely accessed at\nhttps://www.bio.nat.tum.de/oc2/barlowdti .\n","authors":["Maximilian G. Schuh","Davide Boldini","Annkathrin I. Bohne","Stephan A. Sieber"],"pdf_url":"https://arxiv.org/pdf/2408.00040v2.pdf","comment":"Refined model architecture, additional results added"},{"id":"http://arxiv.org/abs/2404.12979v2","updated":"2024-09-02T11:52:47Z","published":"2024-04-19T16:09:17Z","title":"TRNet: Two-level Refinement Network leveraging Speech Enhancement for\n Noise Robust Speech Emotion Recognition","summary":" One persistent challenge in Speech Emotion Recognition (SER) is the\nubiquitous environmental noise, which frequently results in deteriorating SER\nperformance in practice. In this paper, we introduce a Two-level Refinement\nNetwork, dubbed TRNet, to address this challenge. Specifically, a pre-trained\nspeech enhancement module is employed for front-end noise reduction and noise\nlevel estimation. Later, we utilize clean speech spectrograms and their\ncorresponding deep representations as reference signals to refine the\nspectrogram distortion and representation shift of enhanced speech during model\ntraining. Experimental results validate that the proposed TRNet substantially\npromotes the robustness of the proposed system in both matched and unmatched\nnoisy environments, without compromising its performance in noise-free\nenvironments.\n","authors":["Chengxin Chen","Pengyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12979v2.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.01001v5","updated":"2024-09-02T11:31:16Z","published":"2023-05-31T05:04:50Z","title":"DiffLoad: Uncertainty Quantification in Electrical Load Forecasting with\n the Diffusion Model","summary":" Electrical load forecasting plays a crucial role in decision-making for power\nsystems, including unit commitment and economic dispatch. The integration of\nrenewable energy sources and the occurrence of external events, such as the\nCOVID-19 pandemic, have rapidly increased uncertainties in load forecasting.\nThe uncertainties in load forecasting can be divided into two types: epistemic\nuncertainty and aleatoric uncertainty. Separating these types of uncertainties\ncan help decision-makers better understand where and to what extent the\nuncertainty is, thereby enhancing their confidence in the following\ndecision-making. This paper proposes a diffusion-based Seq2Seq structure to\nestimate epistemic uncertainty and employs the robust additive Cauchy\ndistribution to estimate aleatoric uncertainty. Our method not only ensures the\naccuracy of load forecasting but also demonstrates the ability to separate the\ntwo types of uncertainties and be applicable to different levels of loads. 
The\nrelevant code can be found at\n\\url{https://anonymous.4open.science/r/DiffLoad-4714/}.\n","authors":["Zhixian Wang","Qingsong Wen","Chaoli Zhang","Liang Sun","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01001v5.pdf","comment":"Accepted by IEEE Transactions on Power Systems, 2024"},{"id":"http://arxiv.org/abs/2408.16286v2","updated":"2024-09-02T10:56:20Z","published":"2024-08-29T06:37:16Z","title":"Near-Optimal Policy Identification in Robust Constrained Markov Decision\n Processes via Epigraph Form","summary":" Designing a safe policy for uncertain environments is crucial in real-world\ncontrol applications. However, this challenge remains inadequately addressed\nwithin the Markov decision process (MDP) framework. This paper presents the\nfirst algorithm capable of identifying a near-optimal policy in a robust\nconstrained MDP (RCMDP), where an optimal policy minimizes cumulative cost\nwhile satisfying constraints in the worst-case scenario across a set of\nenvironments. We first prove that the conventional Lagrangian max-min\nformulation with policy gradient methods can become trapped in suboptimal\nsolutions by encountering a sum of conflicting gradients from the objective and\nconstraint functions during its inner minimization problem. To address this, we\nleverage the epigraph form of the RCMDP problem, which resolves the conflict by\nselecting a single gradient from either the objective or the constraints.\nBuilding on the epigraph form, we propose a binary search algorithm with a\npolicy gradient subroutine and prove that it identifies an\n$\\varepsilon$-optimal policy in an RCMDP with\n$\\tilde{\\mathcal{O}}(\\varepsilon^{-4})$ policy evaluations.\n","authors":["Toshinori Kitamura","Tadashi Kozuno","Wataru Kumagai","Kenta Hoshino","Yohei Hosoe","Kazumi Kasaura","Masashi Hamaya","Paavo Parmas","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2408.16286v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00760v2","updated":"2024-09-02T10:55:37Z","published":"2024-02-01T16:50:41Z","title":"EuroPED-NN: Uncertainty aware surrogate model","summary":" This work successfully generates an uncertainty-aware surrogate model of the\nEuroPED plasma pedestal model using the Bayesian neural network with noise\ncontrastive prior (BNN-NCP) technique. This model is trained using data from\nthe JET-ILW pedestal database and subsequent model evaluations, conforming to\nEuroPED-NN. The BNN-NCP technique has been proven to be a suitable method for\ngenerating uncertainty-aware surrogate models. It matches the output results of\na regular neural network while providing confidence estimates for predictions\nas uncertainties. Additionally, it highlights out-of-distribution (OOD) regions\nusing surrogate model uncertainties. This provides critical insights into model\nrobustness and reliability. EuroPED-NN has been physically validated, first,\nanalyzing electron density $n_e\\!\\left(\\psi_{\\text{pol}}=0.94\\right)$ with\nrespect to increasing plasma current, $I_p$, and second, validating the\n$\\Delta-\\beta_{p,ped}$ relation associated with the EuroPED model. This affirms\nthe robustness of the underlying physics learned by the surrogate model. On top\nof that, the method was used to develop a EuroPED-like model fed with\nexperimental data, i.e. an uncertainty aware experimental model, which is\nfunctional in JET database. Both models have been also tested in $\\sim 50$ AUG\nshots.\n","authors":["A. Panera Alvarez","A. Ho","A. Jarvinen","S. Saarelma","S. 
Wiesen","JET Contributors","the AUG team"],"pdf_url":"https://arxiv.org/pdf/2402.00760v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15398v4","updated":"2024-09-02T10:35:42Z","published":"2023-07-28T08:48:32Z","title":"The Initial Screening Order Problem","summary":" We investigate the role of the initial screening order (ISO) in candidate\nscreening tasks, such as employee hiring and academic admissions, in which a\nscreener is tasked with selecting $k$ candidates from a candidate pool. The ISO\nrefers to the order in which the screener searches the candidate pool. Today,\nit is common for the ISO to be the product of an information access system,\nsuch as an online platform or a database query. The ISO has been largely\noverlooked in the literature, despite its potential impact on the optimality\nand fairness of the chosen $k$ candidates, especially under a human screener.\nWe define two problem formulations describing the search behavior of the\nscreener under the ISO: the best-$k$, where the screener selects the $k$ best\ncandidates; and the good-$k$, where the screener selects the $k$ first\ngood-enough candidates. To study the impact of the ISO, we introduce a\nhuman-like screener and compare it to its algorithmic counterpart, where the\nhuman-like screener is conceived to be inconsistent over time due to fatigue.\nIn particular, our analysis shows that the ISO, under a human-like screener\nsolving for the good-$k$ problem, hinders individual fairness despite meeting\ngroup level fairness, and hampers the optimality of the selected $k$\ncandidates. This is due to position bias, where a candidate's evaluation is\naffected by its position within the ISO. We report extensive simulated\nexperiments exploring the parameters of the best-$k$ and good-$k$ problems for\nthe algorithmic and human-like screeners. The simulation framework is flexible\nenough to account for multiple screening settings, being an alternative to\nrunning real-world candidate screening procedures. This work is motivated by a\nreal-world candidate screening problem studied in collaboration with an\nEuropean company.\n","authors":["Jose M. Alvarez","Antonio Mastropietro","Salvatore Ruggieri"],"pdf_url":"https://arxiv.org/pdf/2307.15398v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16845v2","updated":"2024-09-02T10:33:48Z","published":"2024-08-29T18:21:50Z","title":"Enabling Local Editing in Diffusion Models by Joint and Individual\n Component Analysis","summary":" Recent advances in Diffusion Models (DMs) have led to significant progress in\nvisual synthesis and editing tasks, establishing them as a strong competitor to\nGenerative Adversarial Networks (GANs). However, the latent space of DMs is not\nas well understood as that of GANs. Recent research has focused on unsupervised\nsemantic discovery in the latent space of DMs by leveraging the bottleneck\nlayer of the denoising network, which has been shown to exhibit properties of a\nsemantic latent space. However, these approaches are limited to discovering\nglobal attributes. In this paper we address, the challenge of local image\nmanipulation in DMs and introduce an unsupervised method to factorize the\nlatent semantics learned by the denoising network of pre-trained DMs. Given an\narbitrary image and defined regions of interest, we utilize the Jacobian of the\ndenoising network to establish a relation between the regions of interest and\ntheir corresponding subspaces in the latent space. 
Furthermore, we disentangle\nthe joint and individual components of these subspaces to identify latent\ndirections that enable local image manipulation. Once discovered, these\ndirections can be applied to different images to produce semantically\nconsistent edits, making our method suitable for practical applications.\nExperimental results on various datasets demonstrate that our method can\nproduce semantic edits that are more localized and have better fidelity\ncompared to the state-of-the-art.\n","authors":["Theodoros Kouzelis","Manos Plitsis","Mihalis A. Nicolaou","Yannis Panagakis"],"pdf_url":"https://arxiv.org/pdf/2408.16845v2.pdf","comment":"Accepted at BMVC2024"},{"id":"http://arxiv.org/abs/2307.15438v3","updated":"2024-09-02T10:23:41Z","published":"2023-07-28T09:40:19Z","title":"Autonomous Payload Thermal Control","summary":" In small satellites there is less room for heat control equipment, scientific\ninstruments, and electronic components. Furthermore, the near proximity of\nelectronic components makes power dissipation difficult, with the risk of not\nbeing able to control the temperature appropriately, reducing component\nlifetime and mission performance. To address this challenge, taking advantage\nof the advent of increasing intelligence on board satellites, an autonomous\nthermal control tool that uses deep reinforcement learning is proposed for\nlearning the thermal control policy onboard. The tool was evaluated in a real\nspace edge processing computer that will be used in a demonstration payload\nhosted in the International Space Station (ISS). The experiment results show\nthat the proposed framework is able to learn to control the payload processing\npower to maintain the temperature under operational ranges, complementing\ntraditional thermal control systems.\n","authors":["Alejandro D. Mousist"],"pdf_url":"https://arxiv.org/pdf/2307.15438v3.pdf","comment":"To be included in the proceedings of ESA's SPAICE conference at\n ECSAT, UK, 2024"},{"id":"http://arxiv.org/abs/2306.16838v6","updated":"2024-09-02T09:54:53Z","published":"2023-06-29T10:29:29Z","title":"Fast Robust Kernel Regression through Sign Gradient Descent with Early\n Stopping","summary":" Kernel ridge regression, KRR, is a generalization of linear ridge regression\nthat is non-linear in the data, but linear in the model parameters. Here, we\nintroduce an equivalent formulation of the objective function of KRR, which\nopens up both for replacing the ridge penalty with the $\\ell_\\infty$ and\n$\\ell_1$ penalties and for studying kernel ridge regression from the\nperspective of gradient descent.\n Using the $\\ell_\\infty$ and $\\ell_1$ penalties, we obtain robust and sparse\nkernel regression, respectively. We further study the similarities between\nexplicitly regularized kernel regression and the solutions obtained by early\nstopping of iterative gradient-based methods, where we connect $\\ell_\\infty$\nregularization to sign gradient descent, $\\ell_1$ regularization to forward\nstagewise regression (also known as coordinate descent), and $\\ell_2$\nregularization to gradient descent, and, in the last case, theoretically bound\nfor the differences. 
We exploit the close relations between $\\ell_\\infty$\nregularization and sign gradient descent, and between $\\ell_1$ regularization\nand coordinate descent to propose computationally efficient methods for robust\nand sparse kernel regression.\n We finally compare robust kernel regression through sign gradient descent to\nexisting methods for robust kernel regression on five real data sets,\ndemonstrating that our method is one to two orders of magnitude faster, without\ncompromising accuracy.\n","authors":["Oskar Allerbo"],"pdf_url":"https://arxiv.org/pdf/2306.16838v6.pdf","comment":"Article arXiv:2306.16838v1 has been updated and split into two\n articles: this article and arXiv:2311.01762. Thus, some of the content in\n arXiv:2306.16838v1 is not a part of arXiv:2306.16838v2, but of\n arXiv:2311.01762"},{"id":"http://arxiv.org/abs/2407.11876v2","updated":"2024-09-02T09:49:49Z","published":"2024-07-16T16:00:42Z","title":"Simplifying the Theory on Over-Smoothing","summary":" Graph convolutions have gained popularity due to their ability to efficiently\noperate on data with an irregular geometric structure. However, graph\nconvolutions cause over-smoothing, which refers to representations becoming\nmore similar with increased depth. However, many different definitions and\nintuitions currently coexist, leading to research efforts focusing on\nincompatible directions. This paper attempts to align these directions by\nshowing that over-smoothing is merely a special case of power iteration. This\ngreatly simplifies the existing theory on over-smoothing, making it more\naccessible. Based on the theory, we provide a novel comprehensive definition of\nrank collapse as a generalized form of over-smoothing and introduce the\nrank-one distance as a corresponding metric. Our empirical evaluation of 14\ncommonly used methods shows that more models than were previously known suffer\nfrom this issue.\n","authors":["Andreas Roth"],"pdf_url":"https://arxiv.org/pdf/2407.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07362v2","updated":"2024-09-02T08:28:44Z","published":"2024-08-14T08:19:23Z","title":"BadMerging: Backdoor Attacks Against Model Merging","summary":" Fine-tuning pre-trained models for downstream tasks has led to a\nproliferation of open-sourced task-specific models. Recently, Model Merging\n(MM) has emerged as an effective approach to facilitate knowledge transfer\namong these independently fine-tuned models. MM directly combines multiple\nfine-tuned task-specific models into a merged model without additional\ntraining, and the resulting model shows enhanced capabilities in multiple\ntasks. Although MM provides great utility, it may come with security risks\nbecause an adversary can exploit MM to affect multiple downstream tasks.\nHowever, the security risks of MM have barely been studied. In this paper, we\nfirst find that MM, as a new learning paradigm, introduces unique challenges\nfor existing backdoor attacks due to the merging process. To address these\nchallenges, we introduce BadMerging, the first backdoor attack specifically\ndesigned for MM. Notably, BadMerging allows an adversary to compromise the\nentire merged model by contributing as few as one backdoored task-specific\nmodel. BadMerging comprises a two-stage attack mechanism and a novel\nfeature-interpolation-based loss to enhance the robustness of embedded\nbackdoors against the changes of different merging parameters. 
Considering that\na merged model may incorporate tasks from different domains, BadMerging can\njointly compromise the tasks provided by the adversary (on-task attack) and\nother contributors (off-task attack) and solve the corresponding unique\nchallenges with novel attack designs. Extensive experiments show that\nBadMerging achieves remarkable attacks against various MM algorithms. Our\nablation study demonstrates that the proposed attack designs can progressively\ncontribute to the attack performance. Finally, we show that prior defense\nmechanisms fail to defend against our attacks, highlighting the need for more\nadvanced defenses.\n","authors":["Jinghuai Zhang","Jianfeng Chi","Zheng Li","Kunlin Cai","Yang Zhang","Yuan Tian"],"pdf_url":"https://arxiv.org/pdf/2408.07362v2.pdf","comment":"To appear in ACM Conference on Computer and Communications Security\n (CCS), 2024"},{"id":"http://arxiv.org/abs/2407.16237v2","updated":"2024-09-02T07:25:21Z","published":"2024-07-23T07:22:25Z","title":"OriGen: Enhancing RTL Code Generation with Code-to-Code Augmentation and\n Self-Reflection","summary":" Recent studies have demonstrated the significant potential of Large Language\nModels (LLMs) in generating Register Transfer Level (RTL) code, with notable\nadvancements showcased by commercial models such as GPT-4 and Claude3-Opus.\nHowever, these proprietary LLMs often raise concerns regarding privacy and\nsecurity. While open-source LLMs offer solutions to these concerns, they\ntypically underperform commercial models in RTL code generation tasks,\nprimarily due to the scarcity of high-quality open-source RTL datasets. To\naddress this challenge, we introduce OriGen, a fully open-source framework\nthat incorporates self-reflection capabilities and a novel dataset augmentation\nmethodology for generating high-quality, large-scale RTL code. Our approach\nemploys a code-to-code augmentation technique to enhance the quality of\nopen-source RTL code datasets. Furthermore, OriGen can rectify syntactic errors\nthrough a self-reflection process that leverages compiler feedback.\nExperimental results demonstrate that OriGen significantly outperforms other\nopen-source alternatives in RTL code generation. It surpasses the previous\nbest-performing open-source LLM by 12.8% and even exceeds GPT-4 Turbo in the\npass@1 metric on the VerilogEval-Human benchmark. Moreover, OriGen exhibits\nsuperior capabilities in self-reflection and error correction, outperforming\nGPT-4 by 19.9% on a benchmark designed to evaluate self-reflection\ncapabilities.\n","authors":["Fan Cui","Chenyang Yin","Kexing Zhou","Youwei Xiao","Guangyu Sun","Qiang Xu","Qipeng Guo","Demin Song","Dahua Lin","Xingcheng Zhang"," Yun"," Liang"],"pdf_url":"https://arxiv.org/pdf/2407.16237v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15434v3","updated":"2024-09-02T07:18:16Z","published":"2024-05-24T11:02:55Z","title":"Biometrics and Behavior Analysis for Detecting Distractions in\n e-Learning","summary":" In this article, we explore computer vision approaches to detect abnormal\nhead pose during e-learning sessions and we introduce a study on the effects of\nmobile phone usage during these sessions. We utilize behavioral data collected\nfrom 120 learners monitored while participating in MOOC learning sessions.\nOur study focuses on the influence of phone-usage events on behavior and\nphysiological responses, specifically attention, heart rate, and meditation,\nbefore, during, and after phone usage. 
Additionally, we propose an approach for\nestimating head pose events using images taken by the webcam during the MOOC\nlearning sessions to detect phone-usage events. Our hypothesis suggests that\nhead posture undergoes significant changes when learners interact with a mobile\nphone, contrasting with the typical behavior seen when learners face a computer\nduring e-learning sessions. We propose an approach designed to detect\ndeviations in head posture from the average observed during a learner's\nsession, operating as a semi-supervised method. This system flags events\nindicating alterations in head posture for subsequent human review and\nselection of mobile phone usage occurrences with a sensitivity over 90%.\n","authors":["Álvaro Becerra","Javier Irigoyen","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez","Mutlu Cukurova"],"pdf_url":"https://arxiv.org/pdf/2405.15434v3.pdf","comment":"Published in IEEE Intl. Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2405.20091v4","updated":"2024-09-02T07:15:02Z","published":"2024-05-30T14:27:40Z","title":"VAAD: Visual Attention Analysis Dashboard applied to e-Learning","summary":" In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v4.pdf","comment":"Published in IEEE Intl. Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2208.10230v4","updated":"2024-09-02T07:10:37Z","published":"2022-08-19T14:55:12Z","title":"From Static to Dynamic Structures: Improving Binding Affinity Prediction\n with Graph-Based Deep Learning","summary":" Accurate prediction of protein-ligand binding affinities is an essential\nchallenge in structure-based drug design. Despite recent advances in\ndata-driven methods for affinity prediction, their accuracy is still limited,\npartially because they only take advantage of static crystal structures while\nthe actual binding affinities are generally determined by the thermodynamic\nensembles between proteins and ligands. One effective way to approximate such a\nthermodynamic ensemble is to use molecular dynamics (MD) simulation. Here, an\nMD dataset containing 3,218 different protein-ligand complexes is curated, and\nDynaformer, a graph-based deep learning model is further developed to predict\nthe binding affinities by learning the geometric characteristics of the\nprotein-ligand interactions from the MD trajectories. 
In silico experiments\ndemonstrated that the model exhibits state-of-the-art scoring and ranking power\non the CASF-2016 benchmark dataset, outperforming the methods hitherto\nreported. Moreover, in a virtual screening on heat shock protein 90 (HSP90)\nusing Dynaformer, 20 candidates are identified and their binding affinities are\nfurther experimentally validated. Dynaformer displayed promising results in\nvirtual drug screening, revealing 12 hit compounds (two are in the\nsubmicromolar range), including several novel scaffolds. Overall, these results\ndemonstrated that the approach offer a promising avenue for accelerating the\nearly drug discovery process.\n","authors":["Yaosen Min","Ye Wei","Peizhuo Wang","Xiaoting Wang","Han Li","Nian Wu","Stefan Bauer","Shuxin Zheng","Yu Shi","Yingheng Wang","Ji Wu","Dan Zhao","Jianyang Zeng"],"pdf_url":"https://arxiv.org/pdf/2208.10230v4.pdf","comment":"Update the content according to the published version on Advanced\n Science (https://doi.org/10.1002/advs.202405404)"},{"id":"http://arxiv.org/abs/2407.00710v2","updated":"2024-09-02T07:01:31Z","published":"2024-06-30T14:21:32Z","title":"Directly Handling Missing Data in Linear Discriminant Analysis for\n Enhancing Classification Accuracy and Interpretability","summary":" As the adoption of Artificial Intelligence (AI) models expands into critical\nreal-world applications, ensuring the explainability of these models becomes\nparamount, particularly in sensitive fields such as medicine and finance.\nLinear Discriminant Analysis (LDA) remains a popular choice for classification\ndue to its interpretable nature, derived from its capacity to model class\ndistributions and enhance class separation through linear combinations of\nfeatures. However, real-world datasets often suffer from incomplete data,\nposing substantial challenges for both classification accuracy and model\ninterpretability. In this paper, we introduce a novel and robust classification\nmethod, termed Weighted missing Linear Discriminant Analysis (WLDA), which\nextends LDA to handle datasets with missing values without the need for\nimputation. Our approach innovatively incorporates a weight matrix that\npenalizes missing entries, thereby refining parameter estimation directly on\nincomplete data. This methodology not only preserves the interpretability of\nLDA but also significantly enhances classification performance in scenarios\nplagued by missing data. We conduct an in-depth theoretical analysis to\nestablish the properties of WLDA and thoroughly evaluate its explainability.\nExperimental results across various datasets demonstrate that WLDA consistently\noutperforms traditional methods, especially in challenging environments where\nmissing values are prevalent in both training and test datasets. This\nadvancement provides a critical tool for improving classification accuracy and\nmaintaining model transparency in the face of incomplete data.\n","authors":["Tuan L. Vo","Uyen Dang","Thu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.00710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03963v3","updated":"2024-09-02T06:51:36Z","published":"2024-05-07T02:49:59Z","title":"ERATTA: Extreme RAG for Table To Answers with Large Language Models","summary":" Large language models (LLMs) with retrieval augmented-generation (RAG) have\nbeen the optimal choice for scalable generative AI solutions in the recent\npast. 
Although RAG implemented with AI agents (agentic-RAG) has been recently\npopularized, it suffers from unstable costs and unreliable performance for\nEnterprise-level data practices. Most existing use-cases that incorporate RAG\nwith LLMs have been either generic or extremely domain specific, thereby\nquestioning the scalability and generalizability of RAG-LLM approaches. In this\nwork, we propose a unique LLM-based system where multiple LLMs can be invoked\nto enable data authentication, user-query routing, data-retrieval and custom\nprompting for question-answering capabilities from Enterprise-data tables. The\nsource tables here are highly fluctuating and large in size and the proposed\nframework enables structured responses in under 10 seconds per query.\nAdditionally, we propose a five-metric scoring module that detects and reports\nhallucinations in the LLM responses. Our proposed system and scoring metrics\nachieve >90% confidence scores across hundreds of user queries in the\nsustainability, financial health and social media domains. Extensions to the\nproposed extreme RAG architectures can enable heterogeneous source querying\nusing LLMs.\n","authors":["Sohini Roychowdhury","Marko Krema","Anvar Mahammad","Brian Moore","Arijit Mukherjee","Punit Prakashchandra"],"pdf_url":"https://arxiv.org/pdf/2405.03963v3.pdf","comment":"5 pages, 4 tables, IEEE Big Data, 2024"},{"id":"http://arxiv.org/abs/2408.17011v2","updated":"2024-09-02T06:31:48Z","published":"2024-08-30T04:51:19Z","title":"Disease Classification and Impact of Pretrained Deep Convolution Neural\n Networks on Diverse Medical Imaging Datasets across Imaging Modalities","summary":" Imaging techniques such as Chest X-rays, whole slide images, and optical\ncoherence tomography serve as the initial screening and detection for a wide\nvariety of medical pulmonary and ophthalmic conditions respectively. This paper\ninvestigates the intricacies of using pretrained deep convolutional neural\nnetworks with transfer learning across diverse medical imaging datasets with\nvarying modalities for binary and multiclass classification. We conducted a\ncomprehensive performance analysis with ten network architectures and model\nfamilies each with pretraining and random initialization. Our findings showed\nthat the use of pretrained models as fixed feature extractors yields poor\nperformance irrespective of the datasets. In contrast, histopathology microscopy\nwhole slide images yield better performance. It is also found that deeper and\nmore complex architectures did not necessarily result in the best performance.\nThis observation implies that the improvements on ImageNet are not parallel to\nthe medical imaging tasks. Within a medical domain, the performance of the\nnetwork architectures varies within model families with shifts in datasets.\nThis indicates that the performance of models within a specific modality may\nnot be conclusive for another modality within the same domain. 
This study\nprovides a deeper understanding of the applications of deep learning techniques\nin medical imaging and highlights the impact of pretrained networks across\ndifferent medical imaging datasets under five different experimental settings.\n","authors":["Jutika Borah","Kumaresh Sarmah","Hidam Kumarjit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.17011v2.pdf","comment":"15 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.07510v5","updated":"2024-09-02T06:27:05Z","published":"2024-05-13T07:10:53Z","title":"PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator","summary":" We present Piecewise Rectified Flow (PeRFlow), a flow-based method for\naccelerating diffusion models. PeRFlow divides the sampling process of\ngenerative flows into several time windows and straightens the trajectories in\neach interval via the reflow operation, thereby approaching piecewise linear\nflows. PeRFlow achieves superior performance in a few-step generation.\nMoreover, through dedicated parameterizations, the PeRFlow models inherit\nknowledge from the pretrained diffusion models. Thus, the training converges\nfast and the obtained models show advantageous transfer ability, serving as\nuniversal plug-and-play accelerators that are compatible with various workflows\nbased on the pre-trained diffusion models. Codes for training and inference are\npublicly released. https://github.com/magic-research/piecewise-rectified-flow\n","authors":["Hanshu Yan","Xingchao Liu","Jiachun Pan","Jun Hao Liew","Qiang Liu","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2405.07510v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17064v2","updated":"2024-09-02T06:25:09Z","published":"2024-08-30T07:49:35Z","title":"Instant Adversarial Purification with Adversarial Consistency\n Distillation","summary":" Neural networks, despite their remarkable performance in widespread\napplications, including image classification, are also known to be vulnerable\nto subtle adversarial noise. Although some diffusion-based purification methods\nhave been proposed, for example, DiffPure, those methods are time-consuming. In\nthis paper, we propose One Step Control Purification (OSCP), a diffusion-based\npurification model that can purify the adversarial image in one Neural Function\nEvaluation (NFE) in diffusion models. We use Latent Consistency Model (LCM) and\nControlNet for our one-step purification. OSCP is computationally friendly and\ntime efficient compared to other diffusion-based purification methods; we\nachieve defense success rate of 74.19\\% on ImageNet, only requiring 0.1s for\neach purification. Moreover, there is a fundamental incongruence between\nconsistency distillation and adversarial perturbation. To address this\nontological dissonance, we propose Gaussian Adversarial Noise Distillation\n(GAND), a novel consistency distillation framework that facilitates a more\nnuanced reconciliation of the latent space dynamics, effectively bridging the\nnatural and adversarial manifolds. 
Our experiments show that GAND does not\nneed a Full Fine Tune (FFT); PEFT, e.g., LoRA, is sufficient.\n","authors":["Chun Tong Lei","Hon Ming Yam","Zhongliang Guo","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.17064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14033v2","updated":"2024-09-02T05:55:06Z","published":"2024-08-26T05:55:48Z","title":"MLR-Copilot: Autonomous Machine Learning Research based on Large\n Language Models Agents","summary":" Machine learning research, crucial for technological advancements and\ninnovation, often faces significant challenges due to its inherent complexity,\nslow pace of experimentation, and the necessity for specialized expertise.\nMotivated by this, we present a new systematic framework, autonomous Machine\nLearning Research with large language models (MLR-Copilot), designed to enhance\nmachine learning research productivity through the automatic generation and\nimplementation of research ideas using Large Language Model (LLM) agents. The\nframework consists of three phases: research idea generation, experiment\nimplementation, and implementation execution. First, existing research papers\nare used to generate hypotheses and experimental plans via IdeaAgent powered by\nLLMs. Next, the implementation generation phase translates these plans into\nexecutables with ExperimentAgent. This phase leverages retrieved prototype code\nand optionally retrieves candidate models and data. Finally, the execution\nphase, also managed by ExperimentAgent, involves running experiments with\nmechanisms for human feedback and iterative debugging to enhance the likelihood\nof achieving executable research outcomes. We evaluate our framework on five\nmachine learning research tasks and the experimental results show the\nframework's potential to facilitate research progress and innovation.\n","authors":["Ruochen Li","Teerth Patel","Qingyun Wang","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2408.14033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10901v2","updated":"2024-09-02T05:25:06Z","published":"2024-08-20T14:43:53Z","title":"A Grey-box Attack against Latent Diffusion Model-based Image Editing by\n Posterior Collapse","summary":" Recent advancements in generative AI, particularly Latent Diffusion Models\n(LDMs), have revolutionized image synthesis and manipulation. However, these\ngenerative techniques raise concerns about data misappropriation and\nintellectual property infringement. Adversarial attacks on machine learning\nmodels have been extensively studied, and a well-established body of research\nhas extended these techniques as a benign metric to prevent the underlying\nmisuse of generative AI. Current approaches to safeguarding images from\nmanipulation by LDMs are limited by their reliance on model-specific knowledge\nand their inability to significantly degrade the semantic quality of generated\nimages. In response to these shortcomings, we propose the Posterior Collapse\nAttack (PCA) based on the observation that VAEs suffer from posterior collapse\nduring training. Our method minimizes dependence on the white-box information\nof target models to get rid of the implicit reliance on model-specific\nknowledge. 
By accessing merely a small amount of LDM parameters, in specific\nmerely the VAE encoder of LDMs, our method causes a substantial semantic\ncollapse in generation quality, particularly in perceptual consistency, and\ndemonstrates strong transferability across various model architectures.\nExperimental results show that PCA achieves superior perturbation effects on\nimage generation of LDMs with lower runtime and VRAM. Our method outperforms\nexisting techniques, offering a more robust and generalizable solution that is\nhelpful in alleviating the socio-technical challenges posed by the rapidly\nevolving landscape of generative AI.\n","authors":["Zhongliang Guo","Lei Fang","Jingyu Lin","Yifei Qian","Shuai Zhao","Zeyu Wang","Junhao Dong","Cunjian Chen","Ognjen Arandjelović","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.10901v2.pdf","comment":"21 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2312.03814v2","updated":"2024-09-02T05:18:49Z","published":"2023-12-06T18:29:23Z","title":"Pearl: A Production-ready Reinforcement Learning Agent","summary":" Reinforcement learning (RL) is a versatile framework for optimizing long-term\ngoals. Although many real-world problems can be formalized with RL, learning\nand deploying a performant RL policy requires a system designed to address\nseveral important challenges, including the exploration-exploitation dilemma,\npartial observability, dynamic action spaces, and safety concerns. While the\nimportance of these challenges has been well recognized, existing open-source\nRL libraries do not explicitly address them. This paper introduces Pearl, a\nProduction-Ready RL software package designed to embrace these challenges in a\nmodular way. In addition to presenting benchmarking results, we also highlight\nexamples of Pearl's ongoing industry adoption to demonstrate its advantages for\nproduction use cases. Pearl is open sourced on GitHub at\ngithub.com/facebookresearch/pearl and its official website is\npearlagent.github.io.\n","authors":["Zheqing Zhu","Rodrigo de Salvo Braz","Jalaj Bhandari","Daniel Jiang","Yi Wan","Yonathan Efroni","Liyuan Wang","Ruiyang Xu","Hongbo Guo","Alex Nikulkov","Dmytro Korenkevych","Urun Dogan","Frank Cheng","Zheng Wu","Wanqiao Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14759v3","updated":"2024-09-02T04:51:17Z","published":"2024-05-23T16:29:30Z","title":"Fault Tolerant ML: Efficient Meta-Aggregation and Synchronous Training","summary":" In this paper, we investigate the challenging framework of Byzantine-robust\ntraining in distributed machine learning (ML) systems, focusing on enhancing\nboth efficiency and practicality. As distributed ML systems become integral for\ncomplex ML tasks, ensuring resilience against Byzantine failures-where workers\nmay contribute incorrect updates due to malice or error-gains paramount\nimportance. Our first contribution is the introduction of the Centered Trimmed\nMeta Aggregator (CTMA), an efficient meta-aggregator that upgrades baseline\naggregators to optimal performance levels, while requiring low computational\ndemands. Additionally, we propose harnessing a recently developed gradient\nestimation technique based on a double-momentum strategy within the Byzantine\ncontext. Our paper highlights its theoretical and practical advantages for\nByzantine-robust training, especially in simplifying the tuning process and\nreducing the reliance on numerous hyperparameters. 
The effectiveness of this\ntechnique is supported by theoretical insights within the stochastic convex\noptimization (SCO) framework and corroborated by empirical evidence.\n","authors":["Tehila Dahan","Kfir Y. Levy"],"pdf_url":"https://arxiv.org/pdf/2405.14759v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14951v2","updated":"2024-09-02T04:13:50Z","published":"2024-06-21T08:03:25Z","title":"An Idiosyncrasy of Time-discretization in Reinforcement Learning","summary":" Many reinforcement learning algorithms are built on an assumption that an\nagent interacts with an environment over fixed-duration, discrete time steps.\nHowever, physical systems are continuous in time, requiring a choice of\ntime-discretization granularity when digitally controlling them. Furthermore,\nsuch systems do not wait for decisions to be made before advancing the\nenvironment state, necessitating the study of how the choice of discretization\nmay affect a reinforcement learning algorithm. In this work, we consider the\nrelationship between the definitions of the continuous-time and discrete-time\nreturns. Specifically, we acknowledge an idiosyncrasy with naively applying a\ndiscrete-time algorithm to a discretized continuous-time environment, and note\nhow a simple modification can better align the return definitions. This\nobservation is of practical consideration when dealing with environments where\ntime-discretization granularity is a choice, or situations where such\ngranularity is inherently stochastic.\n","authors":["Kris De Asis","Richard S. Sutton"],"pdf_url":"https://arxiv.org/pdf/2406.14951v2.pdf","comment":"RLC 2024"},{"id":"http://arxiv.org/abs/2312.15551v4","updated":"2024-09-02T03:26:58Z","published":"2023-12-24T21:46:14Z","title":"On the Benefits of Public Representations for Private Transfer Learning\n under Distribution Shift","summary":" Public pretraining is a promising approach to improve differentially private\nmodel training. However, recent work has noted that many positive research\nresults studying this paradigm only consider in-distribution tasks, and may not\napply to settings where there is distribution shift between the pretraining and\nfinetuning data -- a scenario that is likely when finetuning private tasks due\nto the sensitive nature of the data. In this work, we show empirically across\nthree tasks that even in settings with large distribution shift, where both\nzero-shot performance from public data and training from scratch with private\ndata give unusably weak results, public features can in fact improve private\ntraining accuracy by up to 67\\% over private training from scratch. We provide\na theoretical explanation for this phenomenon, showing that if the public and\nprivate data share a low-dimensional representation, public representations can\nimprove the sample complexity of private training even if it is impossible to\nlearn the private task from the public data alone. 
Altogether, our results\nprovide evidence that public data can indeed make private training practical in\nrealistic settings of extreme distribution shift.\n","authors":["Pratiksha Thaker","Amrith Setlur","Zhiwei Steven Wu","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2312.15551v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12816v4","updated":"2024-09-02T01:48:34Z","published":"2023-03-22T07:34:33Z","title":"From Wide to Deep: Dimension Lifting Network for Parameter-efficient\n Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) that maps entities and relations into vector\nrepresentations is essential for downstream applications. Conventional KGE\nmethods require high-dimensional representations to learn the complex structure\nof knowledge graph, but lead to oversized model parameters. Recent advances\nreduce parameters by low-dimensional entity representations, while developing\ntechniques (e.g., knowledge distillation or reinvented representation forms) to\ncompensate for reduced dimension. However, such operations introduce\ncomplicated computations and model designs that may not benefit large knowledge\ngraphs. To seek a simple strategy to improve the parameter efficiency of\nconventional KGE models, we take inspiration from that deeper neural networks\nrequire exponentially fewer parameters to achieve expressiveness comparable to\nwider networks for compositional structures. We view all entity representations\nas a single-layer embedding network, and conventional KGE methods that adopt\nhigh-dimensional entity representations equal widening the embedding network to\ngain expressiveness. To achieve parameter efficiency, we instead propose a\ndeeper embedding network for entity representations, i.e., a narrow entity\nembedding layer plus a multi-layer dimension lifting network (LiftNet).\nExperiments on three public datasets show that by integrating LiftNet, four\nconventional KGE methods with 16-dimensional representations achieve comparable\nlink prediction accuracy as original models that adopt 512-dimensional\nrepresentations, saving 68.4% to 96.9% parameters.\n","authors":["Borui Cai","Yong Xiang","Longxiang Gao","Di Wu","He Zhang","Jiong Jin","Tom Luan"],"pdf_url":"https://arxiv.org/pdf/2303.12816v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12530v2","updated":"2024-09-02T01:39:58Z","published":"2024-04-18T22:23:24Z","title":"TrajDeleter: Enabling Trajectory Forgetting in Offline Reinforcement\n Learning Agents","summary":" Reinforcement learning (RL) trains an agent from experiences interacting with\nthe environment. In scenarios where online interactions are impractical,\noffline RL, which trains the agent using pre-collected datasets, has become\npopular. While this new paradigm presents remarkable effectiveness across\nvarious real-world domains, like healthcare and energy management, there is a\ngrowing demand to enable agents to rapidly and completely eliminate the\ninfluence of specific trajectories from both the training dataset and the\ntrained agents. To meet this problem, this paper advocates Trajdeleter, the\nfirst practical approach to trajectory unlearning for offline RL agents. The\nkey idea of Trajdeleter is to guide the agent to demonstrate deteriorating\nperformance when it encounters states associated with unlearning trajectories.\nSimultaneously, it ensures the agent maintains its original performance level\nwhen facing other remaining trajectories. 
Additionally, we introduce\nTrajauditor, a simple yet efficient method to evaluate whether Trajdeleter\nsuccessfully eliminates the specific trajectories of influence from the offline\nRL agent. Extensive experiments conducted on six offline RL algorithms and\nthree tasks demonstrate that Trajdeleter requires only about 1.5% of the time\nneeded for retraining from scratch. It effectively unlearns an average of 94.8%\nof the targeted trajectories yet still performs well in actual environment\ninteractions after unlearning. The replication package and agent parameters are\navailable online.\n","authors":["Chen Gong","Kecen Li","Jin Yao","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12530v2.pdf","comment":"Accepted at NDSS 2025. The presented document here is the full\n version of our paper"},{"id":"http://arxiv.org/abs/2208.13273v2","updated":"2024-09-02T00:22:45Z","published":"2022-08-28T19:07:54Z","title":"Blending Neural Operators and Relaxation Methods in PDE Numerical\n Solvers","summary":" Neural networks suffer from spectral bias having difficulty in representing\nthe high frequency components of a function while relaxation methods can\nresolve high frequencies efficiently but stall at moderate to low frequencies.\nWe exploit the weaknesses of the two approaches by combining them\nsynergistically to develop a fast numerical solver of partial differential\nequations (PDEs) at scale. Specifically, we propose HINTS, a hybrid, iterative,\nnumerical, and transferable solver by integrating a Deep Operator Network\n(DeepONet) with standard relaxation methods, leading to parallel efficiency and\nalgorithmic scalability for a wide class of PDEs, not tractable with existing\nmonolithic solvers. HINTS balances the convergence behavior across the spectrum\nof eigenmodes by utilizing the spectral bias of DeepONet, resulting in a\nuniform convergence rate and hence exceptional performance of the hybrid solver\noverall. Moreover, HINTS applies to large-scale, multidimensional systems, it\nis flexible with regards to discretizations, computational domain, and boundary\nconditions.\n","authors":["Enrui Zhang","Adar Kahana","Alena Kopaničáková","Eli Turkel","Rishikesh Ranade","Jay Pathak","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2208.13273v2.pdf","comment":"Main text: 17 pages, 6 figures. Supplementary Information: 30 pages,\n 8 figures, 2 tables, 4 algorithms"}],"Multimedia":[{"id":"http://arxiv.org/abs/2207.12554v2","updated":"2024-09-02T22:49:21Z","published":"2022-07-25T22:17:19Z","title":"Inter-Frame Compression for Dynamic Point Cloud Geometry Coding","summary":" Efficient point cloud compression is essential for applications like virtual\nand mixed reality, autonomous driving, and cultural heritage. This paper\nproposes a deep learning-based inter-frame encoding scheme for dynamic point\ncloud geometry compression. We propose a lossy geometry compression scheme that\npredicts the latent representation of the current frame using the previous\nframe by employing a novel feature space inter-prediction network. The proposed\nnetwork utilizes sparse convolutions with hierarchical multiscale 3D feature\nlearning to encode the current frame using the previous frame. The proposed\nmethod introduces a novel predictor network for motion compensation in the\nfeature domain to map the latent representation of the previous frame to the\ncoordinates of the current frame to predict the current frame's feature\nembedding. 
The framework transmits the residual of the predicted features and\nthe actual features by compressing them using a learned probabilistic\nfactorized entropy model. At the receiver, the decoder hierarchically\nreconstructs the current frame by progressively rescaling the feature\nembedding. The proposed framework is compared to the state-of-the-art\nVideo-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud\nCompression (G-PCC) schemes standardized by the Moving Picture Experts Group\n(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta\nRate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against\nG-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame\nencoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based\ninter-frame encoding mode using HEVC. These significant performance gains are\ncross-checked and verified in the MPEG working group.\n","authors":["Anique Akhtar","Zhu Li","Geert Van der Auwera"],"pdf_url":"https://arxiv.org/pdf/2207.12554v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11593v2","updated":"2024-09-02T09:04:51Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. 
The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v2.pdf","comment":"Accepted by NCMMSC2024"},{"id":"http://arxiv.org/abs/2308.03024v3","updated":"2024-09-02T05:51:02Z","published":"2023-08-06T05:23:25Z","title":"Show Me the World in My Language: Establishing the First Baseline for\n Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually'' translating scene text from a\nsource language (e.g., Hindi) to a target language (e.g., English). Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe source scene text, such as font, size, and background. There are several\nchallenges associated with this task, such as translation with limited context,\ndeciding between translation and transliteration, accommodating varying text\nlengths within fixed spatial boundaries, and preserving the font and background\nstyles of the source scene text in the target language. To address this\nproblem, we make the following contributions: (i) We study visual translation\nas a standalone problem for the first time in the literature. (ii) We present a\ncascaded framework for visual translation that combines state-of-the-art\nmodules for scene text recognition, machine translation, and scene text\nsynthesis as a baseline for the task. (iii) We propose a set of task-specific\ndesign enhancements to design a variant of the baseline to obtain performance\nimprovements. (iv) Currently, the existing related literature lacks any\ncomprehensive performance evaluation for this novel task. To fill this gap, we\nintroduce several automatic and user-assisted evaluation metrics designed\nexplicitly for evaluating visual translation. Further, we evaluate presented\nbaselines for translating scene text between Hindi and English. Our experiments\ndemonstrate that although we can effectively perform visual translation over a\nlarge collection of scene text images, the presented baseline only partially\naddresses challenges posed by visual translation tasks. We firmly believe that\nthis new task and the limitations of existing models, as reported in this\npaper, should encourage further research in visual translation.\n","authors":["Shreyas Vaidya","Arvind Kumar Sharma","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v3.pdf","comment":"Accepted at ICPR 2024, Project Website:\n https://vl2g.github.io/projects/visTrans/"},{"id":"http://arxiv.org/abs/2409.01352v1","updated":"2024-09-02T16:11:12Z","published":"2024-09-02T16:11:12Z","title":"Spectron: Target Speaker Extraction using Conditional Transformer with\n Adversarial Refinement","summary":" Recently, attention-based transformers have become a de facto standard in\nmany deep learning applications including natural language processing, computer\nvision, signal processing, etc.. In this paper, we propose a transformer-based\nend-to-end model to extract a target speaker's speech from a monaural\nmulti-speaker mixed audio signal. 
Unlike existing speaker extraction methods,\nwe introduce two additional objectives to impose speaker embedding consistency\nand waveform encoder invertibility and jointly train both speaker encoder and\nspeech separator to better capture the speaker conditional embedding.\nFurthermore, we leverage a multi-scale discriminator to refine the perceptual\nquality of the extracted speech. Our experiments show that the use of a dual\npath transformer in the separator backbone along with proposed training\nparadigm improves the CNN baseline by $3.12$ dB points. Finally, we compare our\napproach with recent state-of-the-arts and show that our model outperforms\nexisting methods by $4.1$ dB points on an average without creating additional\ndata dependency.\n","authors":["Tathagata Bandyopadhyay"],"pdf_url":"https://arxiv.org/pdf/2409.01352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01029v1","updated":"2024-09-02T08:06:47Z","published":"2024-09-02T08:06:47Z","title":"Multi-Reference Generative Face Video Compression with Contrastive\n Learning","summary":" Generative face video coding (GFVC) has been demonstrated as a potential\napproach to low-latency, low bitrate video conferencing. GFVC frameworks\nachieve an extreme gain in coding efficiency with over 70% bitrate savings when\ncompared to conventional codecs at bitrates below 10kbps. In recent MPEG/JVET\nstandardization efforts, all the information required to reconstruct video\nsequences using GFVC frameworks are adopted as part of the supplemental\nenhancement information (SEI) in existing compression pipelines. In light of\nthis development, we aim to address a challenge that has been weakly addressed\nin prior GFVC frameworks, i.e., reconstruction drift as the distance between\nthe reference and target frames increases. This challenge creates the need to\nupdate the reference buffer more frequently by transmitting more Intra-refresh\nframes, which are the most expensive element of the GFVC bitstream. To overcome\nthis problem, we propose instead multiple reference animation as a robust\napproach to minimizing reconstruction drift, especially when used in a\nbi-directional prediction mode. Further, we propose a contrastive learning\nformulation for multi-reference animation. We observe that using a contrastive\nlearning framework enhances the representation capabilities of the animation\ngenerator. The resulting framework, MRDAC (Multi-Reference Deep Animation\nCodec) can therefore be used to compress longer sequences with fewer reference\nframes or achieve a significant gain in reconstruction accuracy at comparable\nbitrates to previous frameworks. Quantitative and qualitative results show\nsignificant coding and reconstruction quality gains compared to previous GFVC\nmethods, and more accurate animation quality in presence of large pose and\nfacial expression changes.\n","authors":["Goluck Konuko","Giuseppe Valenzise"],"pdf_url":"https://arxiv.org/pdf/2409.01029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00971v1","updated":"2024-09-02T06:26:48Z","published":"2024-09-02T06:26:48Z","title":"Interpretable Convolutional SyncNet","summary":" Because videos in the wild can be out of sync for various reasons, a sync-net\nis used to bring the video back into sync for tasks that require synchronized\nvideos. Previous state-of-the-art (SOTA) sync-nets use InfoNCE loss, rely on\nthe transformer architecture, or both. 
Unfortunately, the former makes the\nmodel's output difficult to interpret, and the latter is unfriendly with large\nimages, thus limiting the usefulness of sync-nets. In this work, we train a\nconvolutional sync-net using the balanced BCE loss (BBCE), a loss inspired by\nthe binary cross entropy (BCE) and the InfoNCE losses. In contrast to the\nInfoNCE loss, the BBCE loss does not require complicated sampling schemes. Our\nmodel can better handle larger images, and its output can be given a\nprobabilistic interpretation. The probabilistic interpretation allows us to\ndefine metrics such as probability at offset and offscreen ratio to evaluate\nthe sync quality of audio-visual (AV) speech datasets. Furthermore, our model\nachieves SOTA accuracy of $96.5\\%$ on the LRS2 dataset and $93.8\\%$ on the LRS3\ndataset.\n","authors":["Sungjoon Park","Jaesub Yun","Donggeon Lee","Minsik Park"],"pdf_url":"https://arxiv.org/pdf/2409.00971v1.pdf","comment":"8+5 pages"}]},"2024-09-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2404.01663v4","updated":"2024-09-01T22:02:32Z","published":"2024-04-02T06:07:35Z","title":"CMAT: A Multi-Agent Collaboration Tuning Framework for Enhancing Small\n Language Models","summary":" Open large language models (LLMs) have significantly advanced the field of\nnatural language processing, showcasing impressive performance across various\ntasks.Despite the significant advancements in LLMs, their effective operation\nstill relies heavily on human input to accurately guide the dialogue flow, with\nagent tuning being a crucial optimization technique that involves human\nadjustments to the model for better response to such guidance.Addressing this\ndependency, our work introduces the TinyAgent model, trained on a meticulously\ncurated high-quality dataset. We also present the Collaborative Multi-Agent\nTuning (CMAT) framework, an innovative system designed to augment language\nagent capabilities through adaptive weight updates based on environmental\nfeedback. This framework fosters collaborative learning and real-time\nadaptation among multiple intelligent agents, enhancing their context-awareness\nand long-term memory. In this research, we propose a new communication agent\nframework that integrates multi-agent systems with environmental feedback\nmechanisms, offering a scalable method to explore cooperative behaviors.\nNotably, our TinyAgent-7B model exhibits performance on par with GPT-3.5,\ndespite having fewer parameters, signifying a substantial improvement in the\nefficiency and effectiveness of LLMs.\n","authors":["Xuechen Liang","Meiling Tao","Yinghui Xia","Tianyu Shi","Jun Wang","JingSong Yang"],"pdf_url":"https://arxiv.org/pdf/2404.01663v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.14464v2","updated":"2024-09-01T20:11:27Z","published":"2020-10-27T17:23:18Z","title":"Dynamic Boundary Time Warping for Sub-sequence Matching with Few\n Examples","summary":" The paper presents a novel method of finding a fragment in a long temporal\nsequence similar to the set of shorter sequences. We are the first to propose\nan algorithm for such a search that does not rely on computing the average\nsequence from query examples. Instead, we use query examples as is, utilizing\nall of them simultaneously. The introduced method based on the Dynamic Time\nWarping (DTW) technique is suited explicitly for few-shot query-by-example\nretrieval tasks. We evaluate it on two different few-shot problems from the\nfield of Natural Language Processing. 
The results show it either outperforms\nbaselines and previous approaches or achieves comparable results when a low\nnumber of examples is available.\n","authors":["Łukasz Borchmann","Dawid Jurkiewicz","Filip Graliński","Tomasz Górecki"],"pdf_url":"https://arxiv.org/pdf/2010.14464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16767v3","updated":"2024-09-01T19:54:56Z","published":"2024-04-25T17:20:45Z","title":"REBEL: Reinforcement Learning via Regressing Relative Rewards","summary":" While originally developed for continuous control problems, Proximal Policy\nOptimization (PPO) has emerged as the work-horse of a variety of reinforcement\nlearning (RL) applications, including the fine-tuning of generative models.\nUnfortunately, PPO requires multiple heuristics to enable stable convergence\n(e.g. value networks, clipping), and is notorious for its sensitivity to the\nprecise implementation of these components. In response, we take a step back\nand ask what a minimalist RL algorithm for the era of generative models would\nlook like. We propose REBEL, an algorithm that cleanly reduces the problem of\npolicy optimization to regressing the relative reward between two completions\nto a prompt in terms of the policy, enabling strikingly lightweight\nimplementation. In theory, we prove that fundamental RL algorithms like Natural\nPolicy Gradient can be seen as variants of REBEL, which allows us to match the\nstrongest known theoretical guarantees in terms of convergence and sample\ncomplexity in the RL literature. REBEL can also cleanly incorporate offline\ndata and be extended to handle the intransitive preferences we frequently see\nin practice. Empirically, we find that REBEL provides a unified approach to\nlanguage modeling and image generation with stronger or similar performance as\nPPO and DPO, all while being simpler to implement and more computationally\nefficient than PPO. When fine-tuning Llama-3-8B-Instruct, REBEL achieves strong\nperformance in AlpacaEval 2.0, MT-Bench, and Open LLM Leaderboard.\n","authors":["Zhaolin Gao","Jonathan D. Chang","Wenhao Zhan","Owen Oertell","Gokul Swamy","Kianté Brantley","Thorsten Joachims","J. Andrew Bagnell","Jason D. Lee","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.16767v3.pdf","comment":"New experimental results on general chat"},{"id":"http://arxiv.org/abs/2406.06573v2","updated":"2024-09-01T19:38:02Z","published":"2024-06-03T18:15:56Z","title":"MedFuzz: Exploring the Robustness of Large Language Models in Medical\n Question Answering","summary":" Large language models (LLM) have achieved impressive performance on medical\nquestion-answering benchmarks. However, high benchmark accuracy does not imply\nthat the performance generalizes to real-world clinical settings. Medical\nquestion-answering benchmarks rely on assumptions consistent with quantifying\nLLM performance but that may not hold in the open world of the clinic. Yet LLMs\nlearn broad knowledge that can help the LLM generalize to practical conditions\nregardless of unrealistic assumptions in celebrated benchmarks. We seek to\nquantify how well LLM medical question-answering benchmark performance\ngeneralizes when benchmark assumptions are violated. Specifically, we present\nan adversarial method that we call MedFuzz (for medical fuzzing). MedFuzz\nattempts to modify benchmark questions in ways aimed at confounding the LLM. We\ndemonstrate the approach by targeting strong assumptions about patient\ncharacteristics presented in the MedQA benchmark. 
Successful \"attacks\" modify a\nbenchmark item in ways that would be unlikely to fool a medical expert but\nnonetheless \"trick\" the LLM into changing from a correct to an incorrect\nanswer. Further, we present a permutation test technique that can ensure a\nsuccessful attack is statistically significant. We show how to use performance\non a \"MedFuzzed\" benchmark, as well as individual successful attacks. The\nmethods show promise at providing insights into the ability of an LLM to\noperate robustly in more realistic settings.\n","authors":["Robert Osazuwa Ness","Katie Matton","Hayden Helm","Sheng Zhang","Junaid Bajwa","Carey E. Priebe","Eric Horvitz"],"pdf_url":"https://arxiv.org/pdf/2406.06573v2.pdf","comment":"9 pages, 3 figures, 2 algorithms, appendix"},{"id":"http://arxiv.org/abs/2408.04668v2","updated":"2024-09-01T19:00:25Z","published":"2024-08-07T01:50:59Z","title":"Forecasting Live Chat Intent from Browsing History","summary":" Customers reach out to online live chat agents with various intents, such as\nasking about product details or requesting a return. In this paper, we propose\nthe problem of predicting user intent from browsing history and address it\nthrough a two-stage approach. The first stage classifies a user's browsing\nhistory into high-level intent categories. Here, we represent each browsing\nhistory as a text sequence of page attributes and use the ground-truth class\nlabels to fine-tune pretrained Transformers. The second stage provides a large\nlanguage model (LLM) with the browsing history and predicted intent class to\ngenerate fine-grained intents. For automatic evaluation, we use a separate LLM\nto judge the similarity between generated and ground-truth intents, which\nclosely aligns with human judgments. Our two-stage approach yields significant\nperformance gains compared to generating intents without the classification\nstage.\n","authors":["Se-eun Yoon","Ahmad Bin Rabiah","Zaid Alibadi","Surya Kallumadi","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2408.04668v2.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2312.15321v2","updated":"2024-09-01T18:15:27Z","published":"2023-12-23T18:43:56Z","title":"Greedy Grammar Induction with Indirect Negative Evidence","summary":" This paper offers a fresh look at the pumping lemma constant as an upper\nbound on the information required for learning Context Free Grammars. An\nobjective function based on indirect negative evidence considers the\noccurrences, and non-occurrences, of a finite number of strings, encountered\nafter a sufficiently long presentation. This function has optimal substructure\nin the hypotheses space, giving rise to a greedy search learner in a branch and\nbound method. A hierarchy of learnable classes is defined in terms of the\nnumber of production rules that must be added to interim solutions in order to\nincrementally fit the input. Efficiency strongly depends on the position of the\ntarget grammar in the hierarchy and on the richness of the input.\n","authors":["Joseph Potashnik"],"pdf_url":"https://arxiv.org/pdf/2312.15321v2.pdf","comment":"12 pages (including references), 1 png files. 5 anciliary files\n (dataset)"},{"id":"http://arxiv.org/abs/2406.15319v3","updated":"2024-09-01T17:21:18Z","published":"2024-06-21T17:23:21Z","title":"LongRAG: Enhancing Retrieval-Augmented Generation with Long-context LLMs","summary":" In traditional RAG framework, the basic retrieval units are normally short.\nThe common retrievers like DPR normally work with 100-word Wikipedia\nparagraphs. 
Such a design forces the retriever to search over a large corpus to\nfind the `needle' unit. In contrast, the readers only need to generate answers\nfrom the short retrieved units. The imbalanced `heavy' retriever and `light'\nreader design can lead to sub-optimal performance. The loss of contextual\ninformation in the short, chunked units may increase the likelihood of\nintroducing hard negatives during the retrieval stage. Additionally, the reader\nmight not fully leverage the capabilities of recent advancements in LLMs. In\norder to alleviate the imbalance, we propose a new framework LongRAG,\nconsisting of a `long retriever' and a `long reader'. In the two\nWikipedia-based datasets, NQ and HotpotQA, LongRAG processes the entire\nWikipedia corpus into 4K-token units by grouping related documents. By\nincreasing the unit size, we significantly reduce the total number of units.\nThis greatly reduces the burden on the retriever, resulting in strong retrieval\nperformance with only a few (less than 8) top units. Without requiring any\ntraining, LongRAG achieves an EM of 62.7% on NQ and 64.3% on HotpotQA, which\nare on par with the (fully-trained) SoTA model. Furthermore, we test on two\nnon-Wikipedia-based datasets, Qasper and MultiFieldQA-en. LongRAG processes\neach individual document as a single (long) unit rather than chunking them into\nsmaller units. By doing so, we achieve an F1 score of 25.9% on Qasper and 57.5%\non MultiFieldQA-en. Our study offers insights into the future roadmap for\ncombining RAG with long-context LLMs.\n","authors":["Ziyan Jiang","Xueguang Ma","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2406.15319v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2404.10237v3","updated":"2024-09-01T16:39:31Z","published":"2024-04-16T02:35:17Z","title":"Med-MoE: Mixture of Domain-Specific Experts for Lightweight Medical\n Vision-Language Models","summary":" Recent advancements in general-purpose or domain-specific multimodal large\nlanguage models (LLMs) have witnessed remarkable progress for medical\ndecision-making. However, they are designated for specific classification or\ngenerative tasks, and require model training or finetuning on large-scale\ndatasets with sizeable parameters and tremendous computing, hindering their\nclinical utility across diverse resource-constrained scenarios in practice. In\nthis paper, we propose a novel and lightweight framework Med-MoE\n(Mixture-of-Experts) that tackles both discriminative and generative multimodal\nmedical tasks. The learning of Med-MoE consists of three steps: multimodal\nmedical alignment, instruction tuning and routing, and domain-specific MoE\ntuning. After aligning multimodal medical images with LLM tokens, we then\nenable the model for different multimodal medical tasks with instruction\ntuning, together with a trainable router tailored for expert selection across\ninput modalities. Finally, the model is tuned by integrating the router with\nmultiple domain-specific experts, which are selectively activated and further\nempowered by meta expert. 
Comprehensive experiments on both open- and close-end\nmedical question answering (Med-VQA) and image classification tasks across\ndatasets such as VQA-RAD, SLAKE and Path-VQA demonstrate that our model can\nachieve performance superior to or on par with state-of-the-art baselines,\nwhile only requiring approximately 30\\%-50\\% of activated model parameters.\nExtensive analysis and ablations corroborate the effectiveness and practical\nutility of our method.\n","authors":["Songtao Jiang","Tuo Zheng","Yan Zhang","Yeying Jin","Li Yuan","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10237v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05965v2","updated":"2024-09-01T15:13:52Z","published":"2024-07-08T14:04:58Z","title":"T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models","summary":" The recent development of Sora leads to a new era in text-to-video (T2V)\ngeneration. Along with this comes the rising concern about its security risks.\nThe generated videos may contain illegal or unethical content, and there is a\nlack of comprehensive quantitative understanding of their safety, posing a\nchallenge to their reliability and practical deployment. Previous evaluations\nprimarily focus on the quality of video generation. While some evaluations of\ntext-to-image models have considered safety, they cover fewer aspects and do\nnot address the unique temporal risk inherent in video generation. To bridge\nthis research gap, we introduce T2VSafetyBench, a new benchmark designed for\nconducting safety-critical assessments of text-to-video models. We define 12\ncritical aspects of video generation safety and construct a malicious prompt\ndataset including real-world prompts, LLM-generated prompts and jailbreak\nattack-based prompts. Based on our evaluation results, we draw several\nimportant findings, including: 1) no single model excels in all aspects, with\ndifferent models showing various strengths; 2) the correlation between GPT-4\nassessments and manual reviews is generally high; 3) there is a trade-off\nbetween the usability and safety of text-to-video generative models. This\nindicates that as the field of video generation rapidly advances, safety risks\nare set to surge, highlighting the urgency of prioritizing video safety. We\nhope that T2VSafetyBench can provide insights for better understanding the\nsafety of video generation in the era of generative AI.\n","authors":["Yibo Miao","Yifan Zhu","Yinpeng Dong","Lijia Yu","Jun Zhu","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2407.05965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10459v2","updated":"2024-09-01T14:28:11Z","published":"2024-06-15T01:02:48Z","title":"CancerLLM: A Large Language Model in Cancer Domain","summary":" Medical Large Language Models (LLMs) such as ClinicalCamel 70B,\nLlama3-OpenBioLLM 70B have demonstrated impressive performance on a wide\nvariety of medical NLP task.However, there still lacks a large language model\n(LLM) specifically designed for cancer domain. Moreover, these LLMs typically\nhave billions of parameters, making them computationally expensive for\nhealthcare systems.Thus, in this study, we propose CancerLLM, a model with 7\nbillion parameters and a Mistral-style architecture, pre-trained on 2,676,642\nclinical notes and 515,524 pathology reports covering 17 cancer types, followed\nby fine-tuning on three cancer-relevant tasks, including cancer phenotypes\nextraction, and cancer diagnosis generation. 
Our evaluation demonstrated that\nCancerLLM achieves state-of-the-art results compared to other existing LLMs,\nwith an average F1 score improvement of 7.61 %. Additionally, CancerLLM\noutperforms other models on two proposed robustness testbeds. This illustrates\nthat CancerLLM can be effectively applied to clinical AI systems, enhancing\nclinical research and healthcare delivery in the field of cancer.\n","authors":["Mingchen Li","Jiatan Huang","Jeremy Yeung","Anne Blaes","Steven Johnson","Hongfang Liu","Hua Xu","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.10459v2.pdf","comment":"add the diagnosis evaluation of ICD code"},{"id":"http://arxiv.org/abs/2304.07687v4","updated":"2024-09-01T13:11:58Z","published":"2023-04-16T03:49:50Z","title":"MLRegTest: A Benchmark for the Machine Learning of Regular Languages","summary":" Synthetic datasets constructed from formal languages allow fine-grained\nexamination of the learning and generalization capabilities of machine learning\nsystems for sequence classification. This article presents a new benchmark for\nmachine learning systems on sequence classification called MLRegTest, which\ncontains training, development, and test sets from 1,800 regular languages.\nDifferent kinds of formal languages represent different kinds of long-distance\ndependencies, and correctly identifying long-distance dependencies in sequences\nis a known challenge for ML systems to generalize successfully. MLRegTest\norganizes its languages according to their logical complexity (monadic second\norder, first order, propositional, or monomial expressions) and the kind of\nlogical literals (string, tier-string, subsequence, or combinations thereof).\nThe logical complexity and choice of literal provides a systematic way to\nunderstand different kinds of long-distance dependencies in regular languages,\nand therefore to understand the capacities of different ML systems to learn\nsuch long-distance dependencies. Finally, the performance of different neural\nnetworks (simple RNN, LSTM, GRU, transformer) on MLRegTest is examined. The\nmain conclusion is that performance depends significantly on the kind of test\nset, the class of language, and the neural network architecture.\n","authors":["Sam van der Poel","Dakotah Lambert","Kalina Kostyszyn","Tiantian Gao","Rahul Verma","Derek Andersen","Joanne Chau","Emily Peterson","Cody St. Clair","Paul Fodor","Chihiro Shibata","Jeffrey Heinz"],"pdf_url":"https://arxiv.org/pdf/2304.07687v4.pdf","comment":"Accepted for publication in the Journal of Machine Learning Research.\n Dataset available at https://doi.org/10.5061/dryad.dncjsxm4h , code available\n at https://github.com/heinz-jeffrey/subregular-learning"},{"id":"http://arxiv.org/abs/2407.11484v6","updated":"2024-09-01T10:12:45Z","published":"2024-07-16T08:20:39Z","title":"The Oscars of AI Theater: A Survey on Role-Playing with Language Models","summary":" This survey explores the burgeoning field of role-playing with language\nmodels, focusing on their development from early persona-based models to\nadvanced character-driven simulations facilitated by Large Language Models\n(LLMs). Initially confined to simple persona consistency due to limited model\ncapabilities, role-playing tasks have now expanded to embrace complex character\nportrayals involving character consistency, behavioral alignment, and overall\nattractiveness. 
We provide a comprehensive taxonomy of the critical components\nin designing these systems, including data, models and alignment, agent\narchitecture and evaluation. This survey not only outlines the current\nmethodologies and challenges, such as managing dynamic personal profiles and\nachieving high-level persona consistency but also suggests avenues for future\nresearch in improving the depth and realism of role-playing applications. The\ngoal is to guide future research by offering a structured overview of current\nmethodologies and identifying potential areas for improvement. Related\nresources and papers are available at\nhttps://github.com/nuochenpku/Awesome-Role-Play-Papers.\n","authors":["Nuo Chen","Yan Wang","Yang Deng","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2407.11484v6.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2408.08506v2","updated":"2024-09-01T08:30:58Z","published":"2024-08-16T03:06:57Z","title":"Ex3: Automatic Novel Writing by Extracting, Excelsior and Expanding","summary":" Generating long-term texts such as novels using artificial intelligence has\nalways been a challenge. A common approach is to use large language models\n(LLMs) to construct a hierarchical framework that first plans and then writes.\nDespite the fact that the generated novels reach a sufficient length, they\nexhibit poor logical coherence and appeal in their plots and deficiencies in\ncharacter and event depiction, ultimately compromising the overall narrative\nquality. In this paper, we propose a method named Extracting Excelsior and\nExpanding. Ex3 initially extracts structure information from raw novel data. By\ncombining this structure information with the novel data, an\ninstruction-following dataset is meticulously crafted. This dataset is then\nutilized to fine-tune the LLM, aiming for excelsior generation performance. In\nthe final stage, a tree-like expansion method is deployed to facilitate the\ngeneration of arbitrarily long novels. Evaluation against previous methods\nshowcases Ex3's ability to produce higher-quality long-form novels.\n","authors":["Lei Huang","Jiaming Guo","Guanhua He","Xishan Zhang","Rui Zhang","Shaohui Peng","Shaoli Liu","Tianshi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11905v3","updated":"2024-09-01T07:13:30Z","published":"2024-03-18T16:06:30Z","title":"Tur[k]ingBench: A Challenge Benchmark for Web Agents","summary":" Can advanced multi-modal models effectively tackle complex web-based tasks?\nSuch tasks are often found on crowdsourcing platforms, where crowdworkers\nengage in challenging micro-tasks within web-based environments.\n Building on this idea, we present TurkingBench, a benchmark consisting of\ntasks presented as web pages with textual instructions and multi-modal\ncontexts. Unlike previous approaches that rely on artificially synthesized web\npages, our benchmark uses natural HTML pages originally designed for\ncrowdsourcing workers to perform various annotation tasks. Each task's HTML\ninstructions are instantiated with different values derived from crowdsourcing\ntasks, creating diverse instances. This benchmark includes 32.2K instances\nspread across 158 tasks.\n To support the evaluation of TurkingBench, we have developed a framework that\nlinks chatbot responses to actions on web pages (e.g., modifying a text box,\nselecting a radio button). 
We assess the performance of cutting-edge private\nand open-source models, including language-only and vision-language models\n(such as GPT4 and InternVL), on this benchmark. Our results show that while\nthese models outperform random chance, there is still significant room for\nimprovement. We hope that this benchmark will drive progress in the evaluation\nand development of web-based agents.\n","authors":["Kevin Xu","Yeganeh Kordi","Tanay Nayak","Ado Asija","Yizhong Wang","Kate Sanders","Adam Byerly","Jingyu Zhang","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2403.11905v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15496v3","updated":"2024-09-01T06:03:46Z","published":"2024-08-28T02:47:27Z","title":"ReMamba: Equip Mamba with Effective Long-Sequence Modeling","summary":" While the Mamba architecture demonstrates superior inference efficiency and\ncompetitive performance on short-context natural language processing (NLP)\ntasks, empirical evidence suggests its capacity to comprehend long contexts is\nlimited compared to transformer-based models. In this study, we investigate the\nlong-context efficiency issues of the Mamba models and propose ReMamba, which\nenhances Mamba's ability to comprehend long contexts. ReMamba incorporates\nselective compression and adaptation techniques within a two-stage re-forward\nprocess, incurring minimal additional inference costs overhead. Experimental\nresults on the LongBench and L-Eval benchmarks demonstrate ReMamba's efficacy,\nimproving over the baselines by 3.2 and 1.6 points, respectively, and attaining\nperformance almost on par with same-size transformer models.\n","authors":["Danlong Yuan","Jiahao Liu","Bei Li","Huishuai Zhang","Jingang Wang","Xunliang Cai","Dongyan Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.15496v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02168v4","updated":"2024-09-01T05:21:46Z","published":"2023-10-03T16:02:36Z","title":"Editing Personality for Large Language Models","summary":" This paper introduces an innovative task focused on editing the personality\ntraits of Large Language Models (LLMs). This task seeks to adjust the models'\nresponses to opinion-related questions on specified topics since an\nindividual's personality often manifests in the form of their expressed\nopinions, thereby showcasing different personality traits. Specifically, we\nconstruct PersonalityEdit, a new benchmark dataset to address this task.\nDrawing on the theory in Social Psychology, we isolate three representative\ntraits, namely Neuroticism, Extraversion, and Agreeableness, as the foundation\nfor our benchmark. We then gather data using GPT-4, generating responses that\nalign with a specified topic and embody the targeted personality trait. We\nconduct comprehensive experiments involving various baselines and discuss the\nrepresentation of personality behavior in LLMs. Our findings uncover potential\nchallenges of the proposed task, illustrating several remaining issues. We\nanticipate that our work can stimulate further annotation in model editing and\npersonality-related research. 
Code is available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Shengyu Mao","Xiaohan Wang","Mengru Wang","Yong Jiang","Pengjun Xie","Fei Huang","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.02168v4.pdf","comment":"NLPCC 2024"},{"id":"http://arxiv.org/abs/2402.12189v2","updated":"2024-09-01T03:02:36Z","published":"2024-02-19T14:52:50Z","title":"Amplifying Training Data Exposure through Fine-Tuning with\n Pseudo-Labeled Memberships","summary":" Neural language models (LMs) are vulnerable to training data extraction\nattacks due to data memorization. This paper introduces a novel attack scenario\nwherein an attacker adversarially fine-tunes pre-trained LMs to amplify the\nexposure of the original training data. This strategy differs from prior\nstudies by aiming to intensify the LM's retention of its pre-training dataset.\nTo achieve this, the attacker needs to collect generated texts that are closely\naligned with the pre-training data. However, without knowledge of the actual\ndataset, quantifying the amount of pre-training data within generated texts is\nchallenging. To address this, we propose the use of pseudo-labels for these\ngenerated texts, leveraging membership approximations indicated by\nmachine-generated probabilities from the target LM. We subsequently fine-tune\nthe LM to favor generations with higher likelihoods of originating from the\npre-training data, based on their membership probabilities. Our empirical\nfindings indicate a remarkable outcome: LMs with over 1B parameters exhibit a\nfour to eight-fold increase in training data exposure. We discuss potential\nmitigations and suggest future research directions.\n","authors":["Myung Gyo Oh","Hong Eun Ahn","Leo Hyun Park","Taekyoung Kwon"],"pdf_url":"https://arxiv.org/pdf/2402.12189v2.pdf","comment":"20 pages, 6 figures, 15 tables"},{"id":"http://arxiv.org/abs/2406.04220v3","updated":"2024-09-01T01:17:21Z","published":"2024-06-06T16:18:30Z","title":"BEADs: Bias Evaluation Across Domains","summary":" Recent advancements in large language models (LLMs) have greatly enhanced\nnatural language processing (NLP) applications. Nevertheless, these models\noften inherit biases from their training data. Despite the availability of\nvarious datasets, most are limited to one or two NLP tasks (typically\nclassification or evaluation) and lack comprehensive evaluations across a\nbroader range of NLP tasks. To address this gap, we introduce the Bias\nEvaluations Across Domains (BEADs) dataset, designed to support a wide array of\nNLP tasks, including text classification, token classification, bias\nquantification, and benign language generation. A key focus of this paper is\nthe gold label subset of BEADs, an important portion of the data verified by\nexperts to ensure high reliability. BEADs provides data for both fine-tuning,\nincluding classification and language generation tasks, and for evaluating\nLLMs. Our findings indicate that BEADs effectively identifies numerous biases\nwhen fine-tuned on this dataset. It also reduces biases when used for\nfine-tuning language generation task, while preserving language quality. The\nresults also reveal some prevalent demographic biases in LLMs when BEADs is\nused for evaluation in demographic task. The benchmarking results highlight the\nefficacy of fine-tuning LLMs for bias identification and the necessity of\ncomprehensive bias evaluation. We make BEADs publicly available to promote more\nresponsible AI development. 
The dataset can be accessed at\nhttps://huggingface.co/datasets/shainar/BEAD .\n","authors":["Shaina Raza","Mizanur Rahman","Michael R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.04220v3.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.06292v3","updated":"2024-09-01T00:41:18Z","published":"2024-08-12T16:58:11Z","title":"The AI Scientist: Towards Fully Automated Open-Ended Scientific\n Discovery","summary":" One of the grand challenges of artificial general intelligence is developing\nagents capable of conducting scientific research and discovering new knowledge.\nWhile frontier models have already been used as aides to human scientists, e.g.\nfor brainstorming ideas, writing code, or prediction tasks, they still conduct\nonly a small part of the scientific process. This paper presents the first\ncomprehensive framework for fully automatic scientific discovery, enabling\nfrontier large language models to perform research independently and\ncommunicate their findings. We introduce The AI Scientist, which generates\nnovel research ideas, writes code, executes experiments, visualizes results,\ndescribes its findings by writing a full scientific paper, and then runs a\nsimulated review process for evaluation. In principle, this process can be\nrepeated to iteratively develop ideas in an open-ended fashion, acting like the\nhuman scientific community. We demonstrate its versatility by applying it to\nthree distinct subfields of machine learning: diffusion modeling,\ntransformer-based language modeling, and learning dynamics. Each idea is\nimplemented and developed into a full paper at a cost of less than $15 per\npaper. To evaluate the generated papers, we design and validate an automated\nreviewer, which we show achieves near-human performance in evaluating paper\nscores. The AI Scientist can produce papers that exceed the acceptance\nthreshold at a top machine learning conference as judged by our automated\nreviewer. This approach signifies the beginning of a new era in scientific\ndiscovery in machine learning: bringing the transformative benefits of AI\nagents to the entire research process of AI itself, and taking us closer to a\nworld where endless affordable creativity and innovation can be unleashed on\nthe world's most challenging problems. Our code is open-sourced at\nhttps://github.com/SakanaAI/AI-Scientist\n","authors":["Chris Lu","Cong Lu","Robert Tjarko Lange","Jakob Foerster","Jeff Clune","David Ha"],"pdf_url":"https://arxiv.org/pdf/2408.06292v3.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.13766v2","updated":"2024-09-01T22:05:52Z","published":"2023-10-20T18:57:38Z","title":"U-BEV: Height-aware Bird's-Eye-View Segmentation and Neural Map-based\n Relocalization","summary":" Efficient relocalization is essential for intelligent vehicles when GPS\nreception is insufficient or sensor-based localization fails. Recent advances\nin Bird's-Eye-View (BEV) segmentation allow for accurate estimation of local\nscene appearance and in turn, can benefit the relocalization of the vehicle.\nHowever, one downside of BEV methods is the heavy computation required to\nleverage the geometric constraints. This paper presents U-BEV, a U-Net inspired\narchitecture that extends the current state-of-the-art by allowing the BEV to\nreason about the scene on multiple height layers before flattening the BEV\nfeatures. We show that this extension boosts the performance of the U-BEV by up\nto 4.11 IoU. 
Additionally, we combine the encoded neural BEV with a\ndifferentiable template matcher to perform relocalization on neural SD-map\ndata. The model is fully end-to-end trainable and outperforms transformer-based\nBEV methods of similar computational complexity by 1.7 to 2.8 mIoU and\nBEV-based relocalization by over 26% Recall Accuracy on the nuScenes dataset.\n","authors":["Andrea Boscolo Camiletto","Alfredo Bochicchio","Alexander Liniger","Dengxin Dai","Abel Gawel"],"pdf_url":"https://arxiv.org/pdf/2310.13766v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.16767v3","updated":"2024-09-01T19:54:56Z","published":"2024-04-25T17:20:45Z","title":"REBEL: Reinforcement Learning via Regressing Relative Rewards","summary":" While originally developed for continuous control problems, Proximal Policy\nOptimization (PPO) has emerged as the work-horse of a variety of reinforcement\nlearning (RL) applications, including the fine-tuning of generative models.\nUnfortunately, PPO requires multiple heuristics to enable stable convergence\n(e.g. value networks, clipping), and is notorious for its sensitivity to the\nprecise implementation of these components. In response, we take a step back\nand ask what a minimalist RL algorithm for the era of generative models would\nlook like. We propose REBEL, an algorithm that cleanly reduces the problem of\npolicy optimization to regressing the relative reward between two completions\nto a prompt in terms of the policy, enabling strikingly lightweight\nimplementation. In theory, we prove that fundamental RL algorithms like Natural\nPolicy Gradient can be seen as variants of REBEL, which allows us to match the\nstrongest known theoretical guarantees in terms of convergence and sample\ncomplexity in the RL literature. REBEL can also cleanly incorporate offline\ndata and be extended to handle the intransitive preferences we frequently see\nin practice. Empirically, we find that REBEL provides a unified approach to\nlanguage modeling and image generation with stronger or similar performance as\nPPO and DPO, all while being simpler to implement and more computationally\nefficient than PPO. When fine-tuning Llama-3-8B-Instruct, REBEL achieves strong\nperformance in AlpacaEval 2.0, MT-Bench, and Open LLM Leaderboard.\n","authors":["Zhaolin Gao","Jonathan D. Chang","Wenhao Zhan","Owen Oertell","Gokul Swamy","Kianté Brantley","Thorsten Joachims","J. Andrew Bagnell","Jason D. Lee","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.16767v3.pdf","comment":"New experimental results on general chat"},{"id":"http://arxiv.org/abs/2406.19941v3","updated":"2024-09-01T18:18:19Z","published":"2024-06-28T14:17:16Z","title":"GRACE: Graph-Regularized Attentive Convolutional Entanglement with\n Laplacian Smoothing for Robust DeepFake Video Detection","summary":" As DeepFake video manipulation techniques escalate, posing profound threats,\nthe urgent need to develop efficient detection strategies is underscored.\nHowever, one particular issue lies with facial images being mis-detected, often\noriginating from degraded videos or adversarial attacks, leading to unexpected\ntemporal artifacts that can undermine the efficacy of DeepFake video detection\ntechniques. 
This paper introduces a novel method for robust DeepFake video\ndetection, harnessing the power of the proposed Graph-Regularized Attentive\nConvolutional Entanglement (GRACE) based on the graph convolutional network\nwith graph Laplacian to address the aforementioned challenges. First,\nconventional Convolution Neural Networks are deployed to perform spatiotemporal\nfeatures for the entire video. Then, the spatial and temporal features are\nmutually entangled by constructing a graph with sparse constraint, enforcing\nessential features of valid face images in the noisy face sequences remaining,\nthus augmenting stability and performance for DeepFake video detection.\nFurthermore, the Graph Laplacian prior is proposed in the graph convolutional\nnetwork to remove the noise pattern in the feature space to further improve the\nperformance. Comprehensive experiments are conducted to illustrate that our\nproposed method delivers state-of-the-art performance in DeepFake video\ndetection under noisy face sequences. The source code is available at\nhttps://github.com/ming053l/GRACE.\n","authors":["Chih-Chung Hsu","Shao-Ning Chen","Mei-Hsuan Wu","Yi-Fang Wang","Chia-Ming Lee","Yi-Shiuan Chou"],"pdf_url":"https://arxiv.org/pdf/2406.19941v3.pdf","comment":"Submitted to TPAMI 2024"},{"id":"http://arxiv.org/abs/2310.13906v2","updated":"2024-09-01T17:28:22Z","published":"2023-10-21T04:24:30Z","title":"Exploring Driving Behavior for Autonomous Vehicles Based on Gramian\n Angular Field Vision Transformer","summary":" Effective classification of autonomous vehicle (AV) driving behavior emerges\nas a critical area for diagnosing AV operation faults, enhancing autonomous\ndriving algorithms, and reducing accident rates. This paper presents the\nGramian Angular Field Vision Transformer (GAF-ViT) model, designed to analyze\nAV driving behavior. The proposed GAF-ViT model consists of three key\ncomponents: GAF Transformer Module, Channel Attention Module, and Multi-Channel\nViT Module. These modules collectively convert representative sequences of\nmultivariate behavior into multi-channel images and employ image recognition\ntechniques for behavior classification. A channel attention mechanism is\napplied to multi-channel images to discern the impact of various driving\nbehavior features. Experimental evaluation on the Waymo Open Dataset of\ntrajectories demonstrates that the proposed model achieves state-of-the-art\nperformance. Furthermore, an ablation study effectively substantiates the\nefficacy of individual modules within the model.\n","authors":["Junwei You","Ying Chen","Zhuoyu Jiang","Zhangchi Liu","Zilin Huang","Yifeng Ding","Bin Ran"],"pdf_url":"https://arxiv.org/pdf/2310.13906v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10237v3","updated":"2024-09-01T16:39:31Z","published":"2024-04-16T02:35:17Z","title":"Med-MoE: Mixture of Domain-Specific Experts for Lightweight Medical\n Vision-Language Models","summary":" Recent advancements in general-purpose or domain-specific multimodal large\nlanguage models (LLMs) have witnessed remarkable progress for medical\ndecision-making. However, they are designated for specific classification or\ngenerative tasks, and require model training or finetuning on large-scale\ndatasets with sizeable parameters and tremendous computing, hindering their\nclinical utility across diverse resource-constrained scenarios in practice. 
In\nthis paper, we propose a novel and lightweight framework Med-MoE\n(Mixture-of-Experts) that tackles both discriminative and generative multimodal\nmedical tasks. The learning of Med-MoE consists of three steps: multimodal\nmedical alignment, instruction tuning and routing, and domain-specific MoE\ntuning. After aligning multimodal medical images with LLM tokens, we then\nenable the model for different multimodal medical tasks with instruction\ntuning, together with a trainable router tailored for expert selection across\ninput modalities. Finally, the model is tuned by integrating the router with\nmultiple domain-specific experts, which are selectively activated and further\nempowered by meta expert. Comprehensive experiments on both open- and close-end\nmedical question answering (Med-VQA) and image classification tasks across\ndatasets such as VQA-RAD, SLAKE and Path-VQA demonstrate that our model can\nachieve performance superior to or on par with state-of-the-art baselines,\nwhile only requiring approximately 30\\%-50\\% of activated model parameters.\nExtensive analysis and ablations corroborate the effectiveness and practical\nutility of our method.\n","authors":["Songtao Jiang","Tuo Zheng","Yan Zhang","Yeying Jin","Li Yuan","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2404.10237v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05965v2","updated":"2024-09-01T15:13:52Z","published":"2024-07-08T14:04:58Z","title":"T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models","summary":" The recent development of Sora leads to a new era in text-to-video (T2V)\ngeneration. Along with this comes the rising concern about its security risks.\nThe generated videos may contain illegal or unethical content, and there is a\nlack of comprehensive quantitative understanding of their safety, posing a\nchallenge to their reliability and practical deployment. Previous evaluations\nprimarily focus on the quality of video generation. While some evaluations of\ntext-to-image models have considered safety, they cover fewer aspects and do\nnot address the unique temporal risk inherent in video generation. To bridge\nthis research gap, we introduce T2VSafetyBench, a new benchmark designed for\nconducting safety-critical assessments of text-to-video models. We define 12\ncritical aspects of video generation safety and construct a malicious prompt\ndataset including real-world prompts, LLM-generated prompts and jailbreak\nattack-based prompts. Based on our evaluation results, we draw several\nimportant findings, including: 1) no single model excels in all aspects, with\ndifferent models showing various strengths; 2) the correlation between GPT-4\nassessments and manual reviews is generally high; 3) there is a trade-off\nbetween the usability and safety of text-to-video generative models. This\nindicates that as the field of video generation rapidly advances, safety risks\nare set to surge, highlighting the urgency of prioritizing video safety. 
We\nhope that T2VSafetyBench can provide insights for better understanding the\nsafety of video generation in the era of generative AI.\n","authors":["Yibo Miao","Yifan Zhu","Yinpeng Dong","Lijia Yu","Jun Zhu","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2407.05965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04958v2","updated":"2024-09-01T14:43:05Z","published":"2024-08-09T09:23:07Z","title":"Surgical-VQLA++: Adversarial Contrastive Learning for Calibrated Robust\n Visual Question-Localized Answering in Robotic Surgery","summary":" Medical visual question answering (VQA) bridges the gap between visual\ninformation and clinical decision-making, enabling doctors to extract\nunderstanding from clinical images and videos. In particular, surgical VQA can\nenhance the interpretation of surgical data, aiding in accurate diagnoses,\neffective education, and clinical interventions. However, the inability of VQA\nmodels to visually indicate the regions of interest corresponding to the given\nquestions results in incomplete comprehension of the surgical scene. To tackle\nthis, we propose the surgical visual question localized-answering (VQLA) for\nprecise and context-aware responses to specific queries regarding surgical\nimages. Furthermore, to address the strong demand for safety in surgical\nscenarios and potential corruptions in image acquisition and transmission, we\npropose a novel approach called Calibrated Co-Attention Gated Vision-Language\n(C$^2$G-ViL) embedding to integrate and align multimodal information\neffectively. Additionally, we leverage the adversarial sample-based contrastive\nlearning strategy to boost our performance and robustness. We also extend our\nEndoVis-18-VQLA and EndoVis-17-VQLA datasets to broaden the scope and\napplication of our data. Extensive experiments on the aforementioned datasets\ndemonstrate the remarkable performance and robustness of our solution. Our\nsolution can effectively combat real-world image corruption. Thus, our proposed\napproach can serve as an effective tool for assisting surgical education,\npatient care, and enhancing surgical outcomes.\n","authors":["Long Bai","Guankun Wang","Mobarakol Islam","Lalithkumar Seenivasan","An Wang","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2408.04958v2.pdf","comment":"Accepted by Information Fusion. Code and data availability:\n https://github.com/longbai1006/Surgical-VQLAPlus"},{"id":"http://arxiv.org/abs/2408.15628v2","updated":"2024-09-01T13:22:03Z","published":"2024-08-28T08:27:41Z","title":"CSAD: Unsupervised Component Segmentation for Logical Anomaly Detection","summary":" To improve logical anomaly detection, some previous works have integrated\nsegmentation techniques with conventional anomaly detection methods. Although\nthese methods are effective, they frequently lead to unsatisfactory\nsegmentation results and require manual annotations. To address these\ndrawbacks, we develop an unsupervised component segmentation technique that\nleverages foundation models to autonomously generate training labels for a\nlightweight segmentation network without human labeling. 
Integrating this new\nsegmentation technique with our proposed Patch Histogram module and the\nLocal-Global Student-Teacher (LGST) module, we achieve a detection AUROC of\n95.3% in the MVTec LOCO AD dataset, which surpasses previous SOTA methods.\nFurthermore, our proposed method provides lower latency and higher throughput\nthan most existing approaches.\n","authors":["Yu-Hsuan Hsieh","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.15628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03873v3","updated":"2024-09-01T09:56:44Z","published":"2024-06-06T09:04:48Z","title":"Quantum Implicit Neural Representations","summary":" Implicit neural representations have emerged as a powerful paradigm to\nrepresent signals such as images and sounds. This approach aims to utilize\nneural networks to parameterize the implicit function of the signal. However,\nwhen representing implicit functions, traditional neural networks such as\nReLU-based multilayer perceptrons face challenges in accurately modeling\nhigh-frequency components of signals. Recent research has begun to explore the\nuse of Fourier Neural Networks (FNNs) to overcome this limitation. In this\npaper, we propose Quantum Implicit Representation Network (QIREN), a novel\nquantum generalization of FNNs. Furthermore, through theoretical analysis, we\ndemonstrate that QIREN possesses a quantum advantage over classical FNNs.\nLastly, we conducted experiments in signal representation, image\nsuperresolution, and image generation tasks to show the superior performance of\nQIREN compared to state-of-the-art (SOTA) models. Our work not only\nincorporates quantum advantages into implicit neural representations but also\nuncovers a promising application direction for Quantum Neural Networks.\n","authors":["Jiaming Zhao","Wenbo Qiao","Peng Zhang","Hui Gao"],"pdf_url":"https://arxiv.org/pdf/2406.03873v3.pdf","comment":"This paper was accepted by icml 2024"},{"id":"http://arxiv.org/abs/2408.16690v2","updated":"2024-09-01T09:07:51Z","published":"2024-08-29T16:37:58Z","title":"Generic Objects as Pose Probes for Few-Shot View Synthesis","summary":" Radiance fields including NeRFs and 3D Gaussians demonstrate great potential\nin high-fidelity rendering and scene reconstruction, while they require a\nsubstantial number of posed images as inputs. COLMAP is frequently employed for\npreprocessing to estimate poses, while it necessitates a large number of\nfeature matches to operate effectively, and it struggles with scenes\ncharacterized by sparse features, large baselines between images, or a limited\nnumber of input images. We aim to tackle few-view NeRF reconstruction using\nonly 3 to 6 unposed scene images. Traditional methods often use calibration\nboards but they are not common in images. We propose a novel idea of utilizing\neveryday objects, commonly found in both images and real life, as \"pose\nprobes\". The probe object is automatically segmented by SAM, whose shape is\ninitialized from a cube. We apply a dual-branch volume rendering optimization\n(object NeRF and scene NeRF) to constrain the pose optimization and jointly\nrefine the geometry. Specifically, object poses of two views are first\nestimated by PnP matching in an SDF representation, which serves as initial\nposes. PnP matching, requiring only a few features, is suitable for\nfeature-sparse scenes. Additional views are incrementally incorporated to\nrefine poses from preceding views. 
In experiments, PoseProbe achieves\nstate-of-the-art performance in both pose estimation and novel view synthesis\nacross multiple datasets. We demonstrate its effectiveness, particularly in\nfew-view and large-baseline scenes where COLMAP struggles. In ablations, using\ndifferent objects in a scene yields comparable performance. Our project page is\navailable at: \\href{https://zhirui-gao.github.io/PoseProbe.github.io/}{this\nhttps URL}\n","authors":["Zhirui Gao","Renjiao Yi","Chenyang Zhu","Ke Zhuang","Wei Chen","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2408.16690v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14401v2","updated":"2024-09-01T07:45:02Z","published":"2024-03-21T13:49:42Z","title":"Pensieve: Retrospect-then-Compare Mitigates Visual Hallucination","summary":" Multi-modal Large Language Models (MLLMs) demonstrate remarkable success\nacross various vision-language tasks. However, they suffer from visual\nhallucination, where the generated responses diverge from the provided image.\nAre MLLMs oblivious to the accurate visual cues when they hallucinate? Our\ninvestigation reveals that the visual branch may equally advocate both accurate\nand erroneous content. To address this issue, we propose Pensieve, a\ntraining-free method that leverages the analogous visual hallucinations, which\nare induced by images sharing common semantic and appearance characteristics,\nto mitigate hallucination. Specifically, Pensieve enables MLLMs to retrospect\nrelevant images as references and compare their visual content with the test\nimage via confidence score subtraction. Moreover, our paradigm balances the\neffects of addressing errors from both the visual and textual branches by\nadaptively scaling the subtracted scores. Experiments on Whoops, LLaVA Bench,\nPOPE, and MME demonstrate the efficacy of Pensieve in mitigating visual\nhallucination, surpassing other advanced decoding strategies. Pensieve also\naids MLLMs in identifying visual details and enhance the specificity of\ngenerated image descriptions.\n","authors":["Dingchen Yang","Bowen Cao","Guang Chen","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.14401v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11905v3","updated":"2024-09-01T07:13:30Z","published":"2024-03-18T16:06:30Z","title":"Tur[k]ingBench: A Challenge Benchmark for Web Agents","summary":" Can advanced multi-modal models effectively tackle complex web-based tasks?\nSuch tasks are often found on crowdsourcing platforms, where crowdworkers\nengage in challenging micro-tasks within web-based environments.\n Building on this idea, we present TurkingBench, a benchmark consisting of\ntasks presented as web pages with textual instructions and multi-modal\ncontexts. Unlike previous approaches that rely on artificially synthesized web\npages, our benchmark uses natural HTML pages originally designed for\ncrowdsourcing workers to perform various annotation tasks. Each task's HTML\ninstructions are instantiated with different values derived from crowdsourcing\ntasks, creating diverse instances. This benchmark includes 32.2K instances\nspread across 158 tasks.\n To support the evaluation of TurkingBench, we have developed a framework that\nlinks chatbot responses to actions on web pages (e.g., modifying a text box,\nselecting a radio button). We assess the performance of cutting-edge private\nand open-source models, including language-only and vision-language models\n(such as GPT4 and InternVL), on this benchmark. 
Our results show that while\nthese models outperform random chance, there is still significant room for\nimprovement. We hope that this benchmark will drive progress in the evaluation\nand development of web-based agents.\n","authors":["Kevin Xu","Yeganeh Kordi","Tanay Nayak","Ado Asija","Yizhong Wang","Kate Sanders","Adam Byerly","Jingyu Zhang","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2403.11905v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11992v2","updated":"2024-09-01T07:04:56Z","published":"2024-08-21T21:03:36Z","title":"MBSS-T1: Model-Based Self-Supervised Motion Correction for Robust\n Cardiac T1 Mapping","summary":" T1 mapping is a valuable quantitative MRI technique for diagnosing diffuse\nmyocardial diseases. Traditional methods, relying on breath-hold sequences and\necho triggering, face challenges with patient compliance and arrhythmias,\nlimiting their effectiveness. Image registration can enable motion-robust T1\nmapping, but inherent intensity differences between time points pose a\nchallenge. We introduce MBSS-T1, a self-supervised model for motion correction\nin cardiac T1 mapping, constrained by physical and anatomical principles. The\nphysical constraints ensure expected signal decay behavior, while the\nanatomical constraints maintain realistic deformations. The unique combination\nof these constraints ensures accurate T1 mapping along the longitudinal\nrelaxation axis. MBSS-T1 outperformed baseline deep-learning-based image\nregistration approaches in a 5-fold experiment on a public dataset of 210\npatients (STONE sequence) and an internal dataset of 19 patients (MOLLI\nsequence). MBSS-T1 excelled in model fitting quality ($R^2$: 0.975 vs. 0.941,\n0.946), anatomical alignment (Dice score: 0.89 vs. 0.84, 0.88), and expert\nvisual quality assessment for the presence of visible motion artifacts (4.33\nvs. 3.38, 3.66). MBSS-T1 has the potential to enable motion-robust T1 mapping\nfor a broader range of patients, overcoming challenges such as arrhythmias and\nsuboptimal compliance, and allowing for free-breathing T1 mapping without\nrequiring large training datasets. Our code will be publicly available upon\nacceptance.\n","authors":["Eyal Hanania","Ilya Volovik","Daphna Link-Sourani","Israel Cohen","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2408.11992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08340v2","updated":"2024-09-01T06:55:04Z","published":"2024-05-14T06:26:58Z","title":"Achieving Resolution-Agnostic DNN-based Image Watermarking: A Novel\n Perspective of Implicit Neural Representation","summary":" DNN-based watermarking methods are rapidly developing and delivering\nimpressive performances. Recent advances achieve resolution-agnostic image\nwatermarking by reducing the variant resolution watermarking problem to a fixed\nresolution watermarking problem. However, such a reduction process can\npotentially introduce artifacts and low robustness. To address this issue, we\npropose the first, to the best of our knowledge, Resolution-Agnostic Image\nWaterMarking (RAIMark) framework by watermarking the implicit neural\nrepresentation (INR) of image. Unlike previous methods, our method does not\nrely on the previous reduction process by directly watermarking the continuous\nsignal instead of image pixels, thus achieving resolution-agnostic\nwatermarking. Precisely, given an arbitrary-resolution image, we fit an INR for\nthe target image. 
As a continuous signal, such an INR can be sampled to obtain\nimages with variant resolutions. Then, we quickly fine-tune the fitted INR to\nget a watermarked INR conditioned on a binary secret message. A pre-trained\nwatermark decoder extracts the hidden message from any sampled images with\narbitrary resolutions. By directly watermarking INR, we achieve\nresolution-agnostic watermarking with increased robustness. Extensive\nexperiments show that our method outperforms previous methods with significant\nimprovements: averagely improved bit accuracy by 7%$\\sim$29%. Notably, we\nobserve that previous methods are vulnerable to at least one watermarking\nattack (e.g. JPEG, crop, resize), while ours are robust against all\nwatermarking attacks.\n","authors":["Yuchen Wang","Xingyu Zhu","Guanhui Ye","Shiyao Zhang","Xuetao Wei"],"pdf_url":"https://arxiv.org/pdf/2405.08340v2.pdf","comment":"Accepted by ACM MM'24"},{"id":"http://arxiv.org/abs/2403.11679v3","updated":"2024-09-01T06:44:46Z","published":"2024-03-18T11:31:03Z","title":"NEDS-SLAM: A Neural Explicit Dense Semantic SLAM Framework using 3D\n Gaussian Splatting","summary":" We propose NEDS-SLAM, a dense semantic SLAM system based on 3D Gaussian\nrepresentation, that enables robust 3D semantic mapping, accurate camera\ntracking, and high-quality rendering in real-time. In the system, we propose a\nSpatially Consistent Feature Fusion model to reduce the effect of erroneous\nestimates from pre-trained segmentation head on semantic reconstruction,\nachieving robust 3D semantic Gaussian mapping. Additionally, we employ a\nlightweight encoder-decoder to compress the high-dimensional semantic features\ninto a compact 3D Gaussian representation, mitigating the burden of excessive\nmemory consumption. Furthermore, we leverage the advantage of 3D Gaussian\nsplatting, which enables efficient and differentiable novel view rendering, and\npropose a Virtual Camera View Pruning method to eliminate outlier gaussians,\nthereby effectively enhancing the quality of scene representations. Our\nNEDS-SLAM method demonstrates competitive performance over existing dense\nsemantic SLAM methods in terms of mapping and tracking accuracy on Replica and\nScanNet datasets, while also showing excellent capabilities in 3D dense\nsemantic mapping.\n","authors":["Yiming Ji","Yang Liu","Guanghu Xie","Boyu Ma","Zongwu Xie"],"pdf_url":"https://arxiv.org/pdf/2403.11679v3.pdf","comment":"accepted by RA-L, IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2403.11614v4","updated":"2024-09-01T06:32:06Z","published":"2024-03-18T09:44:44Z","title":"CRS-Diff: Controllable Remote Sensing Image Generation with Diffusion\n Model","summary":" The emergence of generative models has revolutionized the field of remote\nsensing (RS) image generation. Despite generating high-quality images, existing\nmethods are limited in relying mainly on text control conditions, and thus do\nnot always generate images accurately and stably. In this paper, we propose\nCRS-Diff, a new RS generative framework specifically tailored for RS image\ngeneration, leveraging the inherent advantages of diffusion models while\nintegrating more advanced control mechanisms. Specifically, CRS-Diff can\nsimultaneously support text-condition, metadata-condition, and image-condition\ncontrol inputs, thus enabling more precise control to refine the generation\nprocess. 
To effectively integrate multiple condition control information, we\nintroduce a new conditional control mechanism to achieve multi-scale feature\nfusion, thus enhancing the guiding effect of control conditions. To our\nknowledge, CRS-Diff is the first multiple-condition controllable RS generative\nmodel. Experimental results in single-condition and multiple-condition cases\nhave demonstrated the superior ability of our CRS-Diff to generate RS images\nboth quantitatively and qualitatively compared with previous methods.\nAdditionally, our CRS-Diff can serve as a data engine that generates\nhigh-quality training data for downstream tasks, e.g., road extraction. The\ncode is available at https://github.com/Sonettoo/CRS-Diff.\n","authors":["Datao Tang","Xiangyong Cao","Xingsong Hou","Zhongyuan Jiang","Junmin Liu","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2403.11614v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12550v3","updated":"2024-09-01T06:14:37Z","published":"2022-08-26T10:05:39Z","title":"Training and Tuning Generative Neural Radiance Fields for\n Attribute-Conditional 3D-Aware Face Generation","summary":" Generative Neural Radiance Fields (GNeRF)-based 3D-aware GANs have showcased\nremarkable prowess in crafting high-fidelity images while upholding robust 3D\nconsistency, particularly face generation. However, specific existing models\nprioritize view consistency over disentanglement, leading to constrained\nsemantic or attribute control during the generation process. While many methods\nhave explored incorporating semantic masks or leveraging 3D Morphable Models\n(3DMM) priors to imbue models with semantic control, these methods often demand\ntraining from scratch, entailing significant computational overhead. In this\npaper, we propose a novel approach: a conditional GNeRF model that integrates\nspecific attribute labels as input, thus amplifying the controllability and\ndisentanglement capabilities of 3D-aware generative models. Our approach builds\nupon a pre-trained 3D-aware face model, and we introduce a Training as Init and\nOptimizing for Tuning (TRIOT) method to train a conditional normalized flow\nmodule to enable the facial attribute editing, then optimize the latent vector\nto improve attribute-editing precision further. Our extensive experiments\nsubstantiate the efficacy of our model, showcasing its ability to generate\nhigh-quality edits with enhanced view consistency while safeguarding non-target\nregions. The code for our model is publicly available at\nhttps://github.com/zhangqianhui/TT-GNeRF.\n","authors":["Jichao Zhang","Aliaksandr Siarohin","Yahui Liu","Hao Tang","Nicu Sebe","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2208.12550v3.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2405.09032v3","updated":"2024-09-01T06:03:39Z","published":"2024-05-15T02:03:44Z","title":"ICAL: Implicit Character-Aided Learning for Enhanced Handwritten\n Mathematical Expression Recognition","summary":" Significant progress has been made in the field of handwritten mathematical\nexpression recognition, while existing encoder-decoder methods are usually\ndifficult to model global information in $LaTeX$. Therefore, this paper\nintroduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine\nthe global expression information and enhance handwritten mathematical\nexpression recognition. 
Specifically, we propose the Implicit Character\nConstruction Module (ICCM) to predict implicit character sequences and use a\nFusion Module to merge the outputs of the ICCM and the decoder, thereby\nproducing corrected predictions. By modeling and utilizing implicit character\ninformation, ICAL achieves a more accurate and context-aware interpretation of\nhandwritten mathematical expressions. Experimental results demonstrate that\nICAL notably surpasses the state-of-the-art(SOTA) models, improving the\nexpression recognition rate (ExpRate) by 2.25\\%/1.81\\%/1.39\\% on the CROHME\n2014/2016/2019 datasets respectively, and achieves a remarkable 69.06\\% on the\nchallenging HME100k test set. We make our code available on the GitHub:\nhttps://github.com/qingzhenduyu/ICAL\n","authors":["Jianhua Zhu","Liangcai Gao","Wenqi Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.09032v3.pdf","comment":"ICDAR 2024 Oral Paper"},{"id":"http://arxiv.org/abs/2405.02008v2","updated":"2024-09-01T05:02:01Z","published":"2024-05-03T11:16:27Z","title":"DiffMap: Enhancing Map Segmentation with Map Prior Using Diffusion Model","summary":" Constructing high-definition (HD) maps is a crucial requirement for enabling\nautonomous driving. In recent years, several map segmentation algorithms have\nbeen developed to address this need, leveraging advancements in Bird's-Eye View\n(BEV) perception. However, existing models still encounter challenges in\nproducing realistic and consistent semantic map layouts. One prominent issue is\nthe limited utilization of structured priors inherent in map segmentation\nmasks. In light of this, we propose DiffMap, a novel approach specifically\ndesigned to model the structured priors of map segmentation masks using latent\ndiffusion model. By incorporating this technique, the performance of existing\nsemantic segmentation methods can be significantly enhanced and certain\nstructural errors present in the segmentation outputs can be effectively\nrectified. Notably, the proposed module can be seamlessly integrated into any\nmap segmentation model, thereby augmenting its capability to accurately\ndelineate semantic information. Furthermore, through extensive visualization\nanalysis, our model demonstrates superior proficiency in generating results\nthat more accurately reflect real-world map layouts, further validating its\nefficacy in improving the quality of the generated maps.\n","authors":["Peijin Jia","Tuopu Wen","Ziang Luo","Mengmeng Yang","Kun Jiang","Zhiquan Lei","Xuewei Tang","Ziyuan Liu","Le Cui","Bo Zhang","Long Huang","Diange Yang"],"pdf_url":"https://arxiv.org/pdf/2405.02008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00347v2","updated":"2024-09-01T03:52:51Z","published":"2024-08-01T07:35:54Z","title":"Advancing Medical Image Segmentation: Morphology-Driven Learning with\n Diffusion Transformer","summary":" Understanding the morphological structure of medical images and precisely\nsegmenting the region of interest or abnormality is an important task that can\nassist in diagnosis. However, the unique properties of medical imaging make\nclear segmentation difficult,and the high cost and time-consuming task of\nlabeling leads to a coarse-grained representation of ground truth. Facing with\nthese problems, we propose a novel Diffusion Transformer Segmentation (DTS)\nmodel for robust segmentation in the presence of noise. 
We propose an\nalternative to the dominant Denoising U-Net encoder through experiments\napplying a transformer architecture, which captures global dependency through\nself-attention. Additionally, we propose k-neighbor label smoothing, reverse\nboundary attention, and self-supervised learning with morphology-driven\nlearning to improve the ability to identify complex structures. Our model,\nwhich analyzes the morphological representation of images, shows better results\nthan the previous models in various medical imaging modalities, including CT,\nMRI, and lesion images.\n","authors":["Sungmin Kang","Jaeha Song","Jihie Kim"],"pdf_url":"https://arxiv.org/pdf/2408.00347v2.pdf","comment":"Accepted in BMVC 2024"},{"id":"http://arxiv.org/abs/2205.13542v3","updated":"2024-09-01T03:33:26Z","published":"2022-05-26T17:59:35Z","title":"BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View\n Representation","summary":" Multi-sensor fusion is essential for an accurate and reliable autonomous\ndriving system. Recent approaches are based on point-level fusion: augmenting\nthe LiDAR point cloud with camera features. However, the camera-to-LiDAR\nprojection throws away the semantic density of camera features, hindering the\neffectiveness of such methods, especially for semantic-oriented tasks (such as\n3D scene segmentation). In this paper, we break this deeply-rooted convention\nwith BEVFusion, an efficient and generic multi-task multi-sensor fusion\nframework. It unifies multi-modal features in the shared bird's-eye view (BEV)\nrepresentation space, which nicely preserves both geometric and semantic\ninformation. To achieve this, we diagnose and lift key efficiency bottlenecks\nin the view transformation with optimized BEV pooling, reducing latency by more\nthan 40x. BEVFusion is fundamentally task-agnostic and seamlessly supports\ndifferent 3D perception tasks with almost no architectural changes. It\nestablishes the new state of the art on nuScenes, achieving 1.3% higher mAP and\nNDS on 3D object detection and 13.6% higher mIoU on BEV map segmentation, with\n1.9x lower computation cost. Code to reproduce our results is available at\nhttps://github.com/mit-han-lab/bevfusion.\n","authors":["Zhijian Liu","Haotian Tang","Alexander Amini","Xinyu Yang","Huizi Mao","Daniela Rus","Song Han"],"pdf_url":"https://arxiv.org/pdf/2205.13542v3.pdf","comment":"ICRA 2023. The first two authors contributed equally to this work.\n Project page: https://bevfusion.mit.edu"},{"id":"http://arxiv.org/abs/2301.00954v4","updated":"2024-09-01T01:27:58Z","published":"2023-01-03T05:30:56Z","title":"PanopticPartFormer++: A Unified and Decoupled View for Panoptic Part\n Segmentation","summary":" Panoptic Part Segmentation (PPS) unifies panoptic and part segmentation into\none task. Previous works utilize separate approaches to handle things, stuff,\nand part predictions without shared computation and task association. We aim to\nunify these tasks at the architectural level, designing the first end-to-end\nunified framework, Panoptic-PartFormer. Moreover, we find the previous metric\nPartPQ biases to PQ. To handle both issues, we first design a meta-architecture\nthat decouples part features and things/stuff features, respectively. We model\nthings, stuff, and parts as object queries and directly learn to optimize all\nthree forms of prediction as a unified mask prediction and classification\nproblem. We term our model as Panoptic-PartFormer. 
Second, we propose a new\nmetric Part-Whole Quality (PWQ), better to measure this task from pixel-region\nand part-whole perspectives. It also decouples the errors for part segmentation\nand panoptic segmentation. Third, inspired by Mask2Former, based on our\nmeta-architecture, we propose Panoptic-PartFormer++ and design a new part-whole\ncross-attention scheme to boost part segmentation qualities further. We design\na new part-whole interaction method using masked cross attention. Finally,\nextensive ablation studies and analysis demonstrate the effectiveness of both\nPanoptic-PartFormer and Panoptic-PartFormer++. Compared with previous\nPanoptic-PartFormer, our Panoptic-PartFormer++ achieves 2% PartPQ and 3% PWQ\nimprovements on the Cityscapes PPS dataset and 5% PartPQ on the Pascal Context\nPPS dataset. On both datasets, Panoptic-PartFormer++ achieves new\nstate-of-the-art results. Our models can serve as a strong baseline and aid\nfuture research in PPS. The source code and trained models will be available\nat~\\url{https://github.com/lxtGH/Panoptic-PartFormer}.\n","authors":["Xiangtai Li","Shilin Xu","Yibo Yang","Haobo Yuan","Guangliang Cheng","Yunhai Tong","Zhouchen Lin","Ming-Hsuan Yang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2301.00954v4.pdf","comment":"T-PAMI-2024, Extension of PanopticPartFormer (ECCV 2022)"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2010.14464v2","updated":"2024-09-01T20:11:27Z","published":"2020-10-27T17:23:18Z","title":"Dynamic Boundary Time Warping for Sub-sequence Matching with Few\n Examples","summary":" The paper presents a novel method of finding a fragment in a long temporal\nsequence similar to the set of shorter sequences. We are the first to propose\nan algorithm for such a search that does not rely on computing the average\nsequence from query examples. Instead, we use query examples as is, utilizing\nall of them simultaneously. The introduced method based on the Dynamic Time\nWarping (DTW) technique is suited explicitly for few-shot query-by-example\nretrieval tasks. We evaluate it on two different few-shot problems from the\nfield of Natural Language Processing. The results show it either outperforms\nbaselines and previous approaches or achieves comparable results when a low\nnumber of examples is available.\n","authors":["Łukasz Borchmann","Dawid Jurkiewicz","Filip Graliński","Tomasz Górecki"],"pdf_url":"https://arxiv.org/pdf/2010.14464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04668v2","updated":"2024-09-01T19:00:25Z","published":"2024-08-07T01:50:59Z","title":"Forecasting Live Chat Intent from Browsing History","summary":" Customers reach out to online live chat agents with various intents, such as\nasking about product details or requesting a return. In this paper, we propose\nthe problem of predicting user intent from browsing history and address it\nthrough a two-stage approach. The first stage classifies a user's browsing\nhistory into high-level intent categories. Here, we represent each browsing\nhistory as a text sequence of page attributes and use the ground-truth class\nlabels to fine-tune pretrained Transformers. The second stage provides a large\nlanguage model (LLM) with the browsing history and predicted intent class to\ngenerate fine-grained intents. For automatic evaluation, we use a separate LLM\nto judge the similarity between generated and ground-truth intents, which\nclosely aligns with human judgments. 
Our two-stage approach yields significant\nperformance gains compared to generating intents without the classification\nstage.\n","authors":["Se-eun Yoon","Ahmad Bin Rabiah","Zaid Alibadi","Surya Kallumadi","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2408.04668v2.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2309.13611v2","updated":"2024-09-01T17:40:29Z","published":"2023-09-24T11:19:59Z","title":"Sparsity-regularized coded ptychography for robust and efficient\n lensless microscopy on a chip","summary":" Coded ptychography has emerged as a powerful technique for high-throughput,\nhigh-resolution lensless imaging. However, the trade-off between acquisition\nspeed and image quality remains a significant challenge. To address this, we\nintroduce a novel sparsity-regularized approach to coded ptychography that\ndramatically reduces the number of required measurements while maintaining high\nreconstruction quality. The reported approach, termed the ptychographic\nproximal total-variation (PPTV) solver, formulates the reconstruction task as a\ntotal variation regularized optimization problem. Unlike previous\nimplementations that rely on specialized hardware or illumination schemes, PPTV\nintegrates seamlessly into existing coded ptychography setups. Through\ncomprehensive numerical simulations, we demonstrate that PPTV-driven coded\nptychography can produce accurate reconstructions with as few as eight\nintensity measurements, a significant reduction compared to conventional\nmethods. Convergence analysis confirms the robustness and stability of the PPTV\nalgorithm. Experimental results from our optical prototype, featuring a\ndisorder-engineered surface for wavefront modulation, validate PPTV's ability\nto achieve high-throughput, high-resolution imaging with a substantially\nreduced measurement burden. By enabling high-quality reconstructions from fewer\nmeasurements, PPTV paves the way for more compact, efficient, and\ncost-effective lensless microscopy systems on a chip, with potential\napplications in digital pathology, endoscopy, point-of-care diagnostics, and\nhigh-content screening.\n","authors":["Ninghe Liu","Qianhao Zhao","Guoan Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.13611v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.04487v4","updated":"2024-09-01T15:38:50Z","published":"2023-06-07T14:57:21Z","title":"Vague Preference Policy Learning for Conversational Recommendation","summary":" Conversational recommendation systems (CRS) commonly assume users have clear\npreferences, leading to potential over-filtering of relevant alternatives.\nHowever, users often exhibit vague, non-binary preferences. We introduce the\nVague Preference Multi-round Conversational Recommendation (VPMCR) scenario,\nemploying a soft estimation mechanism to accommodate users' vague and dynamic\npreferences while mitigating over-filtering. In VPMCR, we propose Vague\nPreference Policy Learning (VPPL), consisting of Ambiguity-aware Soft\nEstimation (ASE) and Dynamism-aware Policy Learning (DPL). ASE captures\npreference vagueness by estimating scores for clicked and non-clicked options,\nusing a choice-based approach and time-aware preference decay. DPL leverages\nASE's preference distribution to guide the conversation and adapt to preference\nchanges for recommendations or attribute queries. Extensive experiments\ndemonstrate VPPL's effectiveness within VPMCR, outperforming existing methods\nand setting a new benchmark. 
Our work advances CRS by accommodating users'\ninherent ambiguity and relative decision-making processes, improving real-world\napplicability.\n","authors":["Gangyi Zhang","Chongming Gao","Wenqiang Lei","Xiaojie Guo","Shijun Li","Hongshen Chen","Zhuozhi Ding","Sulong Xu","Lingfei Wu"],"pdf_url":"https://arxiv.org/pdf/2306.04487v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00860v1","updated":"2024-09-01T22:33:29Z","published":"2024-09-01T22:33:29Z","title":"A Counterfactual Explanation Framework for Retrieval Models","summary":" Explainability has become a crucial concern in today's world, aiming to\nenhance transparency in machine learning and deep learning models. Information\nretrieval is no exception to this trend. In existing literature on\nexplainability of information retrieval, the emphasis has predominantly been on\nillustrating the concept of relevance concerning a retrieval model. The\nquestions addressed include why a document is relevant to a query, why one\ndocument exhibits higher relevance than another, or why a specific set of\ndocuments is deemed relevant for a query.\n However, limited attention has been given to understanding why a particular\ndocument is considered non-relevant to a query with respect to a retrieval\nmodel. In an effort to address this gap, our work focus on the question of what\nterms need to be added within a document to improve its ranking. This in turn\nanswers the question of which words played a role in not being favored by a\nretrieval model for a particular query. We use an optimization framework to\nsolve the above-mentioned research problem. % To the best of our knowledge, we\nmark the first attempt to tackle this specific counterfactual problem. Our\nexperiments show the effectiveness of our proposed approach in predicting\ncounterfactuals for both statistical (e.g. BM25) and deep-learning-based models\n(e.g. DRMM, DSSM, ColBERT).\n","authors":["Bhavik Chandna","Procheta Sen"],"pdf_url":"https://arxiv.org/pdf/2409.00860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00851v1","updated":"2024-09-01T22:01:21Z","published":"2024-09-01T22:01:21Z","title":"Dissecting Temporal Understanding in Text-to-Audio Retrieval","summary":" Recent advancements in machine learning have fueled research on multimodal\ntasks, such as for instance text-to-video and text-to-audio retrieval. These\ntasks require models to understand the semantic content of video and audio\ndata, including objects, and characters. The models also need to learn spatial\narrangements and temporal relationships. In this work, we analyse the temporal\nordering of sounds, which is an understudied problem in the context of\ntext-to-audio retrieval. In particular, we dissect the temporal understanding\ncapabilities of a state-of-the-art model for text-to-audio retrieval on the\nAudioCaps and Clotho datasets. Additionally, we introduce a synthetic\ntext-audio dataset that provides a controlled setting for evaluating temporal\ncapabilities of recent models. Lastly, we present a loss function that\nencourages text-audio models to focus on the temporal ordering of events. Code\nand data are available at\nhttps://www.robots.ox.ac.uk/~vgg/research/audio-retrieval/dtu/.\n","authors":["Andreea-Maria Oncescu","João F. Henriques","A. 
Sophia Koepke"],"pdf_url":"https://arxiv.org/pdf/2409.00851v1.pdf","comment":"9 pages, 5 figures, ACM Multimedia 2024,\n https://www.robots.ox.ac.uk/~vgg/research/audio-retrieval/dtu/"},{"id":"http://arxiv.org/abs/2409.00830v1","updated":"2024-09-01T20:18:36Z","published":"2024-09-01T20:18:36Z","title":"Building FKG.in: a Knowledge Graph for Indian Food","summary":" This paper presents an ontology design along with knowledge engineering, and\nmultilingual semantic reasoning techniques to build an automated system for\nassimilating culinary information for Indian food in the form of a knowledge\ngraph. The main focus is on designing intelligent methods to derive ontology\ndesigns and capture all-encompassing knowledge about food, recipes,\ningredients, cooking characteristics, and most importantly, nutrition, at\nscale. We present our ongoing work in this workshop paper, describe in some\ndetail the relevant challenges in curating knowledge of Indian food, and\npropose our high-level ontology design. We also present a novel workflow that\nuses AI, LLM, and language technology to curate information from recipe blog\nsites in the public domain to build knowledge graphs for Indian food. The\nmethods for knowledge curation proposed in this paper are generic and can be\nreplicated for any domain. The design is application-agnostic and can be used\nfor AI-driven smart analysis, building recommendation systems for Personalized\nDigital Health, and complementing the knowledge graph for Indian food with\ncontextual information such as user information, food biochemistry, geographic\ninformation, agricultural information, etc.\n","authors":["Saransh Kumar Gupta","Lipika Dey","Partha Pratim Das","Ramesh Jain"],"pdf_url":"https://arxiv.org/pdf/2409.00830v1.pdf","comment":"14 pages, 3 figures, 25 references, Formal Ontology in Information\n Systems Conference 2024 - Integrated Food Ontology Workshop"},{"id":"http://arxiv.org/abs/2409.00727v1","updated":"2024-09-01T14:20:01Z","published":"2024-09-01T14:20:01Z","title":"Hound: Hunting Supervision Signals for Few and Zero Shot Node\n Classification on Text-attributed Graph","summary":" Text-attributed graph (TAG) is an important type of graph structured data\nwith text descriptions for each node. Few- and zero-shot node classification on\nTAGs have many applications in fields such as academia and social networks.\nHowever, the two tasks are challenging due to the lack of supervision signals,\nand existing methods only use the contrastive loss to align graph-based node\nembedding and language-based text embedding. In this paper, we propose Hound to\nimprove accuracy by introducing more supervision signals, and the core idea is\nto go beyond the node-text pairs that come with data. Specifically, we design\nthree augmentation techniques, i.e., node perturbation, text matching, and\nsemantics negation to provide more reference nodes for each text and vice\nversa. Node perturbation adds/drops edges to produce diversified node\nembeddings that can be matched with a text. Text matching retrieves texts with\nsimilar embeddings to match with a node. Semantics negation uses a negative\nprompt to construct a negative text with the opposite semantics, which is\ncontrasted with the original node and text. We evaluate Hound on 5 datasets and\ncompare with 13 state-of-the-art baselines. 
The results show that Hound\nconsistently outperforms all baselines, and its accuracy improvements over the\nbest-performing baseline are usually over 5%.\n","authors":["Yuxiang Wang","Xiao Yan","Shiyu Jin","Quanqing Xu","Chuanhui Yang","Yuanyuan Zhu","Chuang Hu","Bo Du","Jiawei Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.00727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00720v1","updated":"2024-09-01T13:33:41Z","published":"2024-09-01T13:33:41Z","title":"Fair Reciprocal Recommendation in Matching Markets","summary":" Recommender systems play an increasingly crucial role in shaping people's\nopportunities, particularly in online dating platforms. It is essential from\nthe user's perspective to increase the probability of matching with a suitable\npartner while ensuring an appropriate level of fairness in the matching\nopportunities. We investigate reciprocal recommendation in two-sided matching\nmarkets between agents divided into two sides. In our model, a match is\nconsidered successful only when both individuals express interest in each\nother. Additionally, we assume that agents prefer to appear prominently in the\nrecommendation lists presented to those on the other side. We define each\nagent's opportunity to be recommended and introduce its fairness criterion,\nenvy-freeness, from the perspective of fair division theory. The\nrecommendations that approximately maximize the expected number of matches,\nempirically obtained by heuristic algorithms, are likely to result in\nsignificant unfairness of opportunity. Therefore, there can be a trade-off\nbetween maximizing the expected matches and ensuring fairness of opportunity.\nTo address this challenge, we propose a method to find a policy that is close\nto being envy-free by leveraging the Nash social welfare function. Experiments\non synthetic and real-world datasets demonstrate the effectiveness of our\napproach in achieving both relatively high expected matches and fairness for\nopportunities of both sides in reciprocal recommender systems.\n","authors":["Yoji Tomita","Tomohiki Yokoyama"],"pdf_url":"https://arxiv.org/pdf/2409.00720v1.pdf","comment":"Accepted at RecSys2024"},{"id":"http://arxiv.org/abs/2409.00636v1","updated":"2024-09-01T07:01:22Z","published":"2024-09-01T07:01:22Z","title":"A Learnable Agent Collaboration Network Framework for Personalized\n Multimodal AI Search Engine","summary":" Large language models (LLMs) and retrieval-augmented generation (RAG)\ntechniques have revolutionized traditional information access, enabling AI\nagent to search and summarize information on behalf of users during dynamic\ndialogues. Despite their potential, current AI search engines exhibit\nconsiderable room for improvement in several critical areas. These areas\ninclude the support for multimodal information, the delivery of personalized\nresponses, the capability to logically answer complex questions, and the\nfacilitation of more flexible interactions. This paper proposes a novel AI\nSearch Engine framework called the Agent Collaboration Network (ACN). The ACN\nframework consists of multiple specialized agents working collaboratively, each\nwith distinct roles such as Account Manager, Solution Strategist, Information\nManager, and Content Creator. This framework integrates mechanisms for picture\ncontent understanding, user profile tracking, and online evolution, enhancing\nthe AI search engine's response quality, personalization, and interactivity. 
A\nhighlight of the ACN is the introduction of a Reflective Forward Optimization\nmethod (RFO), which supports the online synergistic adjustment among agents.\nThis feature endows the ACN with online learning capabilities, ensuring that\nthe system has strong interactive flexibility and can promptly adapt to user\nfeedback. This learning method may also serve as an optimization approach for\nagent-based systems, potentially influencing other domains of agent\napplications.\n","authors":["Yunxiao Shi","Min Xu","Haimin Zhang","Xing Zi","Qiang Wu"],"pdf_url":"https://arxiv.org/pdf/2409.00636v1.pdf","comment":"ACMMM 2024 MMGR WORKSHOP"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2406.17058v2","updated":"2024-09-01T23:57:03Z","published":"2024-06-24T18:18:58Z","title":"Horseshoe-type Priors for Independent Component Estimation","summary":" Independent Component Estimation (ICE) has many applications in modern day\nmachine learning as a feature engineering extraction method. Horseshoe-type\npriors are used to provide scalable algorithms that enables both point\nestimates via expectation-maximization (EM) and full posterior sampling via\nMarkov Chain Monte Carlo (MCMC) algorithms. Our methodology also applies to\nflow-based methods for nonlinear feature extraction and deep learning. We also\ndiscuss how to implement conditional posteriors and envelope-based methods for\noptimization. Through this hierarchy representation, we unify a number of\nhitherto disparate estimation procedures. We illustrate our methodology and\nalgorithms on a numerical example. Finally, we conclude with directions for\nfuture research.\n","authors":["Jyotishka Datta","Nicholas G. Polson"],"pdf_url":"https://arxiv.org/pdf/2406.17058v2.pdf","comment":"23 pages, 2 figures"},{"id":"http://arxiv.org/abs/2406.03272v2","updated":"2024-09-01T23:01:00Z","published":"2024-06-05T13:50:59Z","title":"Multi-Microphone Speech Emotion Recognition using the Hierarchical\n Token-semantic Audio Transformer Architecture","summary":" The performance of most emotion recognition systems degrades in real-life\nsituations ('in the wild' scenarios) where the audio is contaminated by\nreverberation. Our study explores new methods to alleviate the performance\ndegradation of SER algorithms and develop a more robust system for adverse\nconditions. We propose processing multi-microphone signals to address these\nchallenges and improve emotion classification accuracy. We adopt a\nstate-of-the-art transformer model, the HTS-AT, to handle multi-channel audio\ninputs. We evaluate two strategies: averaging mel-spectrograms across channels\nand summing patch-embedded representations. Our multi-microphone model achieves\nsuperior performance compared to single-channel baselines when tested on\nreal-world reverberant environments.\n","authors":["Ohad Cohen","Gershon Hazan","Sharon Gannot"],"pdf_url":"https://arxiv.org/pdf/2406.03272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08414v2","updated":"2024-09-01T22:58:51Z","published":"2024-06-12T16:58:41Z","title":"Discovering Preference Optimization Algorithms with and for Large\n Language Models","summary":" Offline preference optimization is a key method for enhancing and controlling\nthe quality of Large Language Model (LLM) outputs. Typically, preference\noptimization is approached as an offline supervised learning task using\nmanually-crafted convex loss functions. 
While these methods are based on\ntheoretical insights, they are inherently constrained by human creativity, so\nthe large search space of possible loss functions remains under explored. We\naddress this by performing LLM-driven objective discovery to automatically\ndiscover new state-of-the-art preference optimization algorithms without\n(expert) human intervention. Specifically, we iteratively prompt an LLM to\npropose and implement new preference optimization loss functions based on\npreviously-evaluated performance metrics. This process leads to the discovery\nof previously-unknown and performant preference optimization algorithms. The\nbest performing of these we call Discovered Preference Optimization (DiscoPOP),\na novel algorithm that adaptively blends logistic and exponential losses.\nExperiments demonstrate the state-of-the-art performance of DiscoPOP and its\nsuccessful transfer to held-out tasks.\n","authors":["Chris Lu","Samuel Holt","Claudio Fanconi","Alex J. Chan","Jakob Foerster","Mihaela van der Schaar","Robert Tjarko Lange"],"pdf_url":"https://arxiv.org/pdf/2406.08414v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16767v3","updated":"2024-09-01T19:54:56Z","published":"2024-04-25T17:20:45Z","title":"REBEL: Reinforcement Learning via Regressing Relative Rewards","summary":" While originally developed for continuous control problems, Proximal Policy\nOptimization (PPO) has emerged as the work-horse of a variety of reinforcement\nlearning (RL) applications, including the fine-tuning of generative models.\nUnfortunately, PPO requires multiple heuristics to enable stable convergence\n(e.g. value networks, clipping), and is notorious for its sensitivity to the\nprecise implementation of these components. In response, we take a step back\nand ask what a minimalist RL algorithm for the era of generative models would\nlook like. We propose REBEL, an algorithm that cleanly reduces the problem of\npolicy optimization to regressing the relative reward between two completions\nto a prompt in terms of the policy, enabling strikingly lightweight\nimplementation. In theory, we prove that fundamental RL algorithms like Natural\nPolicy Gradient can be seen as variants of REBEL, which allows us to match the\nstrongest known theoretical guarantees in terms of convergence and sample\ncomplexity in the RL literature. REBEL can also cleanly incorporate offline\ndata and be extended to handle the intransitive preferences we frequently see\nin practice. Empirically, we find that REBEL provides a unified approach to\nlanguage modeling and image generation with stronger or similar performance as\nPPO and DPO, all while being simpler to implement and more computationally\nefficient than PPO. When fine-tuning Llama-3-8B-Instruct, REBEL achieves strong\nperformance in AlpacaEval 2.0, MT-Bench, and Open LLM Leaderboard.\n","authors":["Zhaolin Gao","Jonathan D. Chang","Wenhao Zhan","Owen Oertell","Gokul Swamy","Kianté Brantley","Thorsten Joachims","J. Andrew Bagnell","Jason D. Lee","Wen Sun"],"pdf_url":"https://arxiv.org/pdf/2404.16767v3.pdf","comment":"New experimental results on general chat"},{"id":"http://arxiv.org/abs/2406.06573v2","updated":"2024-09-01T19:38:02Z","published":"2024-06-03T18:15:56Z","title":"MedFuzz: Exploring the Robustness of Large Language Models in Medical\n Question Answering","summary":" Large language models (LLM) have achieved impressive performance on medical\nquestion-answering benchmarks. 
However, high benchmark accuracy does not imply\nthat the performance generalizes to real-world clinical settings. Medical\nquestion-answering benchmarks rely on assumptions consistent with quantifying\nLLM performance but that may not hold in the open world of the clinic. Yet LLMs\nlearn broad knowledge that can help the LLM generalize to practical conditions\nregardless of unrealistic assumptions in celebrated benchmarks. We seek to\nquantify how well LLM medical question-answering benchmark performance\ngeneralizes when benchmark assumptions are violated. Specifically, we present\nan adversarial method that we call MedFuzz (for medical fuzzing). MedFuzz\nattempts to modify benchmark questions in ways aimed at confounding the LLM. We\ndemonstrate the approach by targeting strong assumptions about patient\ncharacteristics presented in the MedQA benchmark. Successful \"attacks\" modify a\nbenchmark item in ways that would be unlikely to fool a medical expert but\nnonetheless \"trick\" the LLM into changing from a correct to an incorrect\nanswer. Further, we present a permutation test technique that can ensure a\nsuccessful attack is statistically significant. We show how to use performance\non a \"MedFuzzed\" benchmark, as well as individual successful attacks. The\nmethods show promise at providing insights into the ability of an LLM to\noperate robustly in more realistic settings.\n","authors":["Robert Osazuwa Ness","Katie Matton","Hayden Helm","Sheng Zhang","Junaid Bajwa","Carey E. Priebe","Eric Horvitz"],"pdf_url":"https://arxiv.org/pdf/2406.06573v2.pdf","comment":"9 pages, 3 figures, 2 algorithms, appendix"},{"id":"http://arxiv.org/abs/2407.18629v2","updated":"2024-09-01T18:49:10Z","published":"2024-07-26T09:40:30Z","title":"CardioLab: Laboratory Values Estimation from Electrocardiogram Features\n -- An Exploratory Study","summary":" Introduction: Laboratory value represents a cornerstone of medical\ndiagnostics, but suffers from slow turnaround times, and high costs and only\nprovides information about a single point in time. The continuous estimation of\nlaboratory values from non-invasive data such as electrocardiogram (ECG) would\ntherefore mark a significant frontier in healthcare monitoring. Despite its\ntransformative potential, this domain remains relatively underexplored within\nthe medical community.\n Methods: In this preliminary study, we used a publicly available dataset\n(MIMIC-IV-ECG) to investigate the feasibility of inferring laboratory values\nfrom ECG features and patient demographics using tree-based models (XGBoost).\nWe define the prediction task as a binary prediction problem of predicting\nwhether the lab value falls into low or high abnormalities. The model\nperformance can then be assessed using AUROC.\n Results: Our findings demonstrate promising results in the estimation of\nlaboratory values related to different organ systems based on a small yet\ncomprehensive set of features. While further research and validation are\nwarranted to fully assess the clinical utility and generalizability of\nECG-based estimation in healthcare monitoring, our findings lay the groundwork\nfor future investigations into approaches to laboratory value estimation using\nECG data. 
Such advancements hold promise for revolutionizing predictive\nhealthcare applications, offering faster, non-invasive, and more affordable\nmeans of patient monitoring.\n","authors":["Juan Miguel Lopez Alcaraz","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2407.18629v2.pdf","comment":"4 pages, (updated dataset features set description version) code\n under https://github.com/AI4HealthUOL/CardioLab"},{"id":"http://arxiv.org/abs/2312.04323v2","updated":"2024-09-01T17:30:48Z","published":"2023-12-07T14:32:32Z","title":"Equivariant Scalar Fields for Molecular Docking with Fast Fourier\n Transforms","summary":" Molecular docking is critical to structure-based virtual screening, yet the\nthroughput of such workflows is limited by the expensive optimization of\nscoring functions involved in most docking algorithms. We explore how machine\nlearning can accelerate this process by learning a scoring function with a\nfunctional form that allows for more rapid optimization. Specifically, we\ndefine the scoring function to be the cross-correlation of multi-channel ligand\nand protein scalar fields parameterized by equivariant graph neural networks,\nenabling rapid optimization over rigid-body degrees of freedom with fast\nFourier transforms. The runtime of our approach can be amortized at several\nlevels of abstraction, and is particularly favorable for virtual screening\nsettings with a common binding pocket. We benchmark our scoring functions on\ntwo simplified docking-related tasks: decoy pose scoring and rigid conformer\ndocking. Our method attains similar but faster performance on crystal\nstructures compared to the widely-used Vina and Gnina scoring functions, and is\nmore robust on computationally predicted structures. Code is available at\nhttps://github.com/bjing2016/scalar-fields.\n","authors":["Bowen Jing","Tommi Jaakkola","Bonnie Berger"],"pdf_url":"https://arxiv.org/pdf/2312.04323v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2404.16196v2","updated":"2024-09-01T16:48:38Z","published":"2024-04-24T20:35:17Z","title":"ApisTox: a new benchmark dataset for the classification of small\n molecules toxicity on honey bees","summary":" The global decline in bee populations poses significant risks to agriculture,\nbiodiversity, and environmental stability. To bridge the gap in existing data,\nwe introduce ApisTox, a comprehensive dataset focusing on the toxicity of\npesticides to honey bees (Apis mellifera). This dataset combines and leverages\ndata from existing sources such as ECOTOX and PPDB, providing an extensive,\nconsistent, and curated collection that surpasses the previous datasets.\nApisTox incorporates a wide array of data, including toxicity levels for\nchemicals, details such as time of their publication in literature, and\nidentifiers linking them to external chemical databases. This dataset may serve\nas an important tool for environmental and agricultural research, but also can\nsupport the development of policies and practices aimed at minimizing harm to\nbee populations. Finally, ApisTox offers a unique resource for benchmarking\nmolecular property prediction methods on agrochemical compounds, facilitating\nadvancements in both environmental science and cheminformatics. 
This makes it a\nvaluable tool for both academic research and practical applications in bee\nconservation.\n","authors":["Jakub Adamczyk","Jakub Poziemski","Paweł Siedlecki"],"pdf_url":"https://arxiv.org/pdf/2404.16196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05965v2","updated":"2024-09-01T15:13:52Z","published":"2024-07-08T14:04:58Z","title":"T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models","summary":" The recent development of Sora leads to a new era in text-to-video (T2V)\ngeneration. Along with this comes the rising concern about its security risks.\nThe generated videos may contain illegal or unethical content, and there is a\nlack of comprehensive quantitative understanding of their safety, posing a\nchallenge to their reliability and practical deployment. Previous evaluations\nprimarily focus on the quality of video generation. While some evaluations of\ntext-to-image models have considered safety, they cover fewer aspects and do\nnot address the unique temporal risk inherent in video generation. To bridge\nthis research gap, we introduce T2VSafetyBench, a new benchmark designed for\nconducting safety-critical assessments of text-to-video models. We define 12\ncritical aspects of video generation safety and construct a malicious prompt\ndataset including real-world prompts, LLM-generated prompts and jailbreak\nattack-based prompts. Based on our evaluation results, we draw several\nimportant findings, including: 1) no single model excels in all aspects, with\ndifferent models showing various strengths; 2) the correlation between GPT-4\nassessments and manual reviews is generally high; 3) there is a trade-off\nbetween the usability and safety of text-to-video generative models. This\nindicates that as the field of video generation rapidly advances, safety risks\nare set to surge, highlighting the urgency of prioritizing video safety. We\nhope that T2VSafetyBench can provide insights for better understanding the\nsafety of video generation in the era of generative AI.\n","authors":["Yibo Miao","Yifan Zhu","Yinpeng Dong","Lijia Yu","Jun Zhu","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2407.05965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07324v2","updated":"2024-09-01T15:02:49Z","published":"2023-11-13T13:24:09Z","title":"Data-Aware Gradient Compression for DML in Communication-Constrained\n Mobile Computing","summary":" Distributed machine learning (DML) in mobile environments faces significant\ncommunication bottlenecks. Gradient compression has proven as an effective\nsolution to this issue, offering substantial benefits in environments with\nlimited bandwidth and metered data. Yet, it encounters severe performance drops\nin non-IID environments due to a one-size-fits-all compression approach, which\ndoes not account for the varying data volumes across workers. Assigning varying\ncompression ratios to workers with distinct data distributions and volumes is\ntherefore a promising solution. This work derives the convergence rate of\ndistributed SGD with non-uniform compression, which reveals the intricate\nrelationship between model convergence and the compression ratios applied to\nindividual workers. Accordingly, we frame the relative compression ratio\nassignment as an $n$-variable chi-squared nonlinear optimization problem,\nconstrained by a limited communication budget. We propose DAGC-R, which assigns\nconservative compression to workers handling larger data volumes. 
Recognizing\nthe computational limitations of mobile devices, we propose the DAGC-A, which\nis computationally less demanding and enhances the robustness of compression in\nnon-IID scenarios. Our experiments confirm that the DAGC-A and DAGC-R can speed\nup the training speed by up to $16.65\\%$ and $25.43\\%$ compared to the uniform\ncompression respectively, when dealing with highly imbalanced data volume\ndistribution and restricted communication.\n","authors":["Rongwei Lu","Yutong Jiang","Yinan Mao","Chen Tang","Bin Chen","Laizhong Cui","Zhi Wang"],"pdf_url":"https://arxiv.org/pdf/2311.07324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12166v4","updated":"2024-09-01T14:43:23Z","published":"2023-11-20T20:32:14Z","title":"Creating Temporally Correlated High-Resolution Profiles of Load\n Injection Using Constrained Generative Adversarial Networks","summary":" Traditional smart meters, which measure energy usage every 15 minutes or more\nand report it at least a few hours later, lack the granularity needed for\nreal-time decision-making. To address this practical problem, we introduce a\nnew method using generative adversarial networks (GAN) that enforces temporal\nconsistency on its high-resolution outputs via hard inequality constraints\nusing convex optimization. A unique feature of our GAN model is that it is\ntrained solely on slow timescale aggregated historical energy data obtained\nfrom smart meters. The results demonstrate that the model can successfully\ncreate minute-by-minute temporally correlated profiles of power usage from\n15-minute interval average power consumption information. This innovative\napproach, emphasizing inter-neuron constraints, offers a promising avenue for\nimproved high-speed state estimation in distribution systems and enhances the\napplicability of data-driven solutions for monitoring and subsequently\ncontrolling such systems.\n","authors":["Hritik Gopal Shah","Behrouz Azimian","Anamitra Pal"],"pdf_url":"https://arxiv.org/pdf/2311.12166v4.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2407.03953v2","updated":"2024-09-01T13:40:23Z","published":"2024-07-04T14:14:09Z","title":"Generalizing Graph Transformers Across Diverse Graphs and Tasks via\n Pre-Training on Industrial-Scale Data","summary":" Graph pre-training has been concentrated on graph-level on small graphs\n(e.g., molecular graphs) or learning node representations on a fixed graph.\nExtending graph pre-trained models to web-scale graphs with billions of nodes\nin industrial scenarios, while avoiding negative transfer across graphs or\ntasks, remains a challenge. We aim to develop a general graph pre-trained model\nwith inductive ability that can make predictions for unseen new nodes and even\nnew graphs. In this work, we introduce a scalable transformer-based graph\npre-training framework called PGT (Pre-trained Graph Transformer).\nSpecifically, we design a flexible and scalable graph transformer as the\nbackbone network. Meanwhile, based on the masked autoencoder architecture, we\ndesign two pre-training tasks: one for reconstructing node features and the\nother one for reconstructing local structures. Unlike the original autoencoder\narchitecture where the pre-trained decoder is discarded, we propose a novel\nstrategy that utilizes the decoder for feature augmentation. We have deployed\nour framework on Tencent's online game data. 
Extensive experiments have\ndemonstrated that our framework can perform pre-training on real-world\nweb-scale graphs with over 540 million nodes and 12 billion edges and\ngeneralizes effectively to unseen new graphs with different downstream tasks.\nWe further conduct experiments on the publicly available ogbn-papers100M\ndataset, which consists of 111 million nodes and 1.6 billion edges. Our\nframework achieves state-of-the-art performance on both industrial datasets and\npublic datasets, while also enjoying scalability and efficiency.\n","authors":["Yufei He","Zhenyu Hou","Yukuo Cen","Feng He","Xu Cheng","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2407.03953v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2405.09771v2","updated":"2024-09-01T13:19:27Z","published":"2024-05-16T02:22:09Z","title":"Harmonizing Generalization and Personalization in Federated Prompt\n Learning","summary":" Federated Prompt Learning (FPL) incorporates large pre-trained\nVision-Language models (VLM) into federated learning through prompt tuning. The\ntransferable representations and remarkable generalization capacity of VLM make\nthem highly compatible with the integration of federated learning. Addressing\ndata heterogeneity in federated learning requires personalization, but\nexcessive focus on it across clients could compromise the model's ability to\ngeneralize effectively. To preserve the impressive generalization capability of\nVLM, it is crucial to strike a balance between personalization and\ngeneralization in FPL. To tackle this challenge, we proposed Federated Prompt\nLearning with CLIP Generalization and low-rank Personalization (FedPGP), which\nemploys pre-trained CLIP to provide knowledge-guidance on the global prompt for\nimproved generalization and incorporates a low-rank adaptation term to\npersonalize the global prompt. Further, FedPGP integrates a prompt-wise\ncontrastive loss to achieve knowledge guidance and personalized adaptation\nsimultaneously, enabling a harmonious balance between personalization and\ngeneralization in FPL. We conduct extensive experiments on various datasets to\nexplore base-to-novel generalization in both category-level and domain-level\nscenarios with heterogeneous data, showing the superiority of FedPGP in\nbalancing generalization and personalization.\n","authors":["Tianyu Cui","Hongxia Li","Jingya Wang","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2405.09771v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04565v2","updated":"2024-09-01T13:12:34Z","published":"2023-09-08T19:34:29Z","title":"A Versatile Graph Learning Approach through LLM-based Agent","summary":" Designing versatile graph learning approaches is important, considering the\ndiverse graphs and tasks existing in real-world applications. Existing methods\nhave attempted to achieve this target through automated machine learning\ntechniques, pre-training and fine-tuning strategies, and large language models.\nHowever, these methods are not versatile enough for graph learning, as they\nwork on either limited types of graphs or a single task. In this paper, we\npropose to explore versatile graph learning approaches with LLM-based agents,\nand the key insight is customizing the graph learning procedures for diverse\ngraphs and tasks. To achieve this, we develop several LLM-based agents,\nequipped with diverse profiles, tools, functions and human experience. 
They\ncollaborate to configure each procedure with task and data-specific settings\nstep by step towards versatile solutions, and the proposed method is dubbed\nGL-Agent. By evaluating on diverse tasks and graphs, the correct results of the\nagent and its comparable performance showcase the versatility of the proposed\nmethod, especially in complex scenarios.The low resource cost and the potential\nto use open-source LLMs highlight the efficiency of GL-Agent.\n","authors":["Lanning Wei","Huan Zhao","Xiaohan Zheng","Zhiqiang He","Quanming Yao"],"pdf_url":"https://arxiv.org/pdf/2309.04565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07687v4","updated":"2024-09-01T13:11:58Z","published":"2023-04-16T03:49:50Z","title":"MLRegTest: A Benchmark for the Machine Learning of Regular Languages","summary":" Synthetic datasets constructed from formal languages allow fine-grained\nexamination of the learning and generalization capabilities of machine learning\nsystems for sequence classification. This article presents a new benchmark for\nmachine learning systems on sequence classification called MLRegTest, which\ncontains training, development, and test sets from 1,800 regular languages.\nDifferent kinds of formal languages represent different kinds of long-distance\ndependencies, and correctly identifying long-distance dependencies in sequences\nis a known challenge for ML systems to generalize successfully. MLRegTest\norganizes its languages according to their logical complexity (monadic second\norder, first order, propositional, or monomial expressions) and the kind of\nlogical literals (string, tier-string, subsequence, or combinations thereof).\nThe logical complexity and choice of literal provides a systematic way to\nunderstand different kinds of long-distance dependencies in regular languages,\nand therefore to understand the capacities of different ML systems to learn\nsuch long-distance dependencies. Finally, the performance of different neural\nnetworks (simple RNN, LSTM, GRU, transformer) on MLRegTest is examined. The\nmain conclusion is that performance depends significantly on the kind of test\nset, the class of language, and the neural network architecture.\n","authors":["Sam van der Poel","Dakotah Lambert","Kalina Kostyszyn","Tiantian Gao","Rahul Verma","Derek Andersen","Joanne Chau","Emily Peterson","Cody St. Clair","Paul Fodor","Chihiro Shibata","Jeffrey Heinz"],"pdf_url":"https://arxiv.org/pdf/2304.07687v4.pdf","comment":"Accepted for publication in the Journal of Machine Learning Research.\n Dataset available at https://doi.org/10.5061/dryad.dncjsxm4h , code available\n at https://github.com/heinz-jeffrey/subregular-learning"},{"id":"http://arxiv.org/abs/2408.16537v2","updated":"2024-09-01T11:27:45Z","published":"2024-08-29T13:52:28Z","title":"SFR-GNN: Simple and Fast Robust GNNs against Structural Attacks","summary":" Graph Neural Networks (GNNs) have demonstrated commendable performance for\ngraph-structured data. Yet, GNNs are often vulnerable to adversarial structural\nattacks as embedding generation relies on graph topology. Existing efforts are\ndedicated to purifying the maliciously modified structure or applying adaptive\naggregation, thereby enhancing the robustness against adversarial structural\nattacks. It is inevitable for a defender to consume heavy computational costs\ndue to lacking prior knowledge about modified structures. 
To this end, we\npropose an efficient defense method, called Simple and Fast Robust Graph Neural\nNetwork (SFR-GNN), supported by mutual information theory. The SFR-GNN first\npre-trains a GNN model using node attributes and then fine-tunes it over the\nmodified graph in the manner of contrastive learning, which is free of\npurifying modified structures and adaptive aggregation, thus achieving great\nefficiency gains. Consequently, SFR-GNN exhibits a 24%--162% speedup compared\nto advanced robust models, demonstrating superior robustness for node\nclassification tasks.\n","authors":["Xing Ai","Guanyu Zhu","Yulin Zhu","Yu Zheng","Gaolei Li","Jianhua Li","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.16537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10769v2","updated":"2024-09-01T11:26:11Z","published":"2024-04-16T17:53:59Z","title":"Finite-dimensional approximations of push-forwards on locally analytic\n functionals","summary":" This paper introduces a novel theoretical framework for investigating\nanalytic maps from finite discrete data. Our approach is to consider the\npush-forward on the space of locally analytic functionals, instead of directly\nhandling the analytic map itself. We establish a methodology enabling\nappropriate finite-dimensional approximation of the push-forward from finite\ndiscrete data, through the theory of the Fourier--Borel transform and the Fock\nspace. Moreover, we prove a rigorous convergence result with a convergence\nrate. As an application, we prove that it is not the least-squares polynomial,\nbut the polynomial obtained by truncating its higher-degree terms, that\napproximates analytic functions and further allows for approximation beyond the\nsupport of the data distribution. One advantage of our theory is that it\nenables us to apply linear algebraic operations to the finite-dimensional\napproximation of the push-forward. Utilizing this, we prove the convergence of\na method for approximating an analytic vector field from finite data of the\nflow map of an ordinary differential equation.\n","authors":["Isao Ishikawa"],"pdf_url":"https://arxiv.org/pdf/2404.10769v2.pdf","comment":"32 pages. 2 figures. We modified resutls. Comments are welcome"},{"id":"http://arxiv.org/abs/2306.11715v2","updated":"2024-09-01T11:15:16Z","published":"2023-06-20T17:43:42Z","title":"Multi-Fidelity Active Learning with GFlowNets","summary":" In the last decades, the capacity to generate large amounts of data in\nscience and engineering applications has been growing steadily. Meanwhile,\nmachine learning has progressed to become a suitable tool to process and\nutilise the available data. Nonetheless, many relevant scientific and\nengineering problems present challenges where current machine learning methods\ncannot yet efficiently leverage the available data and resources. For example,\nin scientific discovery, we are often faced with the problem of exploring very\nlarge, structured and high-dimensional spaces. Moreover, the high fidelity,\nblack-box objective function is often very expensive to evaluate. Progress in\nmachine learning methods that can efficiently tackle such challenges would help\naccelerate currently crucial areas such as drug and materials discovery. In\nthis paper, we propose a multi-fidelity active learning algorithm with\nGFlowNets as a sampler, to efficiently discover diverse, high-scoring\ncandidates where multiple approximations of the black-box function are\navailable at lower fidelity and cost. 
Our evaluation on molecular discovery\ntasks shows that multi-fidelity active learning with GFlowNets can discover\nhigh-scoring candidates at a fraction of the budget of its single-fidelity\ncounterpart while maintaining diversity, unlike RL-based alternatives. These\nresults open new avenues for multi-fidelity active learning to accelerate\nscientific discovery and engineering design.\n","authors":["Alex Hernandez-Garcia","Nikita Saxena","Moksh Jain","Cheng-Hao Liu","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2306.11715v2.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR) 07/2024\n https://openreview.net/forum?id=dLaazW9zuF"},{"id":"http://arxiv.org/abs/2403.16644v2","updated":"2024-09-01T09:57:04Z","published":"2024-03-25T11:29:32Z","title":"Bridging the Sim-to-Real Gap with Bayesian Inference","summary":" We present SIM-FSVGD for learning robot dynamics from data. As opposed to\ntraditional methods, SIM-FSVGD leverages low-fidelity physical priors, e.g., in\nthe form of simulators, to regularize the training of neural network models.\nWhile learning accurate dynamics already in the low data regime, SIM-FSVGD\nscales and excels also when more data is available. We empirically show that\nlearning with implicit physical priors results in accurate mean model\nestimation as well as precise uncertainty quantification. We demonstrate the\neffectiveness of SIM-FSVGD in bridging the sim-to-real gap on a\nhigh-performance RC racecar system. Using model-based RL, we demonstrate a\nhighly dynamic parking maneuver with drifting, using less than half the data\ncompared to the state of the art.\n","authors":["Jonas Rothfuss","Bhavya Sukhija","Lenart Treven","Florian Dörfler","Stelian Coros","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2403.16644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03873v3","updated":"2024-09-01T09:56:44Z","published":"2024-06-06T09:04:48Z","title":"Quantum Implicit Neural Representations","summary":" Implicit neural representations have emerged as a powerful paradigm to\nrepresent signals such as images and sounds. This approach aims to utilize\nneural networks to parameterize the implicit function of the signal. However,\nwhen representing implicit functions, traditional neural networks such as\nReLU-based multilayer perceptrons face challenges in accurately modeling\nhigh-frequency components of signals. Recent research has begun to explore the\nuse of Fourier Neural Networks (FNNs) to overcome this limitation. In this\npaper, we propose Quantum Implicit Representation Network (QIREN), a novel\nquantum generalization of FNNs. Furthermore, through theoretical analysis, we\ndemonstrate that QIREN possesses a quantum advantage over classical FNNs.\nLastly, we conducted experiments in signal representation, image\nsuperresolution, and image generation tasks to show the superior performance of\nQIREN compared to state-of-the-art (SOTA) models. 
Our work not only\nincorporates quantum advantages into implicit neural representations but also\nuncovers a promising application direction for Quantum Neural Networks.\n","authors":["Jiaming Zhao","Wenbo Qiao","Peng Zhang","Hui Gao"],"pdf_url":"https://arxiv.org/pdf/2406.03873v3.pdf","comment":"This paper was accepted by icml 2024"},{"id":"http://arxiv.org/abs/2408.16115v2","updated":"2024-09-01T08:04:33Z","published":"2024-08-28T19:59:58Z","title":"Uncertainty Modeling in Graph Neural Networks via Stochastic\n Differential Equations","summary":" We address the problem of learning uncertainty-aware representations for\ngraph-structured data. While Graph Neural Ordinary Differential Equations\n(GNODE) are effective in learning node representations, they fail to quantify\nuncertainty. To address this, we introduce Latent Graph Neural Stochastic\nDifferential Equations (LGNSDE), which enhance GNODE by embedding randomness\nthrough Brownian motion to quantify uncertainty. We provide theoretical\nguarantees for LGNSDE and empirically show better performance in uncertainty\nquantification.\n","authors":["Richard Bergna","Sergio Calvo-Ordoñez","Felix L. Opolka","Pietro Liò","Jose Miguel Hernandez-Lobato"],"pdf_url":"https://arxiv.org/pdf/2408.16115v2.pdf","comment":"9 pages including appendix"},{"id":"http://arxiv.org/abs/2407.18865v2","updated":"2024-09-01T06:39:14Z","published":"2024-07-26T16:52:30Z","title":"Downlink CCM Estimation via Representation Learning with Graph\n Regularization","summary":" In this paper, we propose an algorithm for downlink (DL) channel covariance\nmatrix (CCM) estimation for frequency division duplexing (FDD) massive\nmultiple-input multiple-output (MIMO) communication systems with base station\n(BS) possessing a uniform linear array (ULA) antenna structure. We consider a\nsetting where the UL CCM is mapped to DL CCM by a mapping function. We first\npresent a theoretical error analysis of learning a nonlinear embedding by\nconstructing a mapping function, which points to the importance of the\nLipschitz regularity of the mapping function for achieving high estimation\nperformance. Then, based on the theoretical ground, we propose a representation\nlearning algorithm as a solution for the estimation problem, where Gaussian RBF\nkernel interpolators are chosen to map UL CCMs to their DL counterparts. The\nproposed algorithm is based on the optimization of an objective function that\nfits a regression model between the DL CCM and UL CCM samples in the training\ndataset and preserves the local geometric structure of the data in the UL CCM\nspace, while explicitly regulating the Lipschitz continuity of the mapping\nfunction in light of our theoretical findings. The proposed algorithm surpasses\nbenchmark methods in terms of three error metrics as shown by simulations.\n","authors":["Melih Can Zerin","Elif Vural","Ali Özgür Yılmaz"],"pdf_url":"https://arxiv.org/pdf/2407.18865v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02168v4","updated":"2024-09-01T05:21:46Z","published":"2023-10-03T16:02:36Z","title":"Editing Personality for Large Language Models","summary":" This paper introduces an innovative task focused on editing the personality\ntraits of Large Language Models (LLMs). This task seeks to adjust the models'\nresponses to opinion-related questions on specified topics since an\nindividual's personality often manifests in the form of their expressed\nopinions, thereby showcasing different personality traits. 
Specifically, we\nconstruct PersonalityEdit, a new benchmark dataset to address this task.\nDrawing on the theory in Social Psychology, we isolate three representative\ntraits, namely Neuroticism, Extraversion, and Agreeableness, as the foundation\nfor our benchmark. We then gather data using GPT-4, generating responses that\nalign with a specified topic and embody the targeted personality trait. We\nconduct comprehensive experiments involving various baselines and discuss the\nrepresentation of personality behavior in LLMs. Our findings uncover potential\nchallenges of the proposed task, illustrating several remaining issues. We\nanticipate that our work can stimulate further annotation in model editing and\npersonality-related research. Code is available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Shengyu Mao","Xiaohan Wang","Mengru Wang","Yong Jiang","Pengjun Xie","Fei Huang","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.02168v4.pdf","comment":"NLPCC 2024"},{"id":"http://arxiv.org/abs/2211.08082v2","updated":"2024-09-01T05:03:16Z","published":"2022-11-15T12:05:03Z","title":"UniHPF : Universal Healthcare Predictive Framework with Zero Domain\n Knowledge","summary":" Despite the abundance of Electronic Healthcare Records (EHR), its\nheterogeneity restricts the utilization of medical data in building predictive\nmodels. To address this challenge, we propose Universal Healthcare Predictive\nFramework (UniHPF), which requires no medical domain knowledge and minimal\npre-processing for multiple prediction tasks. Experimental results demonstrate\nthat UniHPF is capable of building large-scale EHR models that can process any\nform of medical data from distinct EHR systems. We believe that our findings\ncan provide helpful insights for further research on the multi-source learning\nof EHRs.\n","authors":["Kyunghoon Hur","Jungwoo Oh","Junu Kim","Jiyoun Kim","Min Jae Lee","Eunbyeol Cho","Seong-Eun Moon","Young-Hak Kim","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2211.08082v2.pdf","comment":"The original paper is published on Journal of Biomedical and Health\n Informatics(JBHI) 2023, https://ieeexplore.ieee.org/document/10298642.\n Extended Abstract presented at Machine Learning for Health (ML4H) symposium\n 2022, November 28th, 2022, New Orleans, United States, 19 pages(main paper 6\n pages). arXiv admin note: substantial text overlap with arXiv:2207.09858"},{"id":"http://arxiv.org/abs/2402.12189v2","updated":"2024-09-01T03:02:36Z","published":"2024-02-19T14:52:50Z","title":"Amplifying Training Data Exposure through Fine-Tuning with\n Pseudo-Labeled Memberships","summary":" Neural language models (LMs) are vulnerable to training data extraction\nattacks due to data memorization. This paper introduces a novel attack scenario\nwherein an attacker adversarially fine-tunes pre-trained LMs to amplify the\nexposure of the original training data. This strategy differs from prior\nstudies by aiming to intensify the LM's retention of its pre-training dataset.\nTo achieve this, the attacker needs to collect generated texts that are closely\naligned with the pre-training data. However, without knowledge of the actual\ndataset, quantifying the amount of pre-training data within generated texts is\nchallenging. To address this, we propose the use of pseudo-labels for these\ngenerated texts, leveraging membership approximations indicated by\nmachine-generated probabilities from the target LM. 
We subsequently fine-tune\nthe LM to favor generations with higher likelihoods of originating from the\npre-training data, based on their membership probabilities. Our empirical\nfindings indicate a remarkable outcome: LMs with over 1B parameters exhibit a\nfour to eight-fold increase in training data exposure. We discuss potential\nmitigations and suggest future research directions.\n","authors":["Myung Gyo Oh","Hong Eun Ahn","Leo Hyun Park","Taekyoung Kwon"],"pdf_url":"https://arxiv.org/pdf/2402.12189v2.pdf","comment":"20 pages, 6 figures, 15 tables"},{"id":"http://arxiv.org/abs/2404.00204v4","updated":"2024-09-01T01:37:04Z","published":"2024-03-30T00:46:43Z","title":"AirPilot: Interpretable PPO-based DRL Auto-Tuned Nonlinear PID Drone\n Controller for Robust Autonomous Flights","summary":" Navigation precision, speed and stability are crucial for safe Unmanned\nAerial Vehicle (UAV) flight maneuvers and effective flight mission executions\nin dynamic environments. Different flight missions may have varying objectives,\nsuch as minimizing energy consumption, achieving precise positioning, or\nmaximizing speed. A controller that can adapt to different objectives on the\nfly is highly valuable. Proportional Integral Derivative (PID) controllers are\none of the most popular and widely used control algorithms for drones and other\ncontrol systems, but their linear control algorithm fails to capture the\nnonlinear nature of the dynamic wind conditions and complex drone system.\nManually tuning the PID gains for various missions can be time-consuming and\nrequires significant expertise. This paper aims to revolutionize drone flight\ncontrol by presenting the AirPilot, a nonlinear Deep Reinforcement Learning\n(DRL) - enhanced Proportional Integral Derivative (PID) drone controller using\nProximal Policy Optimization (PPO). AirPilot controller combines the simplicity\nand effectiveness of traditional PID control with the adaptability, learning\ncapability, and optimization potential of DRL. This makes it better suited for\nmodern drone applications where the environment is dynamic, and\nmission-specific performance demands are high. We employed a COEX Clover\nautonomous drone for training the DRL agent within the simulator and\nimplemented it in a real-world lab setting, which marks a significant milestone\nas one of the first attempts to apply a DRL-based flight controller on an\nactual drone. Airpilot is capable of reducing the navigation error of the\ndefault PX4 PID position controller by 90%, improving effective navigation\nspeed of a fine-tuned PID controller by 21%, reducing settling time and\novershoot by 17% and 16% respectively.\n","authors":["Junyang Zhang","Cristian Emanuel Ocampo Rivera","Kyle Tyni","Steven Nguyen","Ulices Santa Cruz Leal","Yasser Shoukry"],"pdf_url":"https://arxiv.org/pdf/2404.00204v4.pdf","comment":"9 pages, 20 figures"},{"id":"http://arxiv.org/abs/2408.06292v3","updated":"2024-09-01T00:41:18Z","published":"2024-08-12T16:58:11Z","title":"The AI Scientist: Towards Fully Automated Open-Ended Scientific\n Discovery","summary":" One of the grand challenges of artificial general intelligence is developing\nagents capable of conducting scientific research and discovering new knowledge.\nWhile frontier models have already been used as aides to human scientists, e.g.\nfor brainstorming ideas, writing code, or prediction tasks, they still conduct\nonly a small part of the scientific process. 
This paper presents the first\ncomprehensive framework for fully automatic scientific discovery, enabling\nfrontier large language models to perform research independently and\ncommunicate their findings. We introduce The AI Scientist, which generates\nnovel research ideas, writes code, executes experiments, visualizes results,\ndescribes its findings by writing a full scientific paper, and then runs a\nsimulated review process for evaluation. In principle, this process can be\nrepeated to iteratively develop ideas in an open-ended fashion, acting like the\nhuman scientific community. We demonstrate its versatility by applying it to\nthree distinct subfields of machine learning: diffusion modeling,\ntransformer-based language modeling, and learning dynamics. Each idea is\nimplemented and developed into a full paper at a cost of less than $15 per\npaper. To evaluate the generated papers, we design and validate an automated\nreviewer, which we show achieves near-human performance in evaluating paper\nscores. The AI Scientist can produce papers that exceed the acceptance\nthreshold at a top machine learning conference as judged by our automated\nreviewer. This approach signifies the beginning of a new era in scientific\ndiscovery in machine learning: bringing the transformative benefits of AI\nagents to the entire research process of AI itself, and taking us closer to a\nworld where endless affordable creativity and innovation can be unleashed on\nthe world's most challenging problems. Our code is open-sourced at\nhttps://github.com/SakanaAI/AI-Scientist\n","authors":["Chris Lu","Cong Lu","Robert Tjarko Lange","Jakob Foerster","Jeff Clune","David Ha"],"pdf_url":"https://arxiv.org/pdf/2408.06292v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09105v6","updated":"2024-09-01T00:26:46Z","published":"2024-07-12T09:10:37Z","title":"Enhancing Training Efficiency Using Packing with Flash Attention","summary":" Padding is often used in tuning LLM models by adding special tokens to\nshorter training examples to match the length of the longest sequence in each\nbatch. While this ensures uniformity for batch processing, it introduces\ninefficiencies by including irrelevant padding tokens in the computation and\nwastes GPU resources. Hugging Face SFT trainer has always offered the option to\nuse packing to combine multiple training examples, allowing for maximal\nutilization of GPU resources. However, up till now, it did not offer proper\nmasking of each packed training example. This capability has been added to\nHugging Face Transformers 4.44. We analyse this new feature and show the\nbenefits across different variations of packing.\n","authors":["Achintya Kundu","Rhui Dih Lee","Laura Wynter","Raghu Kiran Ganti","Mayank Mishra"],"pdf_url":"https://arxiv.org/pdf/2407.09105v6.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.10846v2","updated":"2024-09-01T14:57:12Z","published":"2024-08-19T12:06:25Z","title":"Harmonizing Attention: Training-free Texture-aware Geometry Transfer","summary":" Extracting geometry features from photographic images independently of\nsurface texture and transferring them onto different materials remains a\ncomplex challenge. In this study, we introduce Harmonizing Attention, a novel\ntraining-free approach that leverages diffusion models for texture-aware\ngeometry transfer. Our method employs a simple yet effective modification of\nself-attention layers, allowing the model to query information from multiple\nreference images within these layers. 
This mechanism is seamlessly integrated\ninto the inversion process as Texture-aligning Attention and into the\ngeneration process as Geometry-aligning Attention. This dual-attention approach\nensures the effective capture and transfer of material-independent geometry\nfeatures while maintaining material-specific textural continuity, all without\nthe need for model fine-tuning.\n","authors":["Eito Ikuta","Yohan Lee","Akihiro Iohara","Yu Saito","Toshiyuki Tanaka"],"pdf_url":"https://arxiv.org/pdf/2408.10846v2.pdf","comment":"Accepted at WACV2025"},{"id":"http://arxiv.org/abs/2409.00615v1","updated":"2024-09-01T04:57:42Z","published":"2024-09-01T04:57:42Z","title":"MetaDigiHuman: Haptic Interfaces for Digital Humans in Metaverse","summary":" The way we engage with digital spaces and the digital world has undergone\nrapid changes in recent years, largely due to the emergence of the Metaverse.\nAs technology continues to advance, the demand for sophisticated and immersive\ninterfaces to interact with the Metaverse has become increasingly crucial.\nHaptic interfaces have been developed to meet this need and provide users with\ntactile feedback and realistic touch sensations. These interfaces play a vital\nrole in creating a more authentic and immersive experience within the\nMetaverse. This article introduces the concept of MetaDigiHuman, a\ngroundbreaking framework that combines blended digital humans and haptic\ninterfaces. By harnessing cutting-edge technologies, MetaDigiHuman enables\nseamless and immersive interaction within the Metaverse. Through this\nframework, users can simulate the sensation of touching, feeling, and\ninteracting with digital beings as if they were physically present in the\nenvironments, offering a more compelling and immersive experience within the\nMetaverse.\n","authors":["Senthil Kumar Jagatheesaperumal","Praveen Sathikumar","Harikrishnan Rajan"],"pdf_url":"https://arxiv.org/pdf/2409.00615v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.00597v1","updated":"2024-09-01T03:16:30Z","published":"2024-09-01T03:16:30Z","title":"Multimodal Multi-turn Conversation Stance Detection: A Challenge Dataset\n and Effective Model","summary":" Stance detection, which aims to identify public opinion towards specific\ntargets using social media data, is an important yet challenging task. With the\nproliferation of diverse multimodal social media content including text, and\nimages multimodal stance detection (MSD) has become a crucial research area.\nHowever, existing MSD studies have focused on modeling stance within individual\ntext-image pairs, overlooking the multi-party conversational contexts that\nnaturally occur on social media. This limitation stems from a lack of datasets\nthat authentically capture such conversational scenarios, hindering progress in\nconversational MSD. To address this, we introduce a new multimodal multi-turn\nconversational stance detection dataset (called MmMtCSD). To derive stances\nfrom this challenging dataset, we propose a novel multimodal large language\nmodel stance detection framework (MLLM-SD), that learns joint stance\nrepresentations from textual and visual modalities. Experiments on MmMtCSD show\nstate-of-the-art performance of our proposed MLLM-SD approach for multimodal\nstance detection. 
We believe that MmMtCSD will contribute to advancing\nreal-world applications of stance detection research.\n","authors":["Fuqiang Niu","Zebang Cheng","Xianghua Fu","Xiaojiang Peng","Genan Dai","Yin Chen","Hu Huang","Bowen Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.00597v1.pdf","comment":"ACM MM2024"}]},"2024-08-31T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.08396v4","updated":"2024-08-31T23:51:14Z","published":"2024-01-16T14:41:20Z","title":"Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in\n medicine","summary":" Recent studies indicate that Generative Pre-trained Transformer 4 with Vision\n(GPT-4V) outperforms human physicians in medical challenge tasks. However,\nthese evaluations primarily focused on the accuracy of multi-choice questions\nalone. Our study extends the current scope by conducting a comprehensive\nanalysis of GPT-4V's rationales of image comprehension, recall of medical\nknowledge, and step-by-step multimodal reasoning when solving New England\nJournal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test\nthe knowledge and diagnostic capabilities of medical professionals. Evaluation\nresults confirmed that GPT-4V performs comparatively to human physicians\nregarding multi-choice accuracy (81.6% vs. 77.8%). GPT-4V also performs well in\ncases where physicians incorrectly answer, with over 78% accuracy. However, we\ndiscovered that GPT-4V frequently presents flawed rationales in cases where it\nmakes the correct final choices (35.5%), most prominent in image comprehension\n(27.2%). Regardless of GPT-4V's high accuracy in multi-choice questions, our\nfindings emphasize the necessity for further in-depth evaluations of its\nrationales before integrating such multimodal AI models into clinical\nworkflows.\n","authors":["Qiao Jin","Fangyuan Chen","Yiliang Zhou","Ziyang Xu","Justin M. Cheung","Robert Chen","Ronald M. Summers","Justin F. Rousseau","Peiyun Ni","Marc J Landsman","Sally L. Baxter","Subhi J. Al'Aref","Yijia Li","Alex Chen","Josef A. Brejt","Michael F. Chiang","Yifan Peng","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.08396v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03509v3","updated":"2024-08-31T20:44:41Z","published":"2023-05-04T16:14:43Z","title":"Diffusion Explainer: Visual Explanation for Text-to-image Stable\n Diffusion","summary":" Diffusion-based generative models' impressive ability to create convincing\nimages has garnered global attention. However, their complex structures and\noperations often pose challenges for non-experts to grasp. We present Diffusion\nExplainer, the first interactive visualization tool that explains how Stable\nDiffusion transforms text prompts into images. Diffusion Explainer tightly\nintegrates a visual overview of Stable Diffusion's complex structure with\nexplanations of the underlying operations. By comparing image generation of\nprompt variants, users can discover the impact of keyword changes on image\ngeneration. A 56-participant user study demonstrates that Diffusion Explainer\noffers substantial learning benefits to non-experts. Our tool has been used by\nover 10,300 users from 124 countries at\nhttps://poloclub.github.io/diffusion-explainer/.\n","authors":["Seongmin Lee","Benjamin Hoover","Hendrik Strobelt","Zijie J. 
Wang","ShengYun Peng","Austin Wright","Kevin Li","Haekyu Park","Haoyang Yang","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2305.03509v3.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.17493v5","updated":"2024-08-31T19:42:31Z","published":"2024-02-27T13:18:00Z","title":"The Foundational Capabilities of Large Language Models in Predicting\n Postoperative Risks Using Clinical Notes","summary":" Clinical notes recorded during a patient's perioperative journey holds\nimmense informational value. Advances in large language models (LLMs) offer\nopportunities for bridging this gap. Using 84,875 pre-operative notes and its\nassociated surgical cases from 2018 to 2021, we examine the performance of LLMs\nin predicting six postoperative risks using various fine-tuning strategies.\nPretrained LLMs outperformed traditional word embeddings by an absolute AUROC\nof 38.3% and AUPRC of 33.2%. Self-supervised fine-tuning further improved\nperformance by 3.2% and 1.5%. Incorporating labels into training further\nincreased AUROC by 1.8% and AUPRC by 2%. The highest performance was achieved\nwith a unified foundation model, with improvements of 3.6% for AUROC and 2.6%\nfor AUPRC compared to self-supervision, highlighting the foundational\ncapabilities of LLMs in predicting postoperative risks, which could be\npotentially beneficial when deployed for perioperative care\n","authors":["Charles Alba","Bing Xue","Joanna Abraham","Thomas Kannampallil","Chenyang Lu"],"pdf_url":"https://arxiv.org/pdf/2402.17493v5.pdf","comment":"Codes are publicly available at:\n https://github.com/cja5553/LLMs_in_perioperative_care"},{"id":"http://arxiv.org/abs/2403.14734v3","updated":"2024-08-31T17:46:10Z","published":"2024-03-21T08:54:56Z","title":"A Survey of Neural Code Intelligence: Paradigms, Advances and Beyond","summary":" Neural Code Intelligence -- leveraging deep learning to understand, generate,\nand optimize code -- holds immense potential for transformative impacts on the\nwhole society. Bridging the gap between Natural Language and Programming\nLanguage, this domain has drawn significant attention from researchers in both\nresearch communities over the past few years. This survey presents a systematic\nand chronological review of the advancements in code intelligence, encompassing\nover 50 representative models and their variants, more than 20 categories of\ntasks, and an extensive coverage of over 680 related works. We follow the\nhistorical progression to trace the paradigm shifts across different research\nphases (e.g., from modeling code with recurrent neural networks to the era of\nLarge Language Models). Concurrently, we highlight the major technical\ntransitions in models, tasks, and evaluations spanning through different\nstages. For applications, we also observe a co-evolving shift. It spans from\ninitial endeavors to tackling specific scenarios, through exploring a diverse\narray of tasks during its rapid expansion, to currently focusing on tackling\nincreasingly complex and varied real-world challenges. Building on our\nexamination of the developmental trajectories, we further investigate the\nemerging synergies between code intelligence and broader machine intelligence,\nuncovering new cross-domain opportunities and illustrating the substantial\ninfluence of code intelligence across various domains. Finally, we delve into\nboth the opportunities and challenges associated with this field, alongside\nelucidating our insights on the most promising research directions. 
An ongoing,\ndynamically updated project and resources associated with this survey have been\nreleased at https://github.com/QiushiSun/NCISurvey.\n","authors":["Qiushi Sun","Zhirui Chen","Fangzhi Xu","Kanzhi Cheng","Chang Ma","Zhangyue Yin","Jianing Wang","Chengcheng Han","Renyu Zhu","Shuai Yuan","Qipeng Guo","Xipeng Qiu","Pengcheng Yin","Xiaoli Li","Fei Yuan","Lingpeng Kong","Xiang Li","Zhiyong Wu"],"pdf_url":"https://arxiv.org/pdf/2403.14734v3.pdf","comment":"64 pages, 6 figures, 10 tables, 695 references"},{"id":"http://arxiv.org/abs/2408.11962v2","updated":"2024-08-31T17:34:43Z","published":"2024-08-21T19:31:01Z","title":"Characterizing Online Toxicity During the 2022 Mpox Outbreak: A\n Computational Analysis of Topical and Network Dynamics","summary":" Background: Online toxicity, encompassing behaviors such as harassment,\nbullying, hate speech, and the dissemination of misinformation, has become a\npressing social concern in the digital age. The 2022 Mpox outbreak, initially\ntermed \"Monkeypox\" but subsequently renamed to mitigate associated stigmas and\nsocietal concerns, serves as a poignant backdrop to this issue. Objective: In\nthis research, we undertake a comprehensive analysis of the toxic online\ndiscourse surrounding the 2022 Mpox outbreak. Our objective is to dissect its\norigins, characterize its nature and content, trace its dissemination patterns,\nand assess its broader societal implications, with the goal of providing\ninsights that can inform strategies to mitigate such toxicity in future crises.\nMethods: We collected more than 1.6 million unique tweets and analyzed them\nfrom five dimensions, including context, extent, content, speaker, and intent.\nUtilizing BERT-based topic modeling and social network community clustering, we\ndelineated the toxic dynamics on Twitter. Results: We identified five\nhigh-level topic categories in the toxic online discourse on Twitter, including\ndisease (46.6%), health policy and healthcare (19.3%), homophobia (23.9%),\npolitics (6.0%), and racism (4.1%). Through the toxicity diffusion networks of\nmentions, retweets, and the top users, we found that retweets of toxic content\nwere widespread, while influential users rarely engaged with or countered this\ntoxicity through retweets. Conclusions: By tracking topical dynamics, we can\ntrack the changing popularity of toxic content online, providing a better\nunderstanding of societal challenges. Network dynamics spotlight key social\nmedia influencers and their intents, indicating that addressing these central\nfigures in toxic discourse can enhance crisis communication and inform\npolicy-making.\n","authors":["Lizhou Fan","Lingyao Li","Libby Hemphill"],"pdf_url":"https://arxiv.org/pdf/2408.11962v2.pdf","comment":"36 pages, 8 figure, and 12 tables"},{"id":"http://arxiv.org/abs/2404.12535v2","updated":"2024-08-31T17:18:29Z","published":"2024-04-18T22:56:57Z","title":"Is There No Such Thing as a Bad Question? H4R: HalluciBot For\n Ratiocination, Rewriting, Ranking, and Routing","summary":" Hallucination continues to be one of the most critical challenges in the\ninstitutional adoption journey of Large Language Models (LLMs). While prior\nstudies have primarily focused on the post-generation analysis and refinement\nof outputs, this paper centers on the effectiveness of queries in eliciting\naccurate responses from LLMs. We present HalluciBot, a model that estimates the\nquery's propensity to hallucinate before generation, without invoking any LLMs\nduring inference. 
HalluciBot can serve as a proxy reward model for query\nrewriting, offering a general framework to estimate query quality based on\naccuracy and consensus. In essence, HalluciBot investigates how poorly\nconstructed queries can lead to erroneous outputs - moreover, by employing\nquery rewriting guided by HalluciBot's empirical estimates, we demonstrate that\n95.7% output accuracy can be achieved for Multiple Choice questions. The\ntraining procedure for HalluciBot consists of perturbing 369,837 queries n\ntimes, employing n+1 independent LLM agents, sampling an output from each\nquery, conducting a Multi-Agent Monte Carlo simulation on the sampled outputs,\nand training an encoder classifier. The idea of perturbation is the outcome of\nour ablation studies that measures the increase in output diversity (+12.5\nagreement spread) by perturbing a query in lexically different but semantically\nsimilar ways. Therefore, HalluciBot paves the way to ratiocinate (76.0% test F1\nscore, 46.6% in saved computation on hallucinatory queries), rewrite (+30.2%\npositive class transition from hallucinatory to non-hallucinatory), rank\n(+50.6% positive class transition from hallucinatory to non-hallucinatory), and\nroute queries to effective pipelines.\n","authors":["William Watson","Nicole Cho","Nishan Srishankar"],"pdf_url":"https://arxiv.org/pdf/2404.12535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08800v3","updated":"2024-08-31T16:56:50Z","published":"2023-12-14T10:35:13Z","title":"Evaluating Large Language Models for Health-related Queries with\n Presuppositions","summary":" As corporations rush to integrate large language models (LLMs) to their\nsearch offerings, it is critical that they provide factually accurate\ninformation that is robust to any presuppositions that a user may express. In\nthis work, we introduce UPHILL, a dataset consisting of health-related queries\nwith varying degrees of presuppositions. Using UPHILL, we evaluate the factual\naccuracy and consistency of InstructGPT, ChatGPT, and BingChat models. We find\nthat while model responses rarely disagree with true health claims (posed as\nquestions), they often fail to challenge false claims: responses from\nInstructGPT agree with 32% of the false claims, ChatGPT 26% and BingChat 23%.\nAs we increase the extent of presupposition in input queries, the responses\nfrom InstructGPT and ChatGPT agree with the claim considerably more often,\nregardless of its veracity. Responses from BingChat, which rely on retrieved\nwebpages, are not as susceptible. Given the moderate factual accuracy, and the\ninability of models to consistently correct false assumptions, our work calls\nfor a careful assessment of current LLMs for use in high-stakes scenarios.\n","authors":["Navreet Kaur","Monojit Choudhury","Danish Pruthi"],"pdf_url":"https://arxiv.org/pdf/2312.08800v3.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.00048v2","updated":"2024-08-31T16:36:58Z","published":"2024-05-28T17:01:22Z","title":"Towards a theory of how the structure of language is acquired by deep\n neural networks","summary":" How much data is required to learn the structure of a language via next-token\nprediction? We study this question for synthetic datasets generated via a\nProbabilistic Context-Free Grammar (PCFG) -- a tree-like generative model that\ncaptures many of the hierarchical structures found in natural languages. 
We\ndetermine token-token correlations analytically in our model and show that they\ncan be used to build a representation of the grammar's hidden variables, the\nlonger the range the deeper the variable. In addition, a finite training set\nlimits the resolution of correlations to an effective range, whose size grows\nwith that of the training set. As a result, a Language Model trained with\nincreasingly many examples can build a deeper representation of the grammar's\nstructure, thus reaching good performance despite the high dimensionality of\nthe problem. We conjecture that the relationship between training set size and\neffective range of correlations holds beyond our synthetic datasets. In\nparticular, our conjecture predicts how the scaling law for the test loss\nbehaviour with training set size depends on the length of the context window,\nwhich we confirm empirically in Shakespeare's plays and Wikipedia articles.\n","authors":["Francesco Cagnetta","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2406.00048v2.pdf","comment":"9 pages, 4 figures (main)"},{"id":"http://arxiv.org/abs/2312.16778v2","updated":"2024-08-31T12:49:43Z","published":"2023-12-28T01:57:26Z","title":"Adversarial Representation with Intra-Modal and Inter-Modal Graph\n Contrastive Learning for Multimodal Emotion Recognition","summary":" With the release of increasing open-source emotion recognition datasets on\nsocial media platforms and the rapid development of computing resources,\nmultimodal emotion recognition tasks (MER) have begun to receive widespread\nresearch attention. The MER task extracts and fuses complementary semantic\ninformation from different modalities, which can classify the speaker's\nemotions. However, the existing feature fusion methods have usually mapped the\nfeatures of different modalities into the same feature space for information\nfusion, which can not eliminate the heterogeneity between different modalities.\nTherefore, it is challenging to make the subsequent emotion class boundary\nlearning. To tackle the above problems, we have proposed a novel Adversarial\nRepresentation with Intra-Modal and Inter-Modal Graph Contrastive for\nMultimodal Emotion Recognition (AR-IIGCN) method. Firstly, we input video,\naudio, and text features into a multi-layer perceptron (MLP) to map them into\nseparate feature spaces. Secondly, we build a generator and a discriminator for\nthe three modal features through adversarial representation, which can achieve\ninformation interaction between modalities and eliminate heterogeneity among\nmodalities. Thirdly, we introduce contrastive graph representation learning to\ncapture intra-modal and inter-modal complementary semantic information and\nlearn intra-class and inter-class boundary information of emotion categories.\nSpecifically, we construct a graph structure for three modal features and\nperform contrastive representation learning on nodes with different emotions in\nthe same modality and the same emotion in different modalities, which can\nimprove the feature representation ability of nodes. 
Extensive experimental\nworks show that the ARL-IIGCN method can significantly improve emotion\nrecognition accuracy on IEMOCAP and MELD datasets.\n","authors":["Yuntao Shou","Tao Meng","Wei Ai","Nan Yin","Keqin Li"],"pdf_url":"https://arxiv.org/pdf/2312.16778v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.00119v2","updated":"2024-08-31T12:44:38Z","published":"2024-06-27T15:54:12Z","title":"Efficient Long-distance Latent Relation-aware Graph Neural Network for\n Multi-modal Emotion Recognition in Conversations","summary":" The task of multi-modal emotion recognition in conversation (MERC) aims to\nanalyze the genuine emotional state of each utterance based on the multi-modal\ninformation in the conversation, which is crucial for conversation\nunderstanding. Existing methods focus on using graph neural networks (GNN) to\nmodel conversational relationships and capture contextual latent semantic\nrelationships. However, due to the complexity of GNN, existing methods cannot\nefficiently capture the potential dependencies between long-distance\nutterances, which limits the performance of MERC. In this paper, we propose an\nEfficient Long-distance Latent Relation-aware Graph Neural Network (ELR-GNN)\nfor multi-modal emotion recognition in conversations. Specifically, we first\nuse pre-extracted text, video and audio features as input to Bi-LSTM to capture\ncontextual semantic information and obtain low-level utterance features. Then,\nwe use low-level utterance features to construct a conversational emotion\ninteraction graph. To efficiently capture the potential dependencies between\nlong-distance utterances, we use the dilated generalized forward push algorithm\nto precompute the emotional propagation between global utterances and design an\nemotional relation-aware operator to capture the potential semantic\nassociations between different utterances. Furthermore, we combine early fusion\nand adaptive late fusion mechanisms to fuse latent dependency information\nbetween speaker relationship information and context. Finally, we obtain\nhigh-level discourse features and feed them into MLP for emotion prediction.\nExtensive experimental results show that ELR-GNN achieves state-of-the-art\nperformance on the benchmark datasets IEMOCAP and MELD, with running times\nreduced by 52\\% and 35\\%, respectively.\n","authors":["Yuntao Shou","Wei Ai","Jiayi Du","Tao Meng","Haiyan Liu","Nan Yin"],"pdf_url":"https://arxiv.org/pdf/2407.00119v2.pdf","comment":"11 pages, 3 tables"},{"id":"http://arxiv.org/abs/2312.10579v2","updated":"2024-08-31T12:41:30Z","published":"2023-12-17T01:49:40Z","title":"DER-GCN: Dialogue and Event Relation-Aware Graph Convolutional Neural\n Network for Multimodal Dialogue Emotion Recognition","summary":" With the continuous development of deep learning (DL), the task of multimodal\ndialogue emotion recognition (MDER) has recently received extensive research\nattention, which is also an essential branch of DL. The MDER aims to identify\nthe emotional information contained in different modalities, e.g., text, video,\nand audio, in different dialogue scenes. However, existing research has focused\non modeling contextual semantic information and dialogue relations between\nspeakers while ignoring the impact of event relations on emotion. To tackle the\nabove issues, we propose a novel Dialogue and Event Relation-Aware Graph\nConvolutional Neural Network for Multimodal Emotion Recognition (DER-GCN)\nmethod. 
It models dialogue relations between speakers and captures latent event\nrelations information. Specifically, we construct a weighted multi-relationship\ngraph to simultaneously capture the dependencies between speakers and event\nrelations in a dialogue. Moreover, we also introduce a Self-Supervised Masked\nGraph Autoencoder (SMGAE) to improve the fusion representation ability of\nfeatures and structures. Next, we design a new Multiple Information Transformer\n(MIT) to capture the correlation between different relations, which can provide\na better fuse of the multivariate information between relations. Finally, we\npropose a loss optimization strategy based on contrastive learning to enhance\nthe representation learning ability of minority class features. We conduct\nextensive experiments on the IEMOCAP and MELD benchmark datasets, which verify\nthe effectiveness of the DER-GCN model. The results demonstrate that our model\nsignificantly improves both the average accuracy and the f1 value of emotion\nrecognition.\n","authors":["Wei Ai","Yuntao Shou","Tao Meng","Nan Yin","Keqin Li"],"pdf_url":"https://arxiv.org/pdf/2312.10579v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2402.19465v2","updated":"2024-08-31T11:31:02Z","published":"2024-02-29T18:55:06Z","title":"Towards Tracing Trustworthiness Dynamics: Revisiting Pre-training Period\n of Large Language Models","summary":" Ensuring the trustworthiness of large language models (LLMs) is crucial. Most\nstudies concentrate on fully pre-trained LLMs to better understand and improve\nLLMs' trustworthiness. In this paper, to reveal the untapped potential of\npre-training, we pioneer the exploration of LLMs' trustworthiness during this\nperiod, focusing on five key dimensions: reliability, privacy, toxicity,\nfairness, and robustness. To begin with, we apply linear probing to LLMs. The\nhigh probing accuracy suggests that \\textit{LLMs in early pre-training can\nalready distinguish concepts in each trustworthiness dimension}. Therefore, to\nfurther uncover the hidden possibilities of pre-training, we extract steering\nvectors from a LLM's pre-training checkpoints to enhance the LLM's\ntrustworthiness. Finally, inspired by~\\citet{choi2023understanding} that mutual\ninformation estimation is bounded by linear probing accuracy, we also probe\nLLMs with mutual information to investigate the dynamics of trustworthiness\nduring pre-training. We are the first to observe a similar two-phase\nphenomenon: fitting and compression~\\citep{shwartz2017opening}. This research\nprovides an initial exploration of trustworthiness modeling during LLM\npre-training, seeking to unveil new insights and spur further developments in\nthe field. We will make our code publicly accessible at\n\\url{https://github.com/ChnQ/TracingLLM}.\n","authors":["Chen Qian","Jie Zhang","Wei Yao","Dongrui Liu","Zhenfei Yin","Yu Qiao","Yong Liu","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2402.19465v2.pdf","comment":"Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2404.11132v2","updated":"2024-08-31T07:52:40Z","published":"2024-04-17T07:26:23Z","title":"A Novel ICD Coding Method Based on Associated and Hierarchical Code\n Description Distillation","summary":" ICD(International Classification of Diseases) coding involves assigning ICD\ncodes to patients visit based on their medical notes. ICD coding is a\nchallenging multilabel text classification problem due to noisy medical\ndocument inputs. 
Recent advancements in automated ICD coding have enhanced\nperformance by integrating additional data and knowledge bases with the\nencoding of medical notes and codes. However, most of them ignore the code\nhierarchy, leading to improper code assignments. To address these problems, we\npropose a novel framework based on associated and hierarchical code description\ndistillation (AHDD) for better code representation learning and avoidance of\nimproper code assignment. In this paper, we leverage the code description and\nthe hierarchical structure inherent to the ICD codes. The code description is\nalso applied to make the attention layer and the output layer code-aware.\nExperimental results on the benchmark dataset show the superiority of the\nproposed framework over several state-of-the-art baselines.\n","authors":["Bin Zhang","Junli Wang"],"pdf_url":"https://arxiv.org/pdf/2404.11132v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02270v3","updated":"2024-08-31T07:30:59Z","published":"2024-03-04T17:57:18Z","title":"FENICE: Factuality Evaluation of summarization based on Natural language\n Inference and Claim Extraction","summary":" Recent advancements in text summarization, particularly with the advent of\nLarge Language Models (LLMs), have shown remarkable performance. However, a\nnotable challenge persists as a substantial number of automatically-generated\nsummaries exhibit factual inconsistencies, such as hallucinations. In response\nto this issue, various approaches for the evaluation of consistency for\nsummarization have emerged. Yet, these newly-introduced metrics face several\nlimitations, including lack of interpretability, focus on short document\nsummaries (e.g., news articles), and computational impracticality, especially\nfor LLM-based metrics. To address these shortcomings, we propose Factuality\nEvaluation of summarization based on Natural language Inference and Claim\nExtraction (FENICE), a more interpretable and efficient factuality-oriented\nmetric. FENICE leverages an NLI-based alignment between information in the\nsource document and a set of atomic facts, referred to as claims, extracted\nfrom the summary. Our metric sets a new state of the art on AGGREFACT, the\nde-facto benchmark for factuality evaluation. Moreover, we extend our\nevaluation to a more challenging setting by conducting a human annotation\nprocess of long-form summarization. In the hope of fostering research in\nsummarization factuality evaluation, we release the code of our metric and our\nfactuality annotations of long-form summarization at\nhttps://github.com/Babelscape/FENICE.\n","authors":["Alessandro Scirè","Karim Ghonim","Roberto Navigli"],"pdf_url":"https://arxiv.org/pdf/2403.02270v3.pdf","comment":"ACL 2024 camera ready. Code and data at\n https://github.com/Babelscape/FENICE"},{"id":"http://arxiv.org/abs/2405.18638v2","updated":"2024-08-31T05:17:17Z","published":"2024-05-28T22:45:28Z","title":"ConSiDERS-The-Human Evaluation Framework: Rethinking Human Evaluation\n for Generative Large Language Models","summary":" In this position paper, we argue that human evaluation of generative large\nlanguage models (LLMs) should be a multidisciplinary undertaking that draws\nupon insights from disciplines such as user experience research and human\nbehavioral psychology to ensure that the experimental design and results are\nreliable.
The conclusions from these evaluations, thus, must consider factors\nsuch as usability, aesthetics, and cognitive biases. We highlight how cognitive\nbiases can conflate fluent information and truthfulness, and how cognitive\nuncertainty affects the reliability of rating scores such as Likert.\nFurthermore, the evaluation should differentiate the capabilities and\nweaknesses of increasingly powerful large language models -- which requires\neffective test sets. The scalability of human evaluation is also crucial to\nwider adoption. Hence, to design an effective human evaluation system in the\nage of generative NLP, we propose the ConSiDERS-The-Human evaluation framework\nconsisting of 6 pillars -- Consistency, Scoring Criteria, Differentiating, User\nExperience, Responsible, and Scalability.\n","authors":["Aparna Elangovan","Ling Liu","Lei Xu","Sravan Bodapati","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2405.18638v2.pdf","comment":"Accepted in ACL 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.08396v4","updated":"2024-08-31T23:51:14Z","published":"2024-01-16T14:41:20Z","title":"Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in\n medicine","summary":" Recent studies indicate that Generative Pre-trained Transformer 4 with Vision\n(GPT-4V) outperforms human physicians in medical challenge tasks. However,\nthese evaluations primarily focused on the accuracy of multi-choice questions\nalone. Our study extends the current scope by conducting a comprehensive\nanalysis of GPT-4V's rationales of image comprehension, recall of medical\nknowledge, and step-by-step multimodal reasoning when solving New England\nJournal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test\nthe knowledge and diagnostic capabilities of medical professionals. Evaluation\nresults confirmed that GPT-4V performs comparatively to human physicians\nregarding multi-choice accuracy (81.6% vs. 77.8%). GPT-4V also performs well in\ncases where physicians incorrectly answer, with over 78% accuracy. However, we\ndiscovered that GPT-4V frequently presents flawed rationales in cases where it\nmakes the correct final choices (35.5%), most prominent in image comprehension\n(27.2%). Regardless of GPT-4V's high accuracy in multi-choice questions, our\nfindings emphasize the necessity for further in-depth evaluations of its\nrationales before integrating such multimodal AI models into clinical\nworkflows.\n","authors":["Qiao Jin","Fangyuan Chen","Yiliang Zhou","Ziyang Xu","Justin M. Cheung","Robert Chen","Ronald M. Summers","Justin F. Rousseau","Peiyun Ni","Marc J Landsman","Sally L. Baxter","Subhi J. Al'Aref","Yijia Li","Alex Chen","Josef A. Brejt","Michael F. Chiang","Yifan Peng","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.08396v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05781v2","updated":"2024-08-31T21:20:16Z","published":"2024-08-11T14:13:22Z","title":"CURLing the Dream: Contrastive Representations for World Modeling in\n Reinforcement Learning","summary":" In this work, we present Curled-Dreamer, a novel reinforcement learning\nalgorithm that integrates contrastive learning into the DreamerV3 framework to\nenhance performance in visual reinforcement learning tasks. By incorporating\nthe contrastive loss from the CURL algorithm and a reconstruction loss from\nautoencoder, Curled-Dreamer achieves significant improvements in various\nDeepMind Control Suite tasks. 
Our extensive experiments demonstrate that\nCurled-Dreamer consistently outperforms state-of-the-art algorithms, achieving\nhigher mean and median scores across a diverse set of tasks. The results\nindicate that the proposed approach not only accelerates learning but also\nenhances the robustness of the learned policies. This work highlights the\npotential of combining different learning paradigms to achieve superior\nperformance in reinforcement learning applications.\n","authors":["Victor Augusto Kich","Jair Augusto Bottega","Raul Steinmetz","Ricardo Bedin Grando","Ayano Yorozu","Akihisa Ohya"],"pdf_url":"https://arxiv.org/pdf/2408.05781v2.pdf","comment":"Paper accepted for 24th International Conference on Control,\n Automation and Systems (ICCAS)"},{"id":"http://arxiv.org/abs/2407.05440v2","updated":"2024-08-31T20:26:49Z","published":"2024-07-07T17:03:12Z","title":"Explainable AI: Comparative Analysis of Normal and Dilated ResNet Models\n for Fundus Disease Classification","summary":" This paper presents dilated Residual Network (ResNet) models for disease\nclassification from retinal fundus images. Dilated convolution filters are used\nto replace normal convolution filters in the higher layers of the ResNet model\n(dilated ResNet) in order to improve the receptive field compared to the normal\nResNet model for disease classification. This study introduces\ncomputer-assisted diagnostic tools that employ deep learning, enhanced with\nexplainable AI techniques. These techniques aim to make the tool's\ndecision-making process transparent, thereby enabling medical professionals to\nunderstand and trust the AI's diagnostic decision. They are particularly\nrelevant in today's healthcare landscape, where there is a growing demand for\ntransparency in AI applications to ensure their reliability and ethical use.\nThe dilated ResNet is used as a replacement for the normal ResNet to enhance\nthe classification accuracy of retinal eye diseases and reduce the required\ncomputing time. The dataset used in this work is the Ocular Disease Intelligent\nRecognition (ODIR) dataset which is a structured ophthalmic database with eight\nclasses covering most of the common retinal eye diseases. The evaluation\nmetrics used in this work include precision, recall, accuracy, and F1 score. In\nthis work, a comparative study has been made between normal ResNet models and\ndilated ResNet models on five variants namely ResNet-18, ResNet-34, ResNet-50,\nResNet-101, and ResNet-152. The dilated ResNet model shows promising results as\ncompared to normal ResNet with an average F1 score of 0.71, 0.70, 0.69, 0.67,\nand 0.70 respectively for the above respective variants in ODIR multiclass\ndisease classification.\n","authors":["P. N. Karthikayan","Yoga Sri Varshan V","Hitesh Gupta Kattamuri","Umarani Jayaraman"],"pdf_url":"https://arxiv.org/pdf/2407.05440v2.pdf","comment":"Added authors' contributions"},{"id":"http://arxiv.org/abs/2206.04877v2","updated":"2024-08-31T18:11:08Z","published":"2022-06-10T05:11:02Z","title":"Convex Hull Prediction for Adaptive Video Streaming by Recurrent\n Learning","summary":" Adaptive video streaming relies on the construction of efficient bitrate\nladders to deliver the best possible visual quality to viewers under bandwidth\nconstraints. The traditional method of content dependent bitrate ladder\nselection requires a video shot to be pre-encoded with multiple encoding\nparameters to find the optimal operating points given by the convex hull of the\nresulting rate-quality curves. 
However, this pre-encoding step is equivalent to\nan exhaustive search process over the space of possible encoding parameters,\nwhich causes significant overhead in terms of both computation and time\nexpenditure. To reduce this overhead, we propose a deep learning based method\nof content aware convex hull prediction. We employ a recurrent convolutional\nnetwork (RCN) to implicitly analyze the spatiotemporal complexity of video\nshots in order to predict their convex hulls. A two-step transfer learning\nscheme is adopted to train our proposed RCN-Hull model, which ensures\nsufficient content diversity to analyze scene complexity, while also making it\npossible to capture the scene statistics of pristine source videos. Our\nexperimental results reveal that our proposed model yields better\napproximations of the optimal convex hulls, and offers competitive time savings\nas compared to existing approaches. On average, the pre-encoding time was\nreduced by 53.8% by our method, while the average Bjontegaard delta bitrate\n(BD-rate) of the predicted convex hulls against ground truth was 0.26%, and the\nmean absolute deviation of the BD-rate distribution was 0.57%.\n","authors":["Somdyuti Paul","Andrey Norkin","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2206.04877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14367v2","updated":"2024-08-31T15:28:20Z","published":"2024-07-19T14:53:18Z","title":"Thinking Racial Bias in Fair Forgery Detection: Models, Datasets and\n Evaluations","summary":" Due to the successful development of deep image generation technology,\nforgery detection plays a more important role in social and economic security.\nRacial bias has not been explored thoroughly in the deep forgery detection\nfield. In the paper, we first contribute a dedicated dataset called the Fair\nForgery Detection (FairFD) dataset, where we prove the racial bias of public\nstate-of-the-art (SOTA) methods. Different from existing forgery detection\ndatasets, the self-constructed FairFD dataset contains a balanced racial ratio\nand diverse forgery generation images with the largest-scale subjects.\nAdditionally, we identify the problems with naive fairness metrics when\nbenchmarking forgery detection models. To comprehensively evaluate fairness, we\ndesign novel metrics including Approach Averaged Metric and Utility Regularized\nMetric, which can avoid deceptive results. We also present an effective and\nrobust post-processing technique, Bias Pruning with Fair Activations (BPFA),\nwhich improves fairness without requiring retraining or weight updates.\nExtensive experiments conducted with 12 representative forgery detection models\ndemonstrate the value of the proposed dataset and the reasonability of the\ndesigned fairness metrics. By applying the BPFA to the existing fairest\ndetector, we achieve a new SOTA. 
Furthermore, we conduct more in-depth analyses\nto offer more insights to inspire researchers in the community.\n","authors":["Decheng Liu","Zongqi Wang","Chunlei Peng","Nannan Wang","Ruimin Hu","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2407.14367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13723v7","updated":"2024-08-31T13:35:33Z","published":"2022-10-25T02:42:49Z","title":"S3E: A Mulit-Robot Multimodal Dataset for Collaborative SLAM","summary":" The burgeoning demand for collaborative robotic systems to execute complex\ntasks collectively has intensified the research community's focus on advancing\nsimultaneous localization and mapping (SLAM) in a cooperative context. Despite\nthis interest, the scalability and diversity of existing datasets for\ncollaborative trajectories remain limited, especially in scenarios with\nconstrained perspectives where the generalization capabilities of Collaborative\nSLAM (C-SLAM) are critical for the feasibility of multi-agent missions.\nAddressing this gap, we introduce S3E, an expansive multimodal dataset.\nCaptured by a fleet of unmanned ground vehicles traversing four distinct\ncollaborative trajectory paradigms, S3E encompasses 13 outdoor and 5 indoor\nsequences. These sequences feature meticulously synchronized and spatially\ncalibrated data streams, including 360-degree LiDAR point cloud,\nhigh-resolution stereo imagery, high-frequency inertial measurement units\n(IMU), and Ultra-wideband (UWB) relative observations. Our dataset not only\nsurpasses previous efforts in scale, scene diversity, and data intricacy but\nalso provides a thorough analysis and benchmarks for both collaborative and\nindividual SLAM methodologies. For access to the dataset and the latest\ninformation, please visit our repository at https://pengyu-team.github.io/S3E.\n","authors":["Dapeng Feng","Yuhua Qi","Shipeng Zhong","Zhiqiang Chen","Qiming Chen","Hongbo Chen","Jin Wu","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2210.13723v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01758v3","updated":"2024-08-31T12:56:28Z","published":"2023-12-04T09:35:36Z","title":"CILF-CIAE: CLIP-driven Image-Language Fusion for Correcting Inverse Age\n Estimation","summary":" The age estimation task aims to predict the age of an individual by analyzing\nfacial features in an image. The development of age estimation can improve the\nefficiency and accuracy of various applications (e.g., age verification and\nsecure access control, etc.). In recent years, contrastive language-image\npre-training (CLIP) has been widely used in various multimodal tasks and has\nmade some progress in the field of age estimation. However, existing CLIP-based\nage estimation methods require high memory usage (quadratic complexity) when\nglobally modeling images, and lack an error feedback mechanism to prompt the\nmodel about the quality of age prediction results. To tackle the above issues,\nwe propose a novel CLIP-driven Image-Language Fusion for Correcting Inverse Age\nEstimation (CILF-CIAE). Specifically, we first introduce the CLIP model to\nextract image features and text semantic information respectively, and map them\ninto a highly semantically aligned high-dimensional feature space. Next, we\ndesigned a new Transformer architecture (i.e., FourierFormer) to achieve\nchannel evolution and spatial interaction of images, and to fuse image and text\nsemantic information. Compared with the quadratic complexity of the attention\nmechanism, the proposed Fourierformer is of linear log complexity. 
To further\nnarrow the semantic gap between image and text features, we utilize an\nefficient contrastive multimodal learning module that supervises the multimodal\nfusion process of FourierFormer through contrastive loss for image-text\nmatching, thereby improving the interaction effect between different\nmodalities. Finally, we introduce reversible age estimation, which uses\nend-to-end error feedback to reduce the error rate of age predictions. Through\nextensive experiments on multiple data sets, CILF-CIAE has achieved better age\nprediction results.\n","authors":["Yuntao Shou","Wei Ai","Tao Meng","Nan Yin","Keqin Li"],"pdf_url":"https://arxiv.org/pdf/2312.01758v3.pdf","comment":"14 pages, 14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.02545v2","updated":"2024-08-31T12:53:19Z","published":"2023-12-05T07:23:22Z","title":"Graph Information Bottleneck for Remote Sensing Segmentation","summary":" Remote sensing segmentation has a wide range of applications in environmental\nprotection, and urban change detection, etc. Despite the success of deep\nlearning-based remote sensing segmentation methods (e.g., CNN and Transformer),\nthey are not flexible enough to model irregular objects. In addition, existing\ngraph contrastive learning methods usually adopt the way of maximizing mutual\ninformation to keep the node representations consistent between different graph\nviews, which may cause the model to learn task-independent redundant\ninformation. To tackle the above problems, this paper treats images as graph\nstructures and introduces a simple contrastive vision GNN (SC-ViG) architecture\nfor remote sensing segmentation. Specifically, we construct a node-masked and\nedge-masked graph view to obtain an optimal graph structure representation,\nwhich can adaptively learn whether to mask nodes and edges. Furthermore, this\npaper innovatively introduces information bottleneck theory into graph\ncontrastive learning to maximize task-related information while minimizing\ntask-independent redundant information. Finally, we replace the convolutional\nmodule in UNet with the SC-ViG module to complete the segmentation and\nclassification tasks of remote sensing images. Extensive experiments on\npublicly available real datasets demonstrate that our method outperforms\nstate-of-the-art remote sensing image segmentation methods.\n","authors":["Yuntao Shou","Wei Ai","Tao Meng","Nan Yin"],"pdf_url":"https://arxiv.org/pdf/2312.02545v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.06152v2","updated":"2024-08-31T12:32:50Z","published":"2024-08-12T13:48:06Z","title":"Palantir: Towards Efficient Super Resolution for Ultra-high-definition\n Live Streaming","summary":" Neural enhancement through super-resolution (SR) deep neural networks (DNNs)\nopens up new possibilities for ultra-high-definition (UHD) live streaming over\nexisting encoding and networking infrastructure. Yet, the heavy SR DNN\ninference overhead leads to severe deployment challenges. To reduce the\noverhead, existing systems propose to apply DNN-based SR only on carefully\nselected anchor frames while upscaling non-anchor frames via the lightweight\nreusing-based SR approach. However, frame-level scheduling is coarse-grained\nand fails to deliver optimal efficiency. In this work, we propose Palantir, the\nfirst neural-enhanced UHD live streaming system with fine-grained patch-level\nscheduling. 
Two novel techniques are incorporated into Palantir to select the\nmost beneficial anchor patches and support latency-sensitive UHD live streaming\napplications. Firstly, under the guidance of our pioneering and theoretical\nanalysis, Palantir constructs a directed acyclic graph (DAG) for lightweight\nyet accurate SR quality estimation under any possible anchor patch set.\nSecondly, to further optimize the scheduling latency, Palantir improves\nparallelizability by refactoring the computation subprocedure of the estimation\nprocess into a sparse matrix-matrix multiplication operation.\n The evaluation results suggest that Palantir incurs a negligible scheduling\nlatency accounting for less than 5.7% of the end-to-end latency requirement.\nWhen compared to the naive method of applying DNN-based SR on all the frames,\nPalantir can reduce the SR DNN inference overhead by 20 times (or 60 times)\nwhile preserving 54.0-82.6% (or 32.8-64.0%) of the quality gain. When compared\nto the state-of-the-art real-time frame-level scheduling strategy, Palantir can\nreduce the SR DNN inference overhead by 80.1% at most (and 38.4% on average)\nwithout sacrificing the video quality.\n","authors":["Xinqi Jin","Zhui Zhu","Xikai Sun","Fan Dang","Jiangchuan Liu","Jingao Xu","Kebin Liu","Xinlei Chen","Yunhao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10723v2","updated":"2024-08-31T11:33:26Z","published":"2024-07-15T13:49:31Z","title":"Anticipating Future Object Compositions without Forgetting","summary":" Despite the significant advancements in computer vision models, their ability\nto generalize to novel object-attribute compositions remains limited. Existing\nmethods for Compositional Zero-Shot Learning (CZSL) mainly focus on image\nclassification. This paper aims to enhance CZSL in object detection without\nforgetting prior learned knowledge. We use Grounding DINO and incorporate\nCompositional Soft Prompting (CSP) into it and extend it with Compositional\nAnticipation. We achieve a 70.5% improvement over CSP on the harmonic mean (HM)\nbetween seen and unseen compositions on the CLEVR dataset. Furthermore, we\nintroduce Contrastive Prompt Tuning to incrementally address model confusion\nbetween similar compositions. We demonstrate the effectiveness of this method\nand achieve an increase of 14.5% in HM across the pretrain, increment, and\nunseen sets. Collectively, these methods provide a framework for learning\nvarious compositions with limited data, as well as improving the performance of\nunderperforming compositions when additional data becomes available.\n","authors":["Youssef Zahran","Gertjan Burghouts","Yke Bauke Eisma"],"pdf_url":"https://arxiv.org/pdf/2407.10723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19875v2","updated":"2024-08-31T10:34:37Z","published":"2024-06-28T12:35:01Z","title":"InfiniBench: A Comprehensive Benchmark for Large Multimodal Models in\n Very Long Video Understanding","summary":" Understanding long videos, ranging from tens of minutes to several hours,\npresents unique challenges in video comprehension. Despite the increasing\nimportance of long-form video content, existing benchmarks primarily focus on\nshorter clips. 
To address this gap, we introduce InfiniBench a comprehensive\nbenchmark for very long video understanding which presents 1)The longest video\nduration, averaging 52.59 minutes per video 2) The largest number of\nquestion-answer pairs, 108.2K 3) Diversity in questions that examine nine\ndifferent skills and include both multiple-choice questions and open-ended\nquestions 4) Human-centric, as the video sources come from movies and daily TV\nshows, with specific human-level question designs such as Movie Spoiler\nQuestions that require critical thinking and comprehensive understanding. Using\nInfiniBench, we comprehensively evaluate existing Large Multi-Modality Models\n(LMMs) on each skill, including the commercial models such as GPT-4o and Gemini\n1.5 Flash and the open-source models. The evaluation shows significant\nchallenges in our benchmark. Our findings reveal that even leading AI models\nlike GPT-4o and Gemini 1.5 Flash face challenges in achieving high performance\nin long video understanding, with average accuracies of just 49.16\\% and\n42.72\\%, and average scores of 3.22 and 2.71 out of 5, respectively. We hope\nthis benchmark will stimulate the LMMs community towards long video and\nhuman-level understanding. Our benchmark can be accessed at\nhttps://vision-cair.github.io/InfiniBench/\n","authors":["Kirolos Ataallah","Chenhui Gou","Eslam Abdelrahman","Khushbu Pahwa","Jian Ding","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2406.19875v2.pdf","comment":"24 pages,25 figures"},{"id":"http://arxiv.org/abs/2402.08207v2","updated":"2024-08-31T06:35:17Z","published":"2024-02-13T04:12:41Z","title":"Translating Images to Road Network: A Sequence-to-Sequence Perspective","summary":" The extraction of road network is essential for the generation of\nhigh-definition maps since it enables the precise localization of road\nlandmarks and their interconnections. However, generating road network poses a\nsignificant challenge due to the conflicting underlying combination of\nEuclidean (e.g., road landmarks location) and non-Euclidean (e.g., road\ntopological connectivity) structures. Existing methods struggle to merge the\ntwo types of data domains effectively, but few of them address it properly.\nInstead, our work establishes a unified representation of both types of data\ndomain by projecting both Euclidean and non-Euclidean data into an integer\nseries called RoadNet Sequence. Further than modeling an auto-regressive\nsequence-to-sequence Transformer model to understand RoadNet Sequence, we\ndecouple the dependency of RoadNet Sequence into a mixture of auto-regressive\nand non-autoregressive dependency. Building on this, our proposed\nnon-autoregressive sequence-to-sequence approach leverages non-autoregressive\ndependencies while fixing the gap towards auto-regressive dependencies,\nresulting in success on both efficiency and accuracy. We further identify two\nmain bottlenecks in the current RoadNetTransformer on a non-overfitting split\nof the dataset: poor landmark detection limited by the BEV Encoder and error\npropagation to topology reasoning. 
Therefore, we propose Topology-Inherited\nTraining to inherit better topology knowledge into RoadNetTransformer.\nAdditionally, we collect SD-Maps from open-source map datasets and use this\nprior information to significantly improve landmark detection and reachability.\nExtensive experiments on nuScenes dataset demonstrate the superiority of\nRoadNet Sequence representation and the non-autoregressive approach compared to\nexisting state-of-the-art alternatives.\n","authors":["Jiachen Lu","Renyuan Peng","Xinyue Cai","Hang Xu","Feng Wen","Wei Zhang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.08207v2.pdf","comment":"V1 is the ICCV 2023 conference version, and V2 is the extended\n version"},{"id":"http://arxiv.org/abs/2401.11436v2","updated":"2024-08-31T06:24:18Z","published":"2024-01-21T09:16:29Z","title":"Geometric Prior Guided Feature Representation Learning for Long-Tailed\n Classification","summary":" Real-world data are long-tailed, the lack of tail samples leads to a\nsignificant limitation in the generalization ability of the model. Although\nnumerous approaches of class re-balancing perform well for moderate class\nimbalance problems, additional knowledge needs to be introduced to help the\ntail class recover the underlying true distribution when the observed\ndistribution from a few tail samples does not represent its true distribution\nproperly, thus allowing the model to learn valuable information outside the\nobserved domain. In this work, we propose to leverage the geometric information\nof the feature distribution of the well-represented head class to guide the\nmodel to learn the underlying distribution of the tail class. Specifically, we\nfirst systematically define the geometry of the feature distribution and the\nsimilarity measures between the geometries, and discover four phenomena\nregarding the relationship between the geometries of different feature\ndistributions. Then, based on four phenomena, feature uncertainty\nrepresentation is proposed to perturb the tail features by utilizing the\ngeometry of the head class feature distribution. It aims to make the perturbed\nfeatures cover the underlying distribution of the tail class as much as\npossible, thus improving the model's generalization performance in the test\ndomain. Finally, we design a three-stage training scheme enabling feature\nuncertainty modeling to be successfully applied. Experiments on\nCIFAR-10/100-LT, ImageNet-LT, and iNaturalist2018 show that our proposed\napproach outperforms other similar methods on most metrics. In addition, the\nexperimental phenomena we discovered are able to provide new perspectives and\ntheoretical foundations for subsequent studies.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11436v2.pdf","comment":"This work was accepted by the IJCV 2024"},{"id":"http://arxiv.org/abs/2408.12590v2","updated":"2024-08-31T05:12:09Z","published":"2024-08-22T17:55:22Z","title":"xGen-VideoSyn-1: High-fidelity Text-to-Video Synthesis with Compressed\n Representations","summary":" We present xGen-VideoSyn-1, a text-to-video (T2V) generation model capable of\nproducing realistic scenes from textual descriptions. 
Building on recent\nadvancements, such as OpenAI's Sora, we explore the latent diffusion model\n(LDM) architecture and introduce a video variational autoencoder (VidVAE).\nVidVAE compresses video data both spatially and temporally, significantly\nreducing the length of visual tokens and the computational demands associated\nwith generating long-sequence videos. To further address the computational\ncosts, we propose a divide-and-merge strategy that maintains temporal\nconsistency across video segments. Our Diffusion Transformer (DiT) model\nincorporates spatial and temporal self-attention layers, enabling robust\ngeneralization across different timeframes and aspect ratios. We have devised a\ndata processing pipeline from the very beginning and collected over 13M\nhigh-quality video-text pairs. The pipeline includes multiple steps such as\nclipping, text detection, motion estimation, aesthetics scoring, and dense\ncaptioning based on our in-house video-LLM model. Training the VidVAE and DiT\nmodels required approximately 40 and 642 H100 days, respectively. Our model\nsupports over 14-second 720p video generation in an end-to-end way and\ndemonstrates competitive performance against state-of-the-art T2V models.\n","authors":["Can Qin","Congying Xia","Krithika Ramakrishnan","Michael Ryoo","Lifu Tu","Yihao Feng","Manli Shu","Honglu Zhou","Anas Awadalla","Jun Wang","Senthil Purushwalkam","Le Xue","Yingbo Zhou","Huan Wang","Silvio Savarese","Juan Carlos Niebles","Zeyuan Chen","Ran Xu","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.12590v2.pdf","comment":"Accepted by ECCV24 AI4VA"},{"id":"http://arxiv.org/abs/2408.05891v3","updated":"2024-08-31T02:52:26Z","published":"2024-08-12T02:09:25Z","title":"CMAB: A First National-Scale Multi-Attribute Building Dataset in China\n Derived from Open Source Data and GeoAI","summary":" Rapidly acquiring three-dimensional (3D) building data, including geometric\nattributes like rooftop, height and orientations, as well as indicative\nattributes like function, quality, and age, is essential for accurate urban\nanalysis, simulations, and policy updates. Current building datasets suffer\nfrom incomplete coverage of building multi-attributes. This paper introduces a\ngeospatial artificial intelligence (GeoAI) framework for large-scale building\nmodeling, presenting the first national-scale Multi-Attribute Building dataset\n(CMAB), covering 3,667 spatial cities, 29 million buildings, and 21.3 billion\nsquare meters of rooftops with an F1-Score of 89.93% in OCRNet-based\nextraction, totaling 337.7 billion cubic meters of building stock. We trained\nbootstrap aggregated XGBoost models with city administrative classifications,\nincorporating features such as morphology, location, and function. Using\nmulti-source data, including billions of high-resolution Google Earth images\nand 60 million street view images (SVIs), we generated rooftop, height,\nfunction, age, and quality attributes for each building. Accuracy was validated\nthrough model benchmarks, existing similar products, and manual SVI validation,\nmostly above 80%. 
Our dataset and results are crucial for global SDGs and urban\nplanning.\n","authors":["Yecheng Zhang","Huimin Zhao","Ying Long"],"pdf_url":"https://arxiv.org/pdf/2408.05891v3.pdf","comment":"43 pages, 20 figures"},{"id":"http://arxiv.org/abs/2403.12172v2","updated":"2024-08-31T02:36:11Z","published":"2024-03-18T18:42:32Z","title":"Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video\n Anomaly Detection","summary":" Skeleton-based video anomaly detection (SVAD) is a crucial task in computer\nvision. Accurately identifying abnormal patterns or events enables operators to\npromptly detect suspicious activities, thereby enhancing safety. Achieving this\ndemands a comprehensive understanding of human motions, both at body and region\nlevels, while also accounting for the wide variations of performing a single\naction. However, existing studies fail to simultaneously address these crucial\nproperties. This paper introduces a novel, practical and lightweight framework,\nnamely Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video\nAnomaly Detection (GiCiSAD) to overcome the challenges associated with SVAD.\nGiCiSAD consists of three novel modules: the Graph Attention-based Forecasting\nmodule to capture the spatio-temporal dependencies inherent in the data, the\nGraph-level Jigsaw Puzzle Maker module to distinguish subtle region-level\ndiscrepancies between normal and abnormal motions, and the Graph-based\nConditional Diffusion model to generate a wide spectrum of human motions.\nExtensive experiments on four widely used skeleton-based video datasets show\nthat GiCiSAD outperforms existing methods with significantly fewer training\nparameters, establishing it as the new state-of-the-art.\n","authors":["Ali Karami","Thi Kieu Khanh Ho","Narges Armanfard"],"pdf_url":"https://arxiv.org/pdf/2403.12172v2.pdf","comment":"Accepted at the Winter Conference on Applications of Computer Vision\n (WACV). 17 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2402.01105v3","updated":"2024-08-31T02:28:20Z","published":"2024-02-02T02:44:59Z","title":"A Survey for Foundation Models in Autonomous Driving","summary":" The advent of foundation models has revolutionized the fields of natural\nlanguage processing and computer vision, paving the way for their application\nin autonomous driving (AD). This survey presents a comprehensive review of more\nthan 40 research papers, demonstrating the role of foundation models in\nenhancing AD. Large language models contribute to planning and simulation in\nAD, particularly through their proficiency in reasoning, code generation and\ntranslation. In parallel, vision foundation models are increasingly adapted for\ncritical tasks such as 3D object detection and tracking, as well as creating\nrealistic driving scenarios for simulation and testing. Multi-modal foundation\nmodels, integrating diverse inputs, exhibit exceptional visual understanding\nand spatial reasoning, crucial for end-to-end AD. This survey not only provides\na structured taxonomy, categorizing foundation models based on their modalities\nand functionalities within the AD domain but also delves into the methods\nemployed in current research. 
It identifies the gaps between existing\nfoundation models and cutting-edge AD approaches, thereby charting future\nresearch directions and proposing a roadmap for bridging these gaps.\n","authors":["Haoxiang Gao","Yaqian Li","Kaiwen Long","Ming Yang","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2402.01105v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.18646v2","updated":"2024-08-31T19:26:21Z","published":"2024-07-26T10:28:59Z","title":"Decoding Knowledge Claims: The Evaluation of Scientific Publication\n Contributions through Semantic Analysis","summary":" The surge in scientific publications challenges the use of publication counts\nas a measure of scientific progress, requiring alternative metrics that\nemphasize the quality and novelty of scientific contributions rather than sheer\nquantity. This paper proposes the use of Relaxed Word Mover's Distance (RWMD),\na semantic text similarity measure, to evaluate the novelty of scientific\npapers. We hypothesize that RWMD can more effectively gauge the growth of\nscientific knowledge. To test such an assumption, we apply RWMD to evaluate\nseminal papers, with Hirsch's H-Index paper as a primary case study. We compare\nRWMD results across three groups: 1) H-Index-related papers, 2) scientometric\nstudies, and 3) unrelated papers, aiming to discern redundant literature and\nhype from genuine innovations. Findings suggest that emphasizing knowledge\nclaims offers a deeper insight into scientific contributions, marking RWMD as a\npromising alternative method to traditional citation metrics, thus better\ntracking significant scientific breakthroughs.\n","authors":["Luca D'Aniello","Nicolas Robinson-Garcia","Massimo Aria","Corrado Cuccurullo"],"pdf_url":"https://arxiv.org/pdf/2407.18646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00448v1","updated":"2024-08-31T13:01:58Z","published":"2024-08-31T13:01:58Z","title":"PSLF: A PID Controller-incorporated Second-order Latent Factor Analysis\n Model for Recommender System","summary":" A second-order-based latent factor (SLF) analysis model demonstrates superior\nperformance in graph representation learning, particularly for high-dimensional\nand incomplete (HDI) interaction data, by incorporating the curvature\ninformation of the loss landscape. However, its objective function is commonly\nbi-linear and non-convex, causing the SLF model to suffer from a low\nconvergence rate. To address this issue, this paper proposes a PID\ncontroller-incorporated SLF (PSLF) model, leveraging two key strategies: a)\nrefining learning error estimation by incorporating the PID controller\nprinciples, and b) acquiring second-order information insights through\nHessian-vector products. Experimental results on multiple HDI datasets indicate\nthat the proposed PSLF model outperforms four state-of-the-art latent factor\nmodels based on advanced optimizers regarding convergence rates and\ngeneralization performance.\n","authors":["Jialiang Wang","Yan Xia","Ye Yuan"],"pdf_url":"https://arxiv.org/pdf/2409.00448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00400v1","updated":"2024-08-31T09:19:41Z","published":"2024-08-31T09:19:41Z","title":"An Enhanced Batch Query Architecture in Real-time Recommendation","summary":" In industrial recommendation systems on websites and apps, it is essential to\nrecall and predict top-n results relevant to user interests from a content pool\nof billions within milliseconds. 
To cope with continuous data growth and\nimprove real-time recommendation performance, we have designed and implemented\na high-performance batch query architecture for real-time recommendation\nsystems. Our contributions include optimizing hash structures with a\ncacheline-aware probing method to enhance coalesced hashing, as well as the\nimplementation of a hybrid storage key-value service built upon it. Our\nexperiments indicate this approach significantly surpasses conventional hash\ntables in batch query throughput, achieving up to 90% of the query throughput\nof random memory access when incorporating parallel optimization. The support\nfor NVMe, integrating two-tier storage for hot and cold data, notably reduces\nresource consumption. Additionally, the system facilitates dynamic updates,\nautomated sharding of attributes and feature embedding tables, and introduces\ninnovative protocols for consistency in batch queries, thereby enhancing the\neffectiveness of real-time incremental learning updates. This architecture has\nbeen deployed and in use in the bilibili recommendation system for over a year,\na video content community with hundreds of millions of users, supporting 10x\nincrease in model computation with minimal resource growth, improving outcomes\nwhile preserving the system's real-time performance.\n","authors":["Qiang Zhang","Zhipeng Teng","Disheng Wu","Jiayin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00400v1.pdf","comment":"8 pages, 10 figures, CIKM 2024 Applied Research Paper"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.03353v2","updated":"2024-08-31T23:33:10Z","published":"2024-08-06T08:55:49Z","title":"Adversarial Domain Adaptation for Cross-user Activity Recognition Using\n Diffusion-based Noise-centred Learning","summary":" Human Activity Recognition (HAR) plays a crucial role in various applications\nsuch as human-computer interaction and healthcare monitoring. However,\nchallenges persist in HAR models due to the data distribution differences\nbetween training and real-world data distributions, particularly evident in\ncross-user scenarios. This paper introduces a novel framework, termed\nDiffusion-based Noise-centered Adversarial Learning Domain Adaptation\n(Diff-Noise-Adv-DA), designed to address these challenges by leveraging\ngenerative diffusion modeling and adversarial learning techniques. Traditional\nHAR models often struggle with the diversity of user behaviors and sensor data\ndistributions. Diff-Noise-Adv-DA innovatively integrates the inherent noise\nwithin diffusion models, harnessing its latent information to enhance domain\nadaptation. Specifically, the framework transforms noise into a critical\ncarrier of activity and domain class information, facilitating robust\nclassification across different user domains. Experimental evaluations\ndemonstrate the effectiveness of Diff-Noise-Adv-DA in improving HAR model\nperformance across different users, surpassing traditional domain adaptation\nmethods. 
The framework not only mitigates distribution mismatches but also\nenhances data quality through noise-based denoising techniques.\n","authors":["Xiaozhou Ye","Kevin I-Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05781v2","updated":"2024-08-31T21:20:16Z","published":"2024-08-11T14:13:22Z","title":"CURLing the Dream: Contrastive Representations for World Modeling in\n Reinforcement Learning","summary":" In this work, we present Curled-Dreamer, a novel reinforcement learning\nalgorithm that integrates contrastive learning into the DreamerV3 framework to\nenhance performance in visual reinforcement learning tasks. By incorporating\nthe contrastive loss from the CURL algorithm and a reconstruction loss from\nautoencoder, Curled-Dreamer achieves significant improvements in various\nDeepMind Control Suite tasks. Our extensive experiments demonstrate that\nCurled-Dreamer consistently outperforms state-of-the-art algorithms, achieving\nhigher mean and median scores across a diverse set of tasks. The results\nindicate that the proposed approach not only accelerates learning but also\nenhances the robustness of the learned policies. This work highlights the\npotential of combining different learning paradigms to achieve superior\nperformance in reinforcement learning applications.\n","authors":["Victor Augusto Kich","Jair Augusto Bottega","Raul Steinmetz","Ricardo Bedin Grando","Ayano Yorozu","Akihisa Ohya"],"pdf_url":"https://arxiv.org/pdf/2408.05781v2.pdf","comment":"Paper accepted for 24th International Conference on Control,\n Automation and Systems (ICCAS)"},{"id":"http://arxiv.org/abs/2408.05886v2","updated":"2024-08-31T21:11:40Z","published":"2024-08-12T01:27:06Z","title":"Online-Score-Aided Federated Learning: Taming the Resource Constraints\n in Wireless Networks","summary":" While FL is a widely popular distributed ML strategy that protects data\nprivacy, time-varying wireless network parameters and heterogeneous system\nconfigurations of the wireless device pose significant challenges. Although the\nlimited radio and computational resources of the network and the clients,\nrespectively, are widely acknowledged, two critical yet often ignored aspects\nare (a) wireless devices can only dedicate a small chunk of their limited\nstorage for the FL task and (b) new training samples may arrive in an online\nmanner in many practical wireless applications. Therefore, we propose a new FL\nalgorithm called OSAFL, specifically designed to learn tasks relevant to\nwireless applications under these practical considerations. Since it has long\nbeen proven that under extreme resource constraints, clients may perform an\narbitrary number of local training steps, which may lead to client drift under\nstatistically heterogeneous data distributions, we leverage normalized gradient\nsimilarities and exploit weighting clients' updates based on optimized scores\nthat facilitate the convergence rate of the proposed OSAFL algorithm. Our\nextensive simulation results on two different tasks -- each with three\ndifferent datasets -- with four popular ML models validate the effectiveness of\nOSAFL compared to six existing state-of-the-art FL baselines.\n","authors":["Md Ferdous Pervej","Minseok Choi","Andreas F. 
Molisch"],"pdf_url":"https://arxiv.org/pdf/2408.05886v2.pdf","comment":"Under review for possible publication in IEEE Transactions on\n Communications"},{"id":"http://arxiv.org/abs/2408.04841v3","updated":"2024-08-31T21:01:06Z","published":"2024-08-09T03:32:37Z","title":"Kolmogorov-Arnold Network for Online Reinforcement Learning","summary":" Kolmogorov-Arnold Networks (KANs) have shown potential as an alternative to\nMulti-Layer Perceptrons (MLPs) in neural networks, providing universal function\napproximation with fewer parameters and reduced memory usage. In this paper, we\nexplore the use of KANs as function approximators within the Proximal Policy\nOptimization (PPO) algorithm. We evaluate this approach by comparing its\nperformance to the original MLP-based PPO using the DeepMind Control Proprio\nRobotics benchmark. Our results indicate that the KAN-based reinforcement\nlearning algorithm can achieve comparable performance to its MLP-based\ncounterpart, often with fewer parameters. These findings suggest that KANs may\noffer a more efficient option for reinforcement learning models.\n","authors":["Victor Augusto Kich","Jair Augusto Bottega","Raul Steinmetz","Ricardo Bedin Grando","Ayano Yorozu","Akihisa Ohya"],"pdf_url":"https://arxiv.org/pdf/2408.04841v3.pdf","comment":"Paper accepted at 24th International Conference on Control,\n Automation and Systems (ICCAS)"},{"id":"http://arxiv.org/abs/2305.03509v3","updated":"2024-08-31T20:44:41Z","published":"2023-05-04T16:14:43Z","title":"Diffusion Explainer: Visual Explanation for Text-to-image Stable\n Diffusion","summary":" Diffusion-based generative models' impressive ability to create convincing\nimages has garnered global attention. However, their complex structures and\noperations often pose challenges for non-experts to grasp. We present Diffusion\nExplainer, the first interactive visualization tool that explains how Stable\nDiffusion transforms text prompts into images. Diffusion Explainer tightly\nintegrates a visual overview of Stable Diffusion's complex structure with\nexplanations of the underlying operations. By comparing image generation of\nprompt variants, users can discover the impact of keyword changes on image\ngeneration. A 56-participant user study demonstrates that Diffusion Explainer\noffers substantial learning benefits to non-experts. Our tool has been used by\nover 10,300 users from 124 countries at\nhttps://poloclub.github.io/diffusion-explainer/.\n","authors":["Seongmin Lee","Benjamin Hoover","Hendrik Strobelt","Zijie J. Wang","ShengYun Peng","Austin Wright","Kevin Li","Haekyu Park","Haoyang Yang","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2305.03509v3.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.01259v3","updated":"2024-08-31T20:03:49Z","published":"2024-01-02T16:05:23Z","title":"Do Concept Bottleneck Models Respect Localities?","summary":" Concept-based methods explain model predictions using human-understandable\nconcepts. These models require accurate concept predictors, yet the\nfaithfulness of existing concept predictors to their underlying concepts is\nunclear. In this paper, we investigate the faithfulness of Concept Bottleneck\nModels (CBMs), a popular family of concept-based architectures, by looking at\nwhether they respect \"localities\" in datasets. Localities involve using only\nrelevant features when predicting a concept's value. 
When localities are not\nconsidered, concepts may be predicted based on spuriously correlated features,\ndegrading performance and robustness. This work examines how CBM predictions\nchange when perturbing model inputs, and reveals that CBMs may not capture\nlocalities, even when independent concepts are localised to non-overlapping\nfeature subsets. Our empirical and theoretical results demonstrate that\ndatasets with correlated concepts may lead to accurate but uninterpretable\nmodels that fail to learn localities. Overall, we find that CBM\ninterpretability is fragile, as CBMs occasionally rely upon spurious features,\nnecessitating further research into the robustness of concept predictors.\n","authors":["Naveen Raman","Mateo Espinosa Zarlenga","Juyeon Heo","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2401.01259v3.pdf","comment":"Previous Version Accepted at NeurIPs 23 XAI in Action Workshop"},{"id":"http://arxiv.org/abs/2312.17382v2","updated":"2024-08-31T18:36:53Z","published":"2023-12-28T22:12:54Z","title":"Discovery of Small Ultra-short-period Planets Orbiting KG Dwarfs in\n Kepler Survey Using GPU Phase Folding and Deep Learning Detection System","summary":" Since the discovery of the first hot Jupiter orbiting a solar-type star, 51\nPeg, in 1995, more than 4000 exoplanets have been identified using various\nobservational techniques. The formation process of these sub-Earths remains\nelusive, and acquiring additional samples is essential for investigating this\nunique population. In our study, we employ a novel GPU Phase Folding algorithm\ncombined with a Convolutional Neural Network, termed the GPFC method, on Kepler\nphotometry data. This method enhances the transit search speed significantly\nover the traditional Box-fitting Least Squares method, allowing a complete\nsearch of the known KOI photometry data within hours using a commercial GPU\ncard. To date, we have identified five promising sub-Earth short-period\ncandidates: K00446.c, K01821.b, K01522.c, K03404.b, and K04978.b. A closer\nanalysis reveals the following characteristics: K00446.c orbits a K dwarf on a\n0.645091-day period. With a radius of $0.461R_\\oplus$, it ranks as the second\nsmallest USP discovered to date. K01821.b is a sub-Earth with a radius of\n$0.648R_\\oplus$, orbiting a G dwarf over a 0.91978-day period. It is the second\nsmallest USP among all confirmed USPs orbiting G dwarfs in the NASA Archive.\nK01522.c has a radius of $0.704 R_\\oplus$ and completes an orbit around a\nSun-like G dwarf in 0.64672 days; K03404.b, with a radius of $0.738 R_\\oplus$,\norbits a G dwarf on a 0.68074-day period; and K04978.b, with its planetary\nradius of $0.912 R_\\oplus$, orbits a G dwarf, completing an orbit every 0.94197\ndays. Three of our finds, K01821.b, K01522.c and K03404.b, rank as the smallest\nplanets among all confirmed USPs orbiting G dwarfs in the Kepler dataset. 
The\ndiscovery of these small exoplanets underscores the promising capability of the\nGPFC method for searching for small, new transiting exoplanets in photometry\ndata from Kepler, TESS, and future space transit missions.\n","authors":["Kaitlyn Wang","Jian Ge","Kevin Willis","Kevin Wang","Yinan Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.17382v2.pdf","comment":"24 pages, 40 figures; To be published in the Monthly Notices of the\n Royal Astronomical Society (MNRAS)"},{"id":"http://arxiv.org/abs/2206.04877v2","updated":"2024-08-31T18:11:08Z","published":"2022-06-10T05:11:02Z","title":"Convex Hull Prediction for Adaptive Video Streaming by Recurrent\n Learning","summary":" Adaptive video streaming relies on the construction of efficient bitrate\nladders to deliver the best possible visual quality to viewers under bandwidth\nconstraints. The traditional method of content dependent bitrate ladder\nselection requires a video shot to be pre-encoded with multiple encoding\nparameters to find the optimal operating points given by the convex hull of the\nresulting rate-quality curves. However, this pre-encoding step is equivalent to\nan exhaustive search process over the space of possible encoding parameters,\nwhich causes significant overhead in terms of both computation and time\nexpenditure. To reduce this overhead, we propose a deep learning based method\nof content aware convex hull prediction. We employ a recurrent convolutional\nnetwork (RCN) to implicitly analyze the spatiotemporal complexity of video\nshots in order to predict their convex hulls. A two-step transfer learning\nscheme is adopted to train our proposed RCN-Hull model, which ensures\nsufficient content diversity to analyze scene complexity, while also making it\npossible to capture the scene statistics of pristine source videos. Our\nexperimental results reveal that our proposed model yields better\napproximations of the optimal convex hulls, and offers competitive time savings\nas compared to existing approaches. On average, the pre-encoding time was\nreduced by 53.8% by our method, while the average Bjontegaard delta bitrate\n(BD-rate) of the predicted convex hulls against ground truth was 0.26%, and the\nmean absolute deviation of the BD-rate distribution was 0.57%.\n","authors":["Somdyuti Paul","Andrey Norkin","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2206.04877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01655v2","updated":"2024-08-31T18:00:40Z","published":"2022-10-04T14:58:12Z","title":"Public Transit Arrival Prediction: a Seq2Seq RNN Approach","summary":" Arrival/Travel times for public transit exhibit variability on account of\nfactors like seasonality, dwell times at bus stops, traffic signals, travel\ndemand fluctuation etc. The developing world in particular is plagued by\nadditional factors like lack of lane discipline, excess vehicles, diverse modes\nof transport and so on. This renders the bus arrival time prediction (BATP) to\nbe a challenging problem especially in the developing world. A novel\ndata-driven model based on recurrent neural networks (RNNs) is proposed for\nBATP (in real-time) in the current work. The model intelligently incorporates\nboth spatial and temporal correlations in a unique (non-linear) fashion\ndistinct from existing approaches. In particular, we propose a Gated Recurrent\nUnit (GRU) based Encoder-Decoder(ED) OR Seq2Seq RNN model (originally\nintroduced for language translation) for BATP. 
The geometry of the dynamic real\ntime BATP problem enables a nice fit with the Encoder-Decoder based RNN\nstructure. We feed relevant additional synchronized inputs (from previous\ntrips) at each step of the decoder (a feature classically unexplored in machine\ntranslation applications). Further motivated from accurately modelling\ncongestion influences on travel time prediction, we additionally propose to use\na bidirectional layer at the decoder (something unexplored in other time-series\nbased ED application contexts). The effectiveness of the proposed algorithms is\ndemonstrated on real field data collected from challenging traffic conditions.\nOur experiments indicate that the proposed method outperforms diverse existing\nstate-of-art data-driven approaches proposed for the same problem.\n","authors":["Nancy Bhutani","Soumen Pachal","Avinash Achar"],"pdf_url":"https://arxiv.org/pdf/2210.01655v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12535v2","updated":"2024-08-31T17:18:29Z","published":"2024-04-18T22:56:57Z","title":"Is There No Such Thing as a Bad Question? H4R: HalluciBot For\n Ratiocination, Rewriting, Ranking, and Routing","summary":" Hallucination continues to be one of the most critical challenges in the\ninstitutional adoption journey of Large Language Models (LLMs). While prior\nstudies have primarily focused on the post-generation analysis and refinement\nof outputs, this paper centers on the effectiveness of queries in eliciting\naccurate responses from LLMs. We present HalluciBot, a model that estimates the\nquery's propensity to hallucinate before generation, without invoking any LLMs\nduring inference. HalluciBot can serve as a proxy reward model for query\nrewriting, offering a general framework to estimate query quality based on\naccuracy and consensus. In essence, HalluciBot investigates how poorly\nconstructed queries can lead to erroneous outputs - moreover, by employing\nquery rewriting guided by HalluciBot's empirical estimates, we demonstrate that\n95.7% output accuracy can be achieved for Multiple Choice questions. The\ntraining procedure for HalluciBot consists of perturbing 369,837 queries n\ntimes, employing n+1 independent LLM agents, sampling an output from each\nquery, conducting a Multi-Agent Monte Carlo simulation on the sampled outputs,\nand training an encoder classifier. The idea of perturbation is the outcome of\nour ablation studies that measures the increase in output diversity (+12.5\nagreement spread) by perturbing a query in lexically different but semantically\nsimilar ways. Therefore, HalluciBot paves the way to ratiocinate (76.0% test F1\nscore, 46.6% in saved computation on hallucinatory queries), rewrite (+30.2%\npositive class transition from hallucinatory to non-hallucinatory), rank\n(+50.6% positive class transition from hallucinatory to non-hallucinatory), and\nroute queries to effective pipelines.\n","authors":["William Watson","Nicole Cho","Nishan Srishankar"],"pdf_url":"https://arxiv.org/pdf/2404.12535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08800v3","updated":"2024-08-31T16:56:50Z","published":"2023-12-14T10:35:13Z","title":"Evaluating Large Language Models for Health-related Queries with\n Presuppositions","summary":" As corporations rush to integrate large language models (LLMs) to their\nsearch offerings, it is critical that they provide factually accurate\ninformation that is robust to any presuppositions that a user may express. 
In\nthis work, we introduce UPHILL, a dataset consisting of health-related queries\nwith varying degrees of presuppositions. Using UPHILL, we evaluate the factual\naccuracy and consistency of InstructGPT, ChatGPT, and BingChat models. We find\nthat while model responses rarely disagree with true health claims (posed as\nquestions), they often fail to challenge false claims: responses from\nInstructGPT agree with 32% of the false claims, ChatGPT 26% and BingChat 23%.\nAs we increase the extent of presupposition in input queries, the responses\nfrom InstructGPT and ChatGPT agree with the claim considerably more often,\nregardless of its veracity. Responses from BingChat, which rely on retrieved\nwebpages, are not as susceptible. Given the moderate factual accuracy, and the\ninability of models to consistently correct false assumptions, our work calls\nfor a careful assessment of current LLMs for use in high-stakes scenarios.\n","authors":["Navreet Kaur","Monojit Choudhury","Danish Pruthi"],"pdf_url":"https://arxiv.org/pdf/2312.08800v3.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2406.00048v2","updated":"2024-08-31T16:36:58Z","published":"2024-05-28T17:01:22Z","title":"Towards a theory of how the structure of language is acquired by deep\n neural networks","summary":" How much data is required to learn the structure of a language via next-token\nprediction? We study this question for synthetic datasets generated via a\nProbabilistic Context-Free Grammar (PCFG) -- a tree-like generative model that\ncaptures many of the hierarchical structures found in natural languages. We\ndetermine token-token correlations analytically in our model and show that they\ncan be used to build a representation of the grammar's hidden variables, the\nlonger the range the deeper the variable. In addition, a finite training set\nlimits the resolution of correlations to an effective range, whose size grows\nwith that of the training set. As a result, a Language Model trained with\nincreasingly many examples can build a deeper representation of the grammar's\nstructure, thus reaching good performance despite the high dimensionality of\nthe problem. We conjecture that the relationship between training set size and\neffective range of correlations holds beyond our synthetic datasets. In\nparticular, our conjecture predicts how the scaling law for the test loss\nbehaviour with training set size depends on the length of the context window,\nwhich we confirm empirically in Shakespeare's plays and Wikipedia articles.\n","authors":["Francesco Cagnetta","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2406.00048v2.pdf","comment":"9 pages, 4 figures (main)"},{"id":"http://arxiv.org/abs/2310.02698v3","updated":"2024-08-31T16:15:35Z","published":"2023-10-04T10:08:01Z","title":"Enhanced Federated Optimization: Adaptive Unbiased Client Sampling with\n Reduced Variance","summary":" Federated Learning (FL) is a distributed learning paradigm to train a global\nmodel across multiple devices without collecting local data. In FL, a server\ntypically selects a subset of clients for each training round to optimize\nresource usage. Central to this process is the technique of unbiased client\nsampling, which ensures a representative selection of clients. Current methods\nprimarily utilize a random sampling procedure which, despite its effectiveness,\nachieves suboptimal efficiency owing to the loose upper bound caused by the\nsampling variance. 
In this work, by adopting an independent sampling procedure,\nwe propose a federated optimization framework focused on adaptive unbiased\nclient sampling, improving the convergence rate via an online variance\nreduction strategy. In particular, we present the first adaptive client\nsampler, K-Vib, employing an independent sampling procedure. K-Vib achieves a\nlinear speed-up on the regret bound\n$\\tilde{\\mathcal{O}}\\big(N^{\\frac{1}{3}}T^{\\frac{2}{3}}/K^{\\frac{4}{3}}\\big)$\nwithin a set communication budget $K$. Empirical studies indicate that K-Vib\ndoubles the speed compared to baseline algorithms, demonstrating significant\npotential in federated optimization.\n","authors":["Dun Zeng","Zenglin Xu","Yu Pan","Xu Luo","Qifan Wang","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2310.02698v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2401.05737v3","updated":"2024-08-31T15:46:34Z","published":"2024-01-11T08:40:26Z","title":"An experimental evaluation of Deep Reinforcement Learning algorithms for\n HVAC control","summary":" Heating, Ventilation, and Air Conditioning (HVAC) systems are a major driver\nof energy consumption in commercial and residential buildings. Recent studies\nhave shown that Deep Reinforcement Learning (DRL) algorithms can outperform\ntraditional reactive controllers. However, DRL-based solutions are generally\ndesigned for ad hoc setups and lack standardization for comparison. To fill\nthis gap, this paper provides a critical and reproducible evaluation, in terms\nof comfort and energy consumption, of several state-of-the-art DRL algorithms\nfor HVAC control. The study examines the controllers' robustness, adaptability,\nand trade-off between optimization goals by using the Sinergym framework. The\nresults obtained confirm the potential of DRL algorithms, such as SAC and TD3,\nin complex scenarios and reveal several challenges related to generalization\nand incremental learning.\n","authors":["Antonio Manjavacas","Alejandro Campoy-Nieves","Javier Jiménez-Raboso","Miguel Molina-Solana","Juan Gómez-Romero"],"pdf_url":"https://arxiv.org/pdf/2401.05737v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15173v2","updated":"2024-08-31T15:36:32Z","published":"2024-02-23T08:11:55Z","title":"Second-Order Fine-Tuning without Pain for LLMs:A Hessian Informed\n Zeroth-Order Optimizer","summary":" Fine-tuning large language models (LLMs) with classic first-order optimizers\nentails prohibitive GPU memory due to the backpropagation process. Recent works\nhave turned to zeroth-order optimizers for fine-tuning, which save substantial\nmemory by using two forward passes. However, these optimizers are plagued by\nthe heterogeneity of parameter curvatures across different dimensions. In this\nwork, we propose HiZOO, a diagonal Hessian informed zeroth-order optimizer\nwhich is the first work to leverage the diagonal Hessian to enhance\nzeroth-order optimizer for fine-tuning LLMs. What's more, HiZOO avoids the\nexpensive memory cost and only increases one forward pass per step. Extensive\nexperiments on various models (350M~66B parameters) indicate that HiZOO\nimproves model convergence, significantly reducing training steps and\neffectively enhancing model accuracy. Moreover, we visualize the optimization\ntrajectories of HiZOO on test functions, illustrating its effectiveness in\nhandling heterogeneous curvatures. Lastly, we provide theoretical proofs of\nconvergence for HiZOO. 
Code is publicly available at\nhttps://anonymous.4open.science/r/HiZOO27F8.\n","authors":["Yanjun Zhao","Sizhe Dang","Haishan Ye","Guang Dai","Yi Qian","Ivor W. Tsang"],"pdf_url":"https://arxiv.org/pdf/2402.15173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13227v2","updated":"2024-08-31T15:16:32Z","published":"2024-05-21T22:28:41Z","title":"A rapid approach to urban traffic noise mapping with a generative\n adversarial network","summary":" With rapid urbanisation and the accompanying increase in traffic density,\ntraffic noise has become a major concern in urban planning. However,\ntraditional grid noise mapping methods have limitations in terms of time\nconsumption, software costs, and a lack of parameter integration interfaces.\nThese limitations hinder their ability to meet the need for iterative updates\nand rapid performance feedback in the early design stages of street-scale urban\nplanning. Herein, we developed a rapid urban traffic noise mapping technique\nthat leverages generative adversarial networks (GANs) as a surrogate model.\nThis approach enables the rapid assessment of urban traffic noise distribution\nby using urban elements such as roads and buildings as the input. The mean\nvalues for the mean squared error (RMSE) and structural similarity index (SSIM)\nare 0.3024 dB(A) and 0.8528, respectively, for the validation dataset. The\ntrained model is integrated into Grasshopper as a tool, facilitating the rapid\ngeneration of traffic noise maps. This integration allows urban designers and\nplanners, even those without expertise in acoustics, to easily anticipate\nchanges in acoustics impacts caused by design in the early design stages.\n","authors":["Xinhao Yang","Zhen Han","Xiaodong Lu","Yuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.13227v2.pdf","comment":"Accepted by Applied Acoustics"},{"id":"http://arxiv.org/abs/2210.12583v4","updated":"2024-08-31T14:15:27Z","published":"2022-10-23T00:45:05Z","title":"Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model\n Predictive Control","summary":" Model-based control requires an accurate model of the system dynamics for\nprecisely and safely controlling the robot in complex and dynamic environments.\nMoreover, in the presence of variations in the operating conditions, the model\nshould be continuously refined to compensate for dynamics changes. In this\npaper, we present a self-supervised learning approach that actively models the\ndynamics of nonlinear robotic systems. We combine offline learning from past\nexperience and online learning from current robot interaction with the unknown\nenvironment. These two ingredients enable a highly sample-efficient and\nadaptive learning process, capable of accurately inferring model dynamics in\nreal-time even in operating regimes that greatly differ from the training\ndistribution. Moreover, we design an uncertainty-aware model predictive\ncontroller that is heuristically conditioned to the aleatoric (data)\nuncertainty of the learned dynamics. This controller actively chooses the\noptimal control actions that (i) optimize the control performance and (ii)\nimprove the efficiency of online learning sample collection. We demonstrate the\neffectiveness of our method through a series of challenging real-world\nexperiments using a quadrotor system. 
Our approach showcases high resilience\nand generalization capabilities by consistently adapting to unseen flight\nconditions, while it significantly outperforms classical and adaptive control\nbaselines.\n","authors":["Alessandro Saviolo","Jonathan Frey","Abhishek Rathod","Moritz Diehl","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2210.12583v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.00973v3","updated":"2024-08-31T13:59:23Z","published":"2022-06-02T10:31:45Z","title":"Primal-dual extrapolation methods for monotone inclusions under local\n Lipschitz continuity","summary":" In this paper we consider a class of monotone inclusion (MI) problems of\nfinding a zero of the sum of two monotone operators, in which one operator is\nmaximal monotone while the other is {\\it locally Lipschitz} continuous. We\npropose primal-dual extrapolation methods to solve them using a point and\noperator extrapolation technique, whose parameters are chosen by a backtracking\nline search scheme. The proposed methods enjoy an operation complexity of\n${\\cal O}(\\log \\epsilon^{-1})$ and ${\\cal O}(\\epsilon^{-1}\\log \\epsilon^{-1})$,\nmeasured by the number of fundamental operations consisting only of evaluations\nof one operator and resolvent of the other operator, for finding an\n$\\varepsilon$-residual solution of strongly and non-strongly MI problems,\nrespectively. The latter complexity significantly improves the previously best\noperation complexity ${\\cal O}(\\varepsilon^{-2})$. As a byproduct, complexity\nresults of the primal-dual extrapolation methods are also obtained for finding\nan $\\varepsilon$-KKT or $\\varepsilon$-residual solution of convex conic\noptimization, conic constrained saddle point, and variational inequality\nproblems under {\\it local Lipschitz} continuity. We provide preliminary\nnumerical results to demonstrate the performance of the proposed methods.\n","authors":["Zhaosong Lu","Sanyou Mei"],"pdf_url":"https://arxiv.org/pdf/2206.00973v3.pdf","comment":"To appear in Mathematics of Operations Research"},{"id":"http://arxiv.org/abs/2407.00119v2","updated":"2024-08-31T12:44:38Z","published":"2024-06-27T15:54:12Z","title":"Efficient Long-distance Latent Relation-aware Graph Neural Network for\n Multi-modal Emotion Recognition in Conversations","summary":" The task of multi-modal emotion recognition in conversation (MERC) aims to\nanalyze the genuine emotional state of each utterance based on the multi-modal\ninformation in the conversation, which is crucial for conversation\nunderstanding. Existing methods focus on using graph neural networks (GNN) to\nmodel conversational relationships and capture contextual latent semantic\nrelationships. However, due to the complexity of GNN, existing methods cannot\nefficiently capture the potential dependencies between long-distance\nutterances, which limits the performance of MERC. In this paper, we propose an\nEfficient Long-distance Latent Relation-aware Graph Neural Network (ELR-GNN)\nfor multi-modal emotion recognition in conversations. Specifically, we first\nuse pre-extracted text, video and audio features as input to Bi-LSTM to capture\ncontextual semantic information and obtain low-level utterance features. Then,\nwe use low-level utterance features to construct a conversational emotion\ninteraction graph. 
To efficiently capture the potential dependencies between\nlong-distance utterances, we use the dilated generalized forward push algorithm\nto precompute the emotional propagation between global utterances and design an\nemotional relation-aware operator to capture the potential semantic\nassociations between different utterances. Furthermore, we combine early fusion\nand adaptive late fusion mechanisms to fuse latent dependency information\nbetween speaker relationship information and context. Finally, we obtain\nhigh-level discourse features and feed them into MLP for emotion prediction.\nExtensive experimental results show that ELR-GNN achieves state-of-the-art\nperformance on the benchmark datasets IEMOCAP and MELD, with running times\nreduced by 52\\% and 35\\%, respectively.\n","authors":["Yuntao Shou","Wei Ai","Jiayi Du","Tao Meng","Haiyan Liu","Nan Yin"],"pdf_url":"https://arxiv.org/pdf/2407.00119v2.pdf","comment":"11 pages, 3 tables"},{"id":"http://arxiv.org/abs/2305.18161v2","updated":"2024-08-31T12:28:05Z","published":"2023-05-29T15:44:47Z","title":"VA-learning as a more efficient alternative to Q-learning","summary":" In reinforcement learning, the advantage function is critical for policy\nimprovement, but is often extracted from a learned Q-function. A natural\nquestion is: Why not learn the advantage function directly? In this work, we\nintroduce VA-learning, which directly learns advantage function and value\nfunction using bootstrapping, without explicit reference to Q-functions.\nVA-learning learns off-policy and enjoys similar theoretical guarantees as\nQ-learning. Thanks to the direct learning of advantage function and value\nfunction, VA-learning improves the sample efficiency over Q-learning both in\ntabular implementations and deep RL agents on Atari-57 games. We also identify\na close connection between VA-learning and the dueling architecture, which\npartially explains why a simple architectural change to DQN agents tends to\nimprove performance.\n","authors":["Yunhao Tang","Rémi Munos","Mark Rowland","Michal Valko"],"pdf_url":"https://arxiv.org/pdf/2305.18161v2.pdf","comment":"Accepted to ICML 2023 as a conference paper"},{"id":"http://arxiv.org/abs/2402.11228v2","updated":"2024-08-31T03:23:50Z","published":"2024-02-17T09:10:40Z","title":"Adaptive Split Balancing for Optimal Random Forest","summary":" In this paper, we propose a new random forest algorithm that constructs the\ntrees using a novel adaptive split-balancing method. Rather than relying on the\nwidely-used random feature selection, we propose a permutation-based balanced\nsplitting criterion. The adaptive split balancing forest (ASBF), achieves\nminimax optimality under the Lipschitz class. Its localized version, which fits\nlocal regressions at the leaf level, attains the minimax rate under the broad\nH\\\"older class $\\mathcal{H}^{q,\\beta}$ of problems for any $q\\in\\mathbb{N}$ and\n$\\beta\\in(0,1]$. We identify that over-reliance on auxiliary randomness in tree\nconstruction may compromise the approximation power of trees, leading to\nsuboptimal results. Conversely, the proposed less random, permutation-based\napproach demonstrates optimality over a wide range of models. Although random\nforests are known to perform well empirically, their theoretical convergence\nrates are slow. Simplified versions that construct trees without data\ndependence offer faster rates but lack adaptability during tree growth. 
Our\nproposed method achieves optimality in simple, smooth scenarios while\nadaptively learning the tree structure from the data. Additionally, we\nestablish uniform upper bounds and demonstrate that ASBF improves\ndimensionality dependence in average treatment effect estimation problems.\nSimulation studies and real-world applications demonstrate our methods'\nsuperior performance over existing random forests.\n","authors":["Yuqian Zhang","Weijie Ji","Jelena Bradic"],"pdf_url":"https://arxiv.org/pdf/2402.11228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01105v3","updated":"2024-08-31T02:28:20Z","published":"2024-02-02T02:44:59Z","title":"A Survey for Foundation Models in Autonomous Driving","summary":" The advent of foundation models has revolutionized the fields of natural\nlanguage processing and computer vision, paving the way for their application\nin autonomous driving (AD). This survey presents a comprehensive review of more\nthan 40 research papers, demonstrating the role of foundation models in\nenhancing AD. Large language models contribute to planning and simulation in\nAD, particularly through their proficiency in reasoning, code generation and\ntranslation. In parallel, vision foundation models are increasingly adapted for\ncritical tasks such as 3D object detection and tracking, as well as creating\nrealistic driving scenarios for simulation and testing. Multi-modal foundation\nmodels, integrating diverse inputs, exhibit exceptional visual understanding\nand spatial reasoning, crucial for end-to-end AD. This survey not only provides\na structured taxonomy, categorizing foundation models based on their modalities\nand functionalities within the AD domain but also delves into the methods\nemployed in current research. It identifies the gaps between existing\nfoundation models and cutting-edge AD approaches, thereby charting future\nresearch directions and proposing a roadmap for bridging these gaps.\n","authors":["Haoxiang Gao","Yaqian Li","Kaiwen Long","Ming Yang","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2402.01105v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.06152v2","updated":"2024-08-31T12:32:50Z","published":"2024-08-12T13:48:06Z","title":"Palantir: Towards Efficient Super Resolution for Ultra-high-definition\n Live Streaming","summary":" Neural enhancement through super-resolution (SR) deep neural networks (DNNs)\nopens up new possibilities for ultra-high-definition (UHD) live streaming over\nexisting encoding and networking infrastructure. Yet, the heavy SR DNN\ninference overhead leads to severe deployment challenges. To reduce the\noverhead, existing systems propose to apply DNN-based SR only on carefully\nselected anchor frames while upscaling non-anchor frames via the lightweight\nreusing-based SR approach. However, frame-level scheduling is coarse-grained\nand fails to deliver optimal efficiency. In this work, we propose Palantir, the\nfirst neural-enhanced UHD live streaming system with fine-grained patch-level\nscheduling. Two novel techniques are incorporated into Palantir to select the\nmost beneficial anchor patches and support latency-sensitive UHD live streaming\napplications. 
Firstly, under the guidance of our pioneering and theoretical\nanalysis, Palantir constructs a directed acyclic graph (DAG) for lightweight\nyet accurate SR quality estimation under any possible anchor patch set.\nSecondly, to further optimize the scheduling latency, Palantir improves\nparallelizability by refactoring the computation subprocedure of the estimation\nprocess into a sparse matrix-matrix multiplication operation.\n The evaluation results suggest that Palantir incurs a negligible scheduling\nlatency accounting for less than 5.7% of the end-to-end latency requirement.\nWhen compared to the naive method of applying DNN-based SR on all the frames,\nPalantir can reduce the SR DNN inference overhead by 20 times (or 60 times)\nwhile preserving 54.0-82.6% (or 32.8-64.0%) of the quality gain. When compared\nto the state-of-the-art real-time frame-level scheduling strategy, Palantir can\nreduce the SR DNN inference overhead by 80.1% at most (and 38.4% on average)\nwithout sacrificing the video quality.\n","authors":["Xinqi Jin","Zhui Zhu","Xikai Sun","Fan Dang","Jiangchuan Liu","Jingao Xu","Kebin Liu","Xinlei Chen","Yunhao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.06152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00562v1","updated":"2024-08-31T23:22:30Z","published":"2024-08-31T23:22:30Z","title":"Comparative Analysis of Modality Fusion Approaches for Audio-Visual\n Person Identification and Verification","summary":" Multimodal learning involves integrating information from various modalities\nto enhance learning and comprehension. We compare three modality fusion\nstrategies in person identification and verification by processing two\nmodalities: voice and face. In this paper, a one-dimensional convolutional\nneural network is employed for x-vector extraction from voice, while the\npre-trained VGGFace2 network and transfer learning are utilized for face\nmodality. In addition, gammatonegram is used as speech representation in\nengagement with the Darknet19 pre-trained network. The proposed systems are\nevaluated using the K-fold cross-validation technique on the 118 speakers of\nthe test set of the VoxCeleb2 dataset. The comparative evaluations are done for\nsingle-modality and three proposed multimodal strategies in equal situations.\nResults demonstrate that the feature fusion strategy of gammatonegram and\nfacial features achieves the highest performance, with an accuracy of 98.37% in\nthe person identification task. However, concatenating facial features with the\nx-vector reaches 0.62% for EER in verification tasks.\n","authors":["Aref Farhadipour","Masoumeh Chapariniya","Teodora Vukovic","Volker Dellwo"],"pdf_url":"https://arxiv.org/pdf/2409.00562v1.pdf","comment":"This paper has been submitted to a conference"},{"id":"http://arxiv.org/abs/2409.00552v1","updated":"2024-08-31T22:27:40Z","published":"2024-08-31T22:27:40Z","title":"Digit Recognition using Multimodal Spiking Neural Networks","summary":" Spiking neural networks (SNNs) are the third generation of neural networks\nthat are biologically inspired to process data in a fashion that emulates the\nexchange of signals in the brain. Within the Computer Vision community SNNs\nhave garnered significant attention due in large part to the availability of\nevent-based sensors that produce a spatially resolved spike train in response\nto changes in scene radiance. SNNs are used to process event-based data due to\ntheir neuromorphic nature. 
The proposed work examines the neuromorphic\nadvantage of fusing multiple sensory inputs in classification tasks.\nSpecifically we study the performance of a SNN in digit classification by\npassing in a visual modality branch (Neuromorphic-MNIST [N-MNIST]) and an\nauditory modality branch (Spiking Heidelberg Digits [SHD]) from datasets that\nwere created using event-based sensors to generate a series of time-dependent\nevents. It is observed that multi-modal SNNs outperform unimodal visual and\nunimodal auditory SNNs. Furthermore, it is observed that the process of sensory\nfusion is insensitive to the depth at which the visual and auditory branches\nare combined. This work achieves a 98.43% accuracy on the combined N-MNIST and\nSHD dataset using a multimodal SNN that concatenates the visual and auditory\nbranches at a late depth.\n","authors":["William Bjorndahl","Jack Easton","Austin Modoff","Eric C. Larson","Joseph Camp","Prasanna Rangarajan"],"pdf_url":"https://arxiv.org/pdf/2409.00552v1.pdf","comment":"4 pages, 2 figures, submitted to 2025 IEEE International Conference\n on Acoustics, Speech, and Signal Processing"},{"id":"http://arxiv.org/abs/2409.00486v1","updated":"2024-08-31T15:43:22Z","published":"2024-08-31T15:43:22Z","title":"Multi-scale Multi-instance Visual Sound Localization and Segmentation","summary":" Visual sound localization is a typical and challenging problem that predicts\nthe location of objects corresponding to the sound source in a video. Previous\nmethods mainly used the audio-visual association between global audio and\none-scale visual features to localize sounding objects in each image. Despite\ntheir promising performance, they omitted multi-scale visual features of the\ncorresponding image, and they cannot learn discriminative regions compared to\nground truths. To address this issue, we propose a novel multi-scale\nmulti-instance visual sound localization framework, namely M2VSL, that can\ndirectly learn multi-scale semantic features associated with sound sources from\nthe input image to localize sounding objects. Specifically, our M2VSL leverages\nlearnable multi-scale visual features to align audio-visual representations at\nmulti-level locations of the corresponding image. We also introduce a novel\nmulti-scale multi-instance transformer to dynamically aggregate multi-scale\ncross-modal representations for visual sound localization. We conduct extensive\nexperiments on VGGSound-Instruments, VGG-Sound Sources, and AVSBench\nbenchmarks. The results demonstrate that the proposed M2VSL can achieve\nstate-of-the-art performance on sounding object localization and segmentation.\n","authors":["Shentong Mo","Haofan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00486v1.pdf","comment":null}]},"2024-09-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.02920v1","updated":"2024-09-04T17:59:52Z","published":"2024-09-04T17:59:52Z","title":"RoboTwin: Dual-Arm Robot Benchmark with Generative Digital Twins (early\n version)","summary":" Effective collaboration of dual-arm robots and their tool use capabilities\nare increasingly important areas in the advancement of robotics. These skills\nplay a significant role in expanding robots' ability to operate in diverse\nreal-world environments. However, progress is impeded by the scarcity of\nspecialized training data. This paper introduces RoboTwin, a novel benchmark\ndataset combining real-world teleoperated data with synthetic data from digital\ntwins, designed for dual-arm robotic scenarios. 
Using the COBOT Magic platform,\nwe have collected diverse data on tool usage and human-robot interaction. We\npresent a innovative approach to creating digital twins using AI-generated\ncontent, transforming 2D images into detailed 3D models. Furthermore, we\nutilize large language models to generate expert-level training data and\ntask-specific pose sequences oriented toward functionality. Our key\ncontributions are: 1) the RoboTwin benchmark dataset, 2) an efficient\nreal-to-simulation pipeline, and 3) the use of language models for automatic\nexpert-level data generation. These advancements are designed to address the\nshortage of robotic training data, potentially accelerating the development of\nmore capable and versatile robotic systems for a wide range of real-world\napplications. The project page is available at\nhttps://robotwin-benchmark.github.io/early-version/\n","authors":["Yao Mu","Tianxing Chen","Shijia Peng","Zanxin Chen","Zeyu Gao","Yude Zou","Lunkai Lin","Zhiqiang Xie","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2409.02920v1.pdf","comment":"Project page: https://robotwin-benchmark.github.io/early-version/"},{"id":"http://arxiv.org/abs/2409.02908v1","updated":"2024-09-04T17:48:19Z","published":"2024-09-04T17:48:19Z","title":"Masked Diffusion Models are Secretly Time-Agnostic Masked Models and\n Exploit Inaccurate Categorical Sampling","summary":" Masked diffusion models (MDMs) have emerged as a popular research topic for\ngenerative modeling of discrete data, thanks to their superior performance over\nother discrete diffusion models, and are rivaling the auto-regressive models\n(ARMs) for language modeling tasks. The recent effort in simplifying the masked\ndiffusion framework further leads to alignment with continuous-space diffusion\nmodels and more principled training and sampling recipes. In this paper,\nhowever, we reveal that both training and sampling of MDMs are theoretically\nfree from the time variable, arguably the key signature of diffusion models,\nand are instead equivalent to masked models. The connection on the sampling\naspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we\nshow that the FHS is theoretically equivalent to MDMs' original generation\nprocess while significantly alleviating the time-consuming categorical sampling\nand achieving a 20$\\times$ speedup. In addition, our investigation challenges\nprevious claims that MDMs can surpass ARMs in generative perplexity. We\nidentify, for the first time, an underlying numerical issue, even with the\n32-bit floating-point precision, which results in inaccurate categorical\nsampling. 
We show that the numerical issue lowers the effective temperature\nboth theoretically and empirically, leading to unfair assessments of MDMs'\ngeneration results in the previous literature.\n","authors":["Kaiwen Zheng","Yongxin Chen","Hanzi Mao","Ming-Yu Liu","Jun Zhu","Qinsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02908v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2409.02897v1","updated":"2024-09-04T17:41:19Z","published":"2024-09-04T17:41:19Z","title":"LongCite: Enabling LLMs to Generate Fine-grained Citations in\n Long-context QA","summary":" Though current long-context large language models (LLMs) have demonstrated\nimpressive capacities in answering user questions based on extensive text, the\nlack of citations in their responses makes user verification difficult, leading\nto concerns about their trustworthiness due to their potential hallucinations.\nIn this work, we aim to enable long-context LLMs to generate responses with\nfine-grained sentence-level citations, improving their faithfulness and\nverifiability. We first introduce LongBench-Cite, an automated benchmark for\nassessing current LLMs' performance in Long-Context Question Answering with\nCitations (LQAC), revealing considerable room for improvement. To this end, we\npropose CoF (Coarse to Fine), a novel pipeline that utilizes off-the-shelf LLMs\nto automatically generate long-context QA instances with precise sentence-level\ncitations, and leverage this pipeline to construct LongCite-45k, a large-scale\nSFT dataset for LQAC. Finally, we train LongCite-8B and LongCite-9B using the\nLongCite-45k dataset, successfully enabling their generation of accurate\nresponses and fine-grained sentence-level citations in a single output. The\nevaluation results on LongBench-Cite show that our trained models achieve\nstate-of-the-art citation quality, surpassing advanced proprietary models\nincluding GPT-4o.\n","authors":["jiajie Zhang","Yushi Bai","Xin Lv","Wanjun Gu","Danqing Liu","Minhao Zou","Shulin Cao","Lei Hou","Yuxiao Dong","Ling Feng","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2409.02897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07832v3","updated":"2024-09-04T17:31:00Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. 
Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets. The code is\navailable (https://github.com/batmanlab/Ladder).\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02889v1","updated":"2024-09-04T17:25:21Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness. LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v1.pdf","comment":"19 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.18322v2","updated":"2024-09-04T17:16:05Z","published":"2024-07-01T19:52:41Z","title":"The Need for Guardrails with Large Language Models in Medical\n Safety-Critical Settings: An Artificial Intelligence Application in the\n Pharmacovigilance Ecosystem","summary":" Large language models (LLMs) are useful tools with the capacity for\nperforming specific types of knowledge work at an effective scale. However, LLM\ndeployments in high-risk and safety-critical domains pose unique challenges,\nnotably the issue of ``hallucination,'' where LLMs can generate fabricated\ninformation. This is particularly concerning in settings such as drug safety,\nwhere inaccuracies could lead to patient harm. To mitigate these risks, we have\ndeveloped and demonstrated a proof of concept suite of guardrails specifically\ndesigned to mitigate certain types of hallucinations and errors for drug\nsafety, and potentially applicable to other medical safety-critical contexts.\nThese guardrails include mechanisms to detect anomalous documents to prevent\nthe ingestion of inappropriate data, identify incorrect drug names or adverse\nevent terms, and convey uncertainty in generated content. We integrated these\nguardrails with an LLM fine-tuned for a text-to-text task, which involves\nconverting both structured and unstructured data within adverse event reports\ninto natural language. 
This method was applied to translate individual case\nsafety reports, demonstrating effective application in a pharmacovigilance\nprocessing task. Our guardrail framework offers a set of tools with broad\napplicability across various domains, ensuring LLMs can be safely used in\nhigh-risk situations by eliminating the occurrence of key errors, including the\ngeneration of incorrect pharmacovigilance-related terms, thus adhering to\nstringent regulatory and quality standards in medical safety-critical\nenvironments.\n","authors":["Joe B Hakim","Jeffery L Painter","Darmendra Ramcharran","Vijay Kara","Greg Powell","Paulina Sobczak","Chiho Sato","Andrew Bate","Andrew Beam"],"pdf_url":"https://arxiv.org/pdf/2407.18322v2.pdf","comment":"27 pages, 6 figures, 4 tables and supplementary material provided"},{"id":"http://arxiv.org/abs/2409.02877v1","updated":"2024-09-04T17:01:02Z","published":"2024-09-04T17:01:02Z","title":"Configurable Foundation Models: Building LLMs from a Modular Perspective","summary":" Advancements in LLMs have recently unveiled challenges tied to computational\nefficiency and continual scalability due to their requirements of huge\nparameters, making the applications and evolution of these models on devices\nwith limited computation resources and scenarios requiring various abilities\nincreasingly cumbersome. Inspired by modularity within the human brain, there\nis a growing tendency to decompose LLMs into numerous functional modules,\nallowing for inference with part of modules and dynamic assembly of modules to\ntackle complex tasks, such as mixture-of-experts. To highlight the inherent\nefficiency and composability of the modular approach, we coin the term brick to\nrepresent each functional module, designating the modularized structure as\nconfigurable foundation models. In this paper, we offer a comprehensive\noverview and investigation of the construction, utilization, and limitation of\nconfigurable foundation models. We first formalize modules into emergent bricks\n- functional neuron partitions that emerge during the pre-training phase, and\ncustomized bricks - bricks constructed via additional post-training to improve\nthe capabilities and knowledge of LLMs. Based on diverse functional bricks, we\nfurther present four brick-oriented operations: retrieval and routing, merging,\nupdating, and growing. These operations allow for dynamic configuration of LLMs\nbased on instructions to handle complex tasks. To verify our perspective, we\nconduct an empirical analysis on widely-used LLMs. We find that the FFN layers\nfollow modular patterns with functional specialization of neurons and\nfunctional neuron partitions. Finally, we highlight several open issues and\ndirections for future research. 
Overall, this paper aims to offer a fresh\nmodular perspective on existing LLM research and inspire the future creation of\nmore efficient and scalable foundational models.\n","authors":["Chaojun Xiao","Zhengyan Zhang","Chenyang Song","Dazhi Jiang","Feng Yao","Xu Han","Xiaozhi Wang","Shuo Wang","Yufei Huang","Guanyu Lin","Yingfa Chen","Weilin Zhao","Yuge Tu","Zexuan Zhong","Ao Zhang","Chenglei Si","Khai Hao Moo","Chenyang Zhao","Huimin Chen","Yankai Lin","Zhiyuan Liu","Jingbo Shang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2409.02877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02841v1","updated":"2024-09-04T16:14:05Z","published":"2024-09-04T16:14:05Z","title":"Historical German Text Normalization Using Type- and Token-Based\n Language Modeling","summary":" Historic variations of spelling poses a challenge for full-text search or\nnatural language processing on historical digitized texts. To minimize the gap\nbetween the historic orthography and contemporary spelling, usually an\nautomatic orthographic normalization of the historical source material is\npursued. This report proposes a normalization system for German literary texts\nfrom c. 1700-1900, trained on a parallel corpus. The proposed system makes use\nof a machine learning approach using Transformer language models, combining an\nencoder-decoder model to normalize individual word types, and a pre-trained\ncausal language model to adjust these normalizations within their context. An\nextensive evaluation shows that the proposed system provides state-of-the-art\naccuracy, comparable with a much larger fully end-to-end sentence-based\nnormalization system, fine-tuning a pre-trained Transformer large language\nmodel. However, the normalization of historical text remains a challenge due to\ndifficulties for models to generalize, and the lack of extensive high-quality\nparallel data.\n","authors":["Anton Ehrmanntraut"],"pdf_url":"https://arxiv.org/pdf/2409.02841v1.pdf","comment":"27 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.08763v4","updated":"2024-09-04T16:13:18Z","published":"2024-03-13T17:58:57Z","title":"Simple and Scalable Strategies to Continually Pre-train Large Language\n Models","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to start the process over again once new data becomes available. A much\nmore efficient solution is to continually pre-train these models, saving\nsignificant compute compared to re-training. However, the distribution shift\ninduced by new data typically results in degraded performance on previous data\nor poor adaptation to the new data. In this work, we show that a simple and\nscalable combination of learning rate (LR) re-warming, LR re-decaying, and\nreplay of previous data is sufficient to match the performance of fully\nre-training from scratch on all available data, as measured by the final loss\nand the average score on several language model (LM) evaluation benchmarks.\nSpecifically, we show this for a weak but realistic distribution shift between\ntwo commonly used LLM pre-training datasets (English$\\rightarrow$English) and a\nstronger distribution shift (English$\\rightarrow$German) at the $405$M\nparameter model scale with large dataset sizes (hundreds of billions of\ntokens). Selecting the weak but realistic shift for larger-scale experiments,\nwe also find that our continual learning strategies match the re-training\nbaseline for a 10B parameter LLM. 
Our results demonstrate that LLMs can be\nsuccessfully updated via simple and scalable continual learning strategies,\nmatching the re-training baseline using only a fraction of the compute.\nFinally, inspired by previous work, we propose alternatives to the cosine\nlearning rate schedule that help circumvent forgetting induced by LR re-warming\nand that are not bound to a fixed token budget.\n","authors":["Adam Ibrahim","Benjamin Thérien","Kshitij Gupta","Mats L. Richter","Quentin Anthony","Timothée Lesort","Eugene Belilovsky","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2403.08763v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02840v1","updated":"2024-09-04T16:12:30Z","published":"2024-09-04T16:12:30Z","title":"R2GQA: Retriever-Reader-Generator Question Answering System to Support\n Students Understanding Legal Regulations in Higher Education","summary":" In this article, we propose the R2GQA system, a Retriever-Reader-Generator\nQuestion Answering system, consisting of three main components: Document\nRetriever, Machine Reader, and Answer Generator. The Retriever module employs\nadvanced information retrieval techniques to extract the context of articles\nfrom a dataset of legal regulation documents. The Machine Reader module\nutilizes state-of-the-art natural language understanding algorithms to\ncomprehend the retrieved documents and extract answers. Finally, the Generator\nmodule synthesizes the extracted answers into concise and informative responses\nto questions of students regarding legal regulations. Furthermore, we built the\nViRHE4QA dataset in the domain of university training regulations, comprising\n9,758 question-answer pairs with a rigorous construction process. This is the\nfirst Vietnamese dataset in the higher regulations domain with various types of\nanswers, both extractive and abstractive. In addition, the R2GQA system is the\nfirst system to offer abstractive answers in Vietnamese. This paper discusses\nthe design and implementation of each module within the R2GQA system on the\nViRHE4QA dataset, highlighting their functionalities and interactions.\nFurthermore, we present experimental results demonstrating the effectiveness\nand utility of the proposed system in supporting the comprehension of students\nof legal regulations in higher education settings. In general, the R2GQA system\nand the ViRHE4QA dataset promise to contribute significantly to related\nresearch and help students navigate complex legal documents and regulations,\nempowering them to make informed decisions and adhere to institutional policies\neffectively. Our dataset is available for research purposes.\n","authors":["Phuc-Tinh Pham Do","Duy-Ngoc Dinh Cao","Khanh Quoc Tran","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.02840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02836v1","updated":"2024-09-04T16:02:30Z","published":"2024-09-04T16:02:30Z","title":"Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency\n Discussions by Few-Shot Learning with Large Language Models","summary":" This study performs analysis of Predictive statements, Hope speech, and\nRegret Detection behaviors within cryptocurrency-related discussions,\nleveraging advanced natural language processing techniques. We introduce a\nnovel classification scheme named \"Prediction statements,\" categorizing\ncomments into Predictive Incremental, Predictive Decremental, Predictive\nNeutral, or Non-Predictive categories. 
Employing GPT-4o, a cutting-edge large\nlanguage model, we explore sentiment dynamics across five prominent\ncryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis\nreveals distinct patterns in predictive sentiments, with Matic demonstrating a\nnotably higher propensity for optimistic predictions. Additionally, we\ninvestigate hope and regret sentiments, uncovering nuanced interplay between\nthese emotions and predictive behaviors. Despite encountering limitations\nrelated to data volume and resource availability, our study reports valuable\ndiscoveries concerning investor behavior and sentiment trends within the\ncryptocurrency market, informing strategic decision-making and future research\nendeavors.\n","authors":["Moein Shahiki Tash","Zahra Ahani","Mohim Tash","Olga Kolesnikova","Grigori Sidorov"],"pdf_url":"https://arxiv.org/pdf/2409.02836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02834v1","updated":"2024-09-04T16:00:21Z","published":"2024-09-04T16:00:21Z","title":"CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the\n Mathematics Reasoning of Large Multimodal Models","summary":" Large language models (LLMs) have obtained promising results in mathematical\nreasoning, which is a foundational skill for human intelligence. Most previous\nstudies focus on improving and measuring the performance of LLMs based on\ntextual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few\nresearchers have released English multimodal math datasets (e.g., MATHVISTA and\nMATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In\nthis paper, we release a Chinese multimodal math (CMM-Math) dataset, including\nbenchmark and training parts, to evaluate and enhance the mathematical\nreasoning of LMMs. CMM-Math contains over 28,000 high-quality samples,\nfeaturing a variety of problem types (e.g., multiple-choice, fill-in-the-blank,\nand so on) with detailed solutions across 12 grade levels from elementary to\nhigh school in China. Specifically, the visual context may be present in the\nquestions or opinions, which makes this dataset more challenging. Through\ncomprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math\ndataset face challenges, emphasizing the necessity for further improvements in\nLMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to\nhandle the problems with mixed input of multiple images and text segments. We\ntrain our model using three stages, including foundational pre-training,\nfoundational fine-tuning, and mathematical fine-tuning. The extensive\nexperiments indicate that our model effectively improves math reasoning\nperformance by comparing it with the SOTA LMMs over three multimodal\nmathematical datasets.\n","authors":["Wentao Liu","Qianjun Pan","Yi Zhang","Zhuo Liu","Ji Wu","Jie Zhou","Aimin Zhou","Qin Chen","Bo Jiang","Liang He"],"pdf_url":"https://arxiv.org/pdf/2409.02834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00509v2","updated":"2024-09-04T15:55:22Z","published":"2024-08-31T17:19:30Z","title":"LongRecipe: Recipe for Efficient Long Context Generalization in Large\n Language Models","summary":" Large language models (LLMs) face significant challenges in handling\nlong-context tasks because of their limited effective context window size\nduring pretraining, which restricts their ability to generalize over extended\nsequences. Meanwhile, extending the context window in LLMs through\npost-pretraining is highly resource-intensive. 
To address this, we introduce\nLongRecipe, an efficient training strategy for extending the context window of\nLLMs, including impactful token analysis, position index transformation, and\ntraining optimization strategies. It simulates long-sequence inputs while\nmaintaining training efficiency and significantly improves the model's\nunderstanding of long-range dependencies. Experiments on three types of LLMs\nshow that LongRecipe can utilize long sequences while requiring only 30% of the\ntarget context window size, and reduces computational training resources by over\n85% compared to full sequence training. Furthermore, LongRecipe also preserves\nthe original LLM's capabilities in general tasks. Ultimately, we can extend the\neffective context window of open-source LLMs from 8k to 128k, achieving\nperformance close to GPT-4 with just one day of dedicated training using a\nsingle GPU with 80G memory. Our code is released at\nhttps://github.com/zhiyuanhubj/LongRecipe.\n","authors":["Zhiyuan Hu","Yuliang Liu","Jinman Zhao","Suyuchen Wang","Yan Wang","Wei Shen","Qing Gu","Anh Tuan Luu","See-Kiong Ng","Zhiwei Jiang","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2409.00509v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2405.04346v2","updated":"2024-09-04T15:48:40Z","published":"2024-05-07T14:23:22Z","title":"Revisiting Character-level Adversarial Attacks for Language Models","summary":" Adversarial attacks in Natural Language Processing apply perturbations at the\ncharacter or token level. Token-level attacks, gaining prominence for their\nuse of gradient-based methods, are susceptible to altering sentence semantics,\nleading to invalid adversarial examples. While character-level attacks easily\nmaintain semantics, they have received less attention as they cannot easily\nadopt popular gradient-based methods, and are thought to be easy to defend.\nChallenging these beliefs, we introduce Charmer, an efficient query-based\nadversarial attack capable of achieving high attack success rate (ASR) while\ngenerating highly similar adversarial examples. Our method successfully targets\nboth small (BERT) and large (Llama 2) models. Specifically, on BERT with SST-2,\nCharmer improves the ASR by 4.84 percentage points and the USE similarity by 8 points\nwith respect to the prior art. Our implementation is available at\nhttps://github.com/LIONS-EPFL/Charmer.\n","authors":["Elias Abad Rocamora","Yongtao Wu","Fanghui Liu","Grigorios G. Chrysos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2405.04346v2.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2408.15778v2","updated":"2024-09-04T15:35:15Z","published":"2024-08-28T13:16:41Z","title":"LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language\n Models","summary":" Large Language Models (LLMs) have demonstrated notable capabilities across\nvarious tasks, showcasing complex problem-solving abilities. Understanding and\nexecuting complex rules, along with multi-step planning, are fundamental to\nlogical reasoning and critical for practical LLM agents and decision-making\nsystems. However, evaluating LLMs as effective rule-based executors and\nplanners remains underexplored. In this paper, we introduce LogicGame, a novel\nbenchmark designed to evaluate the comprehensive rule understanding, execution,\nand planning capabilities of LLMs. 
Unlike traditional benchmarks, LogicGame\nprovides diverse games that contain a series of rules with an initial state,\nrequiring models to comprehend and apply predefined regulations to solve\nproblems. We create simulated scenarios in which models execute or plan\noperations to achieve specific outcomes. These game scenarios are specifically\ndesigned to distinguish logical reasoning from mere knowledge by relying\nexclusively on predefined rules. This separation allows for a pure assessment\nof rule-based reasoning capabilities. The evaluation considers not only final\noutcomes but also intermediate steps, providing a comprehensive assessment of\nmodel performance. Moreover, these intermediate steps are deterministic and can\nbe automatically verified. LogicGame defines game scenarios with varying\ndifficulty levels, from simple rule applications to complex reasoning chains,\nin order to offer a precise evaluation of model performance on rule\nunderstanding and multi-step execution. Utilizing LogicGame, we test various\nLLMs and identify notable shortcomings in their rule-based logical reasoning\nabilities.\n","authors":["Jiayi Gui","Yiming Liu","Jiale Cheng","Xiaotao Gu","Xiao Liu","Hongning Wang","Yuxiao Dong","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2408.15778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02813v1","updated":"2024-09-04T15:31:26Z","published":"2024-09-04T15:31:26Z","title":"MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding\n Benchmark","summary":" This paper introduces MMMU-Pro, a robust version of the Massive\nMulti-discipline Multimodal Understanding and Reasoning (MMMU) benchmark.\nMMMU-Pro rigorously assesses multimodal models' true understanding and\nreasoning capabilities through a three-step process based on MMMU: (1)\nfiltering out questions answerable by text-only models, (2) augmenting\ncandidate options, and (3) introducing a vision-only input setting where\nquestions are embedded within images. This setting challenges AI to truly \"see\"\nand \"read\" simultaneously, testing a fundamental human cognitive skill of\nseamlessly integrating visual and textual information. Results show that model\nperformance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8%\nto 26.9% across models. We explore the impact of OCR prompts and Chain of\nThought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT\ngenerally improves performance. MMMU-Pro provides a more rigorous evaluation\ntool, closely mimicking real-world scenarios and offering valuable directions\nfor future research in multimodal AI.\n","authors":["Xiang Yue","Tianyu Zheng","Yuansheng Ni","Yubo Wang","Kai Zhang","Shengbang Tong","Yuxuan Sun","Ming Yin","Botao Yu","Ge Zhang","Huan Sun","Yu Su","Wenhu Chen","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2409.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08349v3","updated":"2024-09-04T15:19:35Z","published":"2023-11-14T17:48:19Z","title":"AI-generated text boundary detection with RoFT","summary":" Due to the rapid development of large language models, people increasingly\noften encounter texts that may start as written by a human but continue as\nmachine-generated. Detecting the boundary between human-written and\nmachine-generated parts of such texts is a challenging problem that has not\nreceived much attention in literature. 
We attempt to bridge this gap and\nexamine several ways to adapt state of the art artificial text detection\nclassifiers to the boundary detection setting. We push all detectors to their\nlimits, using the Real or Fake text benchmark that contains short texts on\nseveral topics and includes generations of various language models. We use this\ndiversity to deeply examine the robustness of all detectors in cross-domain and\ncross-model settings to provide baselines and insights for future research. In\nparticular, we find that perplexity-based approaches to boundary detection tend\nto be more robust to peculiarities of domain-specific data than supervised\nfine-tuning of the RoBERTa model; we also find which features of the text\nconfuse boundary detection algorithms and negatively influence their\nperformance in cross-domain settings.\n","authors":["Laida Kushnareva","Tatiana Gaintseva","German Magai","Serguei Barannikov","Dmitry Abulkhanov","Kristian Kuznetsov","Eduard Tulchinskii","Irina Piontkovskaya","Sergey Nikolenko"],"pdf_url":"https://arxiv.org/pdf/2311.08349v3.pdf","comment":"Our official repository:\n https://github.com/SilverSolver/ai_boundary_detection"},{"id":"http://arxiv.org/abs/2409.02795v1","updated":"2024-09-04T15:11:55Z","published":"2024-09-04T15:11:55Z","title":"Towards a Unified View of Preference Learning for Large Language Models:\n A Survey","summary":" Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of\nthe crucial factors to achieve success is aligning the LLM's output with human\npreferences. This alignment process often requires only a small amount of data\nto efficiently enhance the LLM's performance. While effective, research in this\narea spans multiple domains, and the methods involved are relatively complex to\nunderstand. The relationships between different methods have been\nunder-explored, limiting the development of the preference alignment. In light\nof this, we break down the existing popular alignment strategies into different\ncomponents and provide a unified framework to study the current alignment\nstrategies, thereby establishing connections among them. In this survey, we\ndecompose all the strategies in preference learning into four components:\nmodel, data, feedback, and algorithm. This unified view offers an in-depth\nunderstanding of existing alignment algorithms and also opens up possibilities\nto synergize the strengths of different strategies. Furthermore, we present\ndetailed working examples of prevalent existing algorithms to facilitate a\ncomprehensive understanding for the readers. Finally, based on our unified\nperspective, we explore the challenges and future research directions for\naligning large language models with human preferences.\n","authors":["Bofei Gao","Feifan Song","Yibo Miao","Zefan Cai","Zhe Yang","Liang Chen","Helan Hu","Runxin Xu","Qingxiu Dong","Ce Zheng","Wen Xiao","Ge Zhang","Daoguang Zan","Keming Lu","Bowen Yu","Dayiheng Liu","Zeyu Cui","Jian Yang","Lei Sha","Houfeng Wang","Zhifang Sui","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2409.02795v1.pdf","comment":"Initial Commit, 21 pages"},{"id":"http://arxiv.org/abs/2409.00105v2","updated":"2024-09-04T14:40:14Z","published":"2024-08-27T14:40:16Z","title":"Negation Blindness in Large Language Models: Unveiling the NO Syndrome\n in Image Generation","summary":" Foundational Large Language Models (LLMs) have changed the way we perceive\ntechnology. 
They have been shown to excel in tasks ranging from poem writing\nand coding to essay generation and puzzle solving. With the incorporation of\nimage generation capability, they have become more comprehensive and versatile\nAI tools. At the same time, researchers are striving to identify the\nlimitations of these tools to improve them further. Currently identified flaws\ninclude hallucination, biases, and bypassing restricted commands to generate\nharmful content. In the present work, we have identified a fundamental\nlimitation related to the image generation ability of LLMs, and termed it The\nNO Syndrome. This negation blindness refers to LLMs' inability to correctly\ncomprehend NO-related natural language prompts to generate the desired images.\nInterestingly, all tested LLMs including GPT-4, Gemini, and Copilot were found\nto be suffering from this syndrome. To demonstrate the generalization of this\nlimitation, we carried out simulation experiments and conducted entropy-based\nand benchmark statistical analysis tests on various LLMs in multiple languages,\nincluding English, Hindi, and French. We conclude that the NO syndrome is a\nsignificant flaw in current LLMs that needs to be addressed. A related finding\nof this study showed a consistent discrepancy between image and textual\nresponses as a result of this NO syndrome. We posit that the introduction of a\nnegation context-aware reinforcement learning based feedback loop between the\nLLM's textual response and generated image could help ensure the generated text\nis based on both the LLM's correct contextual understanding of the negation\nquery and the generated visual output.\n","authors":["Mohammad Nadeem","Shahab Saquib Sohail","Erik Cambria","Björn W. Schuller","Amir Hussain"],"pdf_url":"https://arxiv.org/pdf/2409.00105v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.02751v1","updated":"2024-09-04T14:30:13Z","published":"2024-09-04T14:30:13Z","title":"A Comparative Study of Pre-training and Self-training","summary":" Pre-training and self-training are two approaches to semi-supervised\nlearning. The comparison between pre-training and self-training has been\nexplored. However, the previous works led to confusing findings: self-training\noutperforms pre-training on some tasks in computer vision, and,\nconversely, pre-training outperforms self-training on some tasks in\nnatural language processing, under certain conditions of incomparable settings.\nWe propose, comparatively and exhaustively, an ensemble method to empirically\nstudy all feasible training paradigms combining pre-training, self-training,\nand fine-tuning within consistent foundational settings comparable to data\naugmentation. We conduct experiments on six datasets, four data augmentation\nmethods, and imbalanced data for sentiment analysis and natural language inference\ntasks. Our findings confirm that the pre-training and fine-tuning paradigm\nyields the best overall performance. 
Moreover, self-training offers no\nadditional benefits when combined with semi-supervised pre-training.\n","authors":["Yiheng Wang","Jiayu Lin","Zuoquan Lin"],"pdf_url":"https://arxiv.org/pdf/2409.02751v1.pdf","comment":"19 pages, 2 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.04183v2","updated":"2024-09-04T14:07:07Z","published":"2024-07-04T23:05:58Z","title":"Seeing Like an AI: How LLMs Apply (and Misapply) Wikipedia Neutrality\n Norms","summary":" Large language models (LLMs) are trained on broad corpora and then used in\ncommunities with specialized norms. Is providing LLMs with community rules\nenough for models to follow these norms? We evaluate LLMs' capacity to detect\n(Task 1) and correct (Task 2) biased Wikipedia edits according to Wikipedia's\nNeutral Point of View (NPOV) policy. LLMs struggled with bias detection,\nachieving only 64% accuracy on a balanced dataset. Models exhibited contrasting\nbiases (some under- and others over-predicted bias), suggesting distinct priors\nabout neutrality. LLMs performed better at generation, removing 79% of words\nremoved by Wikipedia editors. However, LLMs made additional changes beyond\nWikipedia editors' simpler neutralizations, resulting in high-recall but\nlow-precision editing. Interestingly, crowdworkers rated AI rewrites as more\nneutral (70%) and fluent (61%) than Wikipedia-editor rewrites. Qualitative\nanalysis found LLMs sometimes applied NPOV more comprehensively than Wikipedia\neditors but often made extraneous non-NPOV-related changes (such as grammar).\nLLMs may apply rules in ways that resonate with the public but diverge from\ncommunity experts. While potentially effective for generation, LLMs may reduce\neditor agency and increase moderation workload (e.g., verifying additions).\nEven when rules are easy to articulate, having LLMs apply them like community\nmembers may still be difficult.\n","authors":["Joshua Ashkinaze","Ruijia Guan","Laura Kurek","Eytan Adar","Ceren Budak","Eric Gilbert"],"pdf_url":"https://arxiv.org/pdf/2407.04183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02727v1","updated":"2024-09-04T14:01:48Z","published":"2024-09-04T14:01:48Z","title":"Pooling And Attention: What Are Effective Designs For LLm-Based\n Embedding Models?","summary":" The significant advancements of Large Language Models (LLMs) in generative\ntasks have led to a growing body of work exploring LLM-based embedding models.\nWhile these models, employing different pooling and attention strategies, have\nachieved state-of-the-art performance on public embedding benchmarks, questions\nstill arise about what constitutes an effective design for LLM-based embedding\nmodels. However, these models are often trained on different datasets, using\ndifferent LLM base models or training settings. Moreover, evaluations on public\nembedding benchmarks often fail to report statistical significance, making it\ndifficult to determine which designs truly contribute to final performance.\nThis complicates the process for practitioners seeking optimal training recipes\nfor LLM-based embedding models. In this study, we conduct a large-scale\nexperiment by training a series of LLM-based embedding models using the same\ntraining data and base model but differing in their pooling and attention\nstrategies. 
The results show that there is no one-size-fits-all solution: while\nbidirectional attention and an additional trainable pooling layer outperform in\ntext similarity and information retrieval tasks, they do not significantly\nsurpass simpler designs like EOS-last token pooling and default causal\nattention in clustering and classification tasks. Furthermore, we propose a new\npooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs\nof all hidden layers, rather than just the last layer, using a cross-attention\nnetwork. This method proves to be statistically superior in text similarity and\nretrieval tasks compared to existing pooling methods. Overall, this paper sheds\nlight on effective training strategies for LLM-based embedding models.\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02727v1.pdf","comment":"https://github.com/yixuantt/PoolingAndAttn"},{"id":"http://arxiv.org/abs/2409.02725v1","updated":"2024-09-04T13:59:48Z","published":"2024-09-04T13:59:48Z","title":"Pre-training data selection for biomedical domain adaptation using\n journal impact metrics","summary":" Domain adaptation is a widely used method in natural language processing\n(NLP) to improve the performance of a language model within a specific domain.\nThis method is particularly common in the biomedical domain, which sees regular\npublication of numerous scientific articles. PubMed, a significant corpus of\ntext, is frequently used in the biomedical domain. The primary objective of\nthis study is to explore whether refining a pre-training dataset using specific\nquality metrics for scientific papers can enhance the performance of the\nresulting model. To accomplish this, we employ two straightforward journal\nimpact metrics and conduct experiments by continually pre-training BERT on\nvarious subsets of the complete PubMed training set, and we then evaluate the\nresulting models on biomedical language understanding tasks from the BLURB\nbenchmark. Our results show that pruning using journal impact metrics is not\nefficient. However, we also show that pre-training using fewer abstracts (but with\nthe same number of training steps) does not necessarily decrease the resulting\nmodel's performance.\n","authors":["Mathieu Laï-king","Patrick Paroubek"],"pdf_url":"https://arxiv.org/pdf/2409.02725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02718v1","updated":"2024-09-04T13:54:38Z","published":"2024-09-04T13:54:38Z","title":"Alignment-Aware Model Extraction Attacks on Large Language Models","summary":" Model extraction attacks (MEAs) on large language models (LLMs) have received\nincreasing research attention lately. Existing attack methods on LLMs inherit\nthe extraction strategies from those designed for deep neural networks (DNNs)\nyet neglect the inconsistency of training tasks between MEA and LLMs'\nalignments. As such, they result in poor attack performance. To tackle this\nissue, we present Locality Reinforced Distillation (LoRD), a novel model\nextraction attack algorithm specifically for LLMs. In particular, we design a\npolicy-gradient-style training task, which utilizes victim models' responses as\na signal to guide the crafting of preference for the local model. Theoretical\nanalysis has shown that i) LoRD's convergence procedure in MEAs is consistent\nwith the alignments of LLMs, and ii) LoRD can reduce query complexity while\nmitigating watermark protection through exploration-based stealing. 
Extensive\nexperiments on domain-specific extractions demonstrate the superiority of our\nmethod by examining the extraction of various state-of-the-art commercial LLMs.\n","authors":["Zi Liang","Qingqing Ye","Yanyun Wang","Sen Zhang","Yaxin Xiao","Ronghua Li","Jianliang Xu","Haibo Hu"],"pdf_url":"https://arxiv.org/pdf/2409.02718v1.pdf","comment":"Source code: https://github.com/liangzid/alignmentExtraction"},{"id":"http://arxiv.org/abs/2409.02712v1","updated":"2024-09-04T13:49:45Z","published":"2024-09-04T13:49:45Z","title":"A Data Selection Approach for Enhancing Low Resource Machine Translation\n Using Cross-Lingual Sentence Representations","summary":" Machine translation in low-resource language pairs faces significant\nchallenges due to the scarcity of parallel corpora and linguistic resources.\nThis study focuses on the case of English-Marathi language pairs, where\nexisting datasets are notably noisy, impeding the performance of machine\ntranslation models. To mitigate the impact of data quality issues, we propose a\ndata filtering approach based on cross-lingual sentence representations. Our\nmethodology leverages a multilingual SBERT model to filter out problematic\ntranslations in the training data. Specifically, we employ an IndicSBERT\nsimilarity model to assess the semantic equivalence between original and\ntranslated sentences, allowing us to retain linguistically correct translations\nwhile discarding instances with substantial deviations. The results demonstrate\na significant improvement in translation quality over the baseline\npost-filtering with IndicSBERT. This illustrates how cross-lingual sentence\nrepresentations can reduce errors in machine translation scenarios with limited\nresources. By integrating multilingual sentence BERT models into the\ntranslation pipeline, this research contributes to advancing machine\ntranslation techniques in low-resource environments. The proposed method not\nonly addresses the challenges in English-Marathi language pairs but also\nprovides a valuable framework for enhancing translation quality in other\nlow-resource language translation tasks.\n","authors":["Nidhi Kowtal","Tejas Deshpande","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.02712v1.pdf","comment":"Accepted at I2CT 2024"},{"id":"http://arxiv.org/abs/2405.04160v2","updated":"2024-09-04T13:29:56Z","published":"2024-05-07T09:55:05Z","title":"A Causal Explainable Guardrails for Large Language Models","summary":" Large Language Models (LLMs) have shown impressive performance in natural\nlanguage tasks, but their outputs can exhibit undesirable attributes or biases.\nExisting methods for steering LLMs toward desired attributes often assume\nunbiased representations and rely solely on steering prompts. However, the\nrepresentations learned from pre-training can introduce semantic biases that\ninfluence the steering process, leading to suboptimal results. We propose\nLLMGuardrail, a novel framework that incorporates causal analysis and\nadversarial learning to obtain unbiased steering representations in LLMs.\nLLMGuardrail systematically identifies and blocks the confounding effects of\nbiases, enabling the extraction of unbiased steering representations.\nAdditionally, it includes an explainable component that provides insights into\nthe alignment between the generated output and the desired direction.\nExperiments demonstrate LLMGuardrail's effectiveness in steering LLMs toward\ndesired attributes while mitigating biases. 
Our work contributes to the\ndevelopment of safe and reliable LLMs that align with desired attributes.\n","authors":["Zhixuan Chu","Yan Wang","Longfei Li","Zhibo Wang","Zhan Qin","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2405.04160v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2409.02690v1","updated":"2024-09-04T13:23:50Z","published":"2024-09-04T13:23:50Z","title":"Detecting Calls to Action in Multimodal Content: Analysis of the 2021\n German Federal Election Campaign on Instagram","summary":" This study investigates the automated classification of Calls to Action\n(CTAs) within the 2021 German Instagram election campaign to advance the\nunderstanding of mobilization in social media contexts. We analyzed over 2,208\nInstagram stories and 712 posts using fine-tuned BERT models and OpenAI's GPT-4\nmodels. The fine-tuned BERT model incorporating synthetic training data\nachieved a macro F1 score of 0.93, demonstrating a robust classification\nperformance. Our analysis revealed that 49.58% of Instagram posts and 10.64% of\nstories contained CTAs, highlighting significant differences in mobilization\nstrategies between these content types. Additionally, we found that FDP and the\nGreens had the highest prevalence of CTAs in posts, whereas CDU and CSU led in\nstory CTAs.\n","authors":["Michael Achmann-Denkler","Jakob Fehle","Mario Haim","Christian Wolff"],"pdf_url":"https://arxiv.org/pdf/2409.02690v1.pdf","comment":"Accepted Archival Paper for the CPSS Workshop at KONVENS 2024. Camera\n Ready Submission"},{"id":"http://arxiv.org/abs/2409.02686v1","updated":"2024-09-04T13:17:09Z","published":"2024-09-04T13:17:09Z","title":"Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for\n Problem-Solving Improvement of LLMs","summary":" Large Language Models (LLMs) have demonstrated remarkable efficiency in\ntackling various tasks based on human instructions, but recent studies reveal\nthat these models often fail to achieve satisfactory results on questions\ninvolving reasoning, such as mathematics or physics questions. This phenomenon\nis usually attributed to the uncertainty regarding whether these models could\ngenuinely comprehend the knowledge embedded in the text or merely learn to\nreplicate the token distribution without a true understanding of the content.\nIn this paper, we delve into this problem and aim to enhance the reasoning\ncapabilities of LLMs. First, we investigate if the model has genuine reasoning\ncapabilities by visualizing the text generation process at the attention and\nrepresentation level. Then, we formulate the reasoning process of LLMs into a\ncausal framework, which provides a formal explanation of the problems we\nobserve in the visualization. Finally, building upon this causal framework, we\npropose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient\nfine-tuning (PEFT) method to enhance the model's reasoning capabilities by\nencouraging the model to extract the general problem-solving skills and apply\nthese skills to different questions. Experiments show that our method\noutperforms the baseline consistently across multiple benchmarks, and with only\n1.2M tunable parameters, we achieve better or comparable results to other\nfine-tuning methods. 
This demonstrates the effectiveness and efficiency of our\nmethod in improving the overall accuracy and reliability of LLMs.\n","authors":["Ruoyu Wang","Xiaoxuan Li","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11850v2","updated":"2024-09-04T13:14:57Z","published":"2024-08-13T08:32:06Z","title":"Parallel Speculative Decoding with Adaptive Draft Length","summary":" Speculative decoding (SD), where an extra draft model is employed to provide\nmultiple \\textit{draft} tokens first and then the original target model\nverifies these tokens in parallel, has shown great power for LLM inference\nacceleration. However, existing SD methods suffer from the mutual waiting\nproblem, i.e., the target model gets stuck when the draft model is\n\\textit{guessing} tokens, and vice versa. This problem is directly incurred by\nthe asynchronous execution of the draft model and the target model, and is\nexacerbated due to the fixed draft length in speculative decoding. To address\nthese challenges, we propose a conceptually simple, flexible, and general\nframework to boost speculative decoding, namely \\textbf{P}arallel\nsp\\textbf{E}culative decoding with \\textbf{A}daptive d\\textbf{R}aft\n\\textbf{L}ength (PEARL). Specifically, PEARL proposes \\textit{pre-verify} to\nverify the first draft token in advance during the drafting phase, and\n\\textit{post-verify} to generate more draft tokens during the verification\nphase. PEARL parallels the drafting phase and the verification phase via\napplying the two strategies, and achieves adaptive draft length for different\nscenarios, which effectively alleviates the mutual waiting problem. Moreover,\nwe theoretically demonstrate that the mean number of accepted tokens of PEARL is larger\nthan that of existing \\textit{draft-then-verify} works. Experiments on various text\ngeneration benchmarks demonstrate the effectiveness of PEARL, leading to a\nsuperior speedup of up to \\textbf{3.79$\\times$} and\n\\textbf{1.52$\\times$}, compared to auto-regressive decoding and vanilla\nspeculative decoding, respectively.\n","authors":["Tianyu Liu","Yun Li","Qitan Lv","Kai Liu","Jianchen Zhu","Winston Hu"],"pdf_url":"https://arxiv.org/pdf/2408.11850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02667v1","updated":"2024-09-04T12:48:30Z","published":"2024-09-04T12:48:30Z","title":"Creating Domain-Specific Translation Memories for Machine Translation\n Fine-tuning: The TRENCARD Bilingual Cardiology Corpus","summary":" This article investigates how translation memories (TM) can be created by\ntranslators or other language professionals in order to compile domain-specific\nparallel corpora, which can then be used in different scenarios, such as\nmachine translation training and fine-tuning, TM leveraging, and/or large\nlanguage model fine-tuning. The article introduces a semi-automatic TM\npreparation methodology leveraging primarily translation tools used by\ntranslators in favor of data quality and control by the translators. This\nsemi-automatic methodology is then used to build a cardiology-based Turkish ->\nEnglish corpus from bilingual abstracts of Turkish cardiology journals. The\nresulting corpus, called the TRENCARD Corpus, has approximately 800,000 source words\nand 50,000 sentences. 
Using this methodology, translators can build their\ncustom TMs in a reasonable time and use them in their bilingual data requiring\ntasks.\n","authors":["Gokhan Dogru"],"pdf_url":"https://arxiv.org/pdf/2409.02667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09979v2","updated":"2024-09-04T12:33:24Z","published":"2024-06-14T12:41:07Z","title":"HIRO: Hierarchical Information Retrieval Optimization","summary":" Retrieval-Augmented Generation (RAG) has revolutionized natural language\nprocessing by dynamically integrating external knowledge into Large Language\nModels (LLMs), addressing their limitation of static training datasets. Recent\nimplementations of RAG leverage hierarchical data structures, which organize\ndocuments at various levels of summarization and information density. This\ncomplexity, however, can cause LLMs to \"choke\" on information overload,\nnecessitating more sophisticated querying mechanisms. In this context, we\nintroduce Hierarchical Information Retrieval Optimization (HIRO), a novel\nquerying approach that employs a Depth-First Search (DFS)-based recursive\nsimilarity score calculation and branch pruning. This method uniquely minimizes\nthe context delivered to the LLM without informational loss, effectively\nmanaging the challenge of excessive data. HIRO's refined approach is validated\nby a 10.85% improvement in performance on the NarrativeQA dataset.\n","authors":["Krish Goel","Mahek Chandak"],"pdf_url":"https://arxiv.org/pdf/2406.09979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02649v1","updated":"2024-09-04T12:26:26Z","published":"2024-09-04T12:26:26Z","title":"OpenFact at CheckThat! 2024: Combining Multiple Attack Methods for\n Effective Adversarial Text Generation","summary":" This paper presents the experiments and results for the CheckThat! Lab at\nCLEF 2024 Task 6: Robustness of Credibility Assessment with Adversarial\nExamples (InCrediblAE). The primary objective of this task was to generate\nadversarial examples in five problem domains in order to evaluate the\nrobustness of widely used text classification methods (fine-tuned BERT, BiLSTM,\nand RoBERTa) when applied to credibility assessment issues.\n This study explores the application of ensemble learning to enhance\nadversarial attacks on natural language processing (NLP) models. We\nsystematically tested and refined several adversarial attack methods, including\nBERT-Attack, Genetic algorithms, TextFooler, and CLARE, on five datasets across\nvarious misinformation tasks. By developing modified versions of BERT-Attack\nand hybrid methods, we achieved significant improvements in attack\neffectiveness. Our results demonstrate the potential of modification and\ncombining multiple methods to create more sophisticated and effective\nadversarial attack strategies, contributing to the development of more robust\nand secure systems.\n","authors":["Włodzimierz Lewoniewski","Piotr Stolarski","Milena Stróżyna","Elzbieta Lewańska","Aleksandra Wojewoda","Ewelina Księżniak","Marcin Sawiński"],"pdf_url":"https://arxiv.org/pdf/2409.02649v1.pdf","comment":"CLEF 2024 - Conference and Labs of the Evaluation Forum"},{"id":"http://arxiv.org/abs/2409.02645v1","updated":"2024-09-04T12:22:05Z","published":"2024-09-04T12:22:05Z","title":"A Survey on Emergent Language","summary":" The field of emergent language represents a novel area of research within the\ndomain of artificial intelligence, particularly within the context of\nmulti-agent reinforcement learning. 
Although the concept of studying language\nemergence is not new, early approaches were primarily concerned with explaining\nhuman language formation, with little consideration given to its potential\nutility for artificial agents. In contrast, studies based on reinforcement\nlearning aim to develop communicative capabilities in agents that are\ncomparable to or even superior to human language. Thus, they extend beyond the\nlearned statistical representations that are common in natural language\nprocessing research. This gives rise to a number of fundamental questions, from\nthe prerequisites for language emergence to the criteria for measuring its\nsuccess. This paper addresses these questions by providing a comprehensive\nreview of 181 scientific publications on emergent language in artificial\nintelligence. Its objective is to serve as a reference for researchers\ninterested in or proficient in the field. Consequently, the main contributions\nare the definition and overview of the prevailing terminology, the analysis of\nexisting evaluation methods and metrics, and the description of the identified\nresearch gaps.\n","authors":["Jannik Peters","Constantin Waubert de Puiseau","Hasan Tercan","Arya Gopikrishnan","Gustavo Adolpho Lucas De Carvalho","Christian Bitter","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.02645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00208v3","updated":"2024-09-04T11:48:04Z","published":"2023-11-01T00:38:26Z","title":"What Formal Languages Can Transformers Express? A Survey","summary":" As transformers have gained prominence in natural language processing, some\nresearchers have investigated theoretically what problems they can and cannot\nsolve, by treating problems as formal languages. Exploring such questions can\nhelp clarify the power of transformers relative to other models of computation,\ntheir fundamental capabilities and limits, and the impact of architectural\nchoices. Work in this subarea has made considerable progress in recent years.\nHere, we undertake a comprehensive survey of this work, documenting the diverse\nassumptions that underlie different results and providing a unified framework\nfor harmonizing seemingly contradictory findings.\n","authors":["Lena Strobl","William Merrill","Gail Weiss","David Chiang","Dana Angluin"],"pdf_url":"https://arxiv.org/pdf/2311.00208v3.pdf","comment":"One minor correction in {\\S}5.1"},{"id":"http://arxiv.org/abs/2308.07107v4","updated":"2024-09-04T11:39:56Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). 
Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zheng Liu","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v4.pdf","comment":"updated to version 3"},{"id":"http://arxiv.org/abs/2409.02617v1","updated":"2024-09-04T11:19:17Z","published":"2024-09-04T11:19:17Z","title":"PUB: Plot Understanding Benchmark and Dataset for Evaluating Large\n Language Models on Synthetic Visual Data Interpretation","summary":" The ability of large language models (LLMs) to interpret visual\nrepresentations of data is crucial for advancing their application in data\nanalysis and decision-making processes. This paper presents a novel synthetic\ndataset designed to evaluate the proficiency of LLMs in interpreting various\nforms of data visualizations, including plots like time series, histograms,\nviolins, boxplots, and clusters. Our dataset is generated using controlled\nparameters to ensure comprehensive coverage of potential real-world scenarios.\nWe employ multimodal text prompts with questions related to visual data in\nimages to benchmark several state-of-the-art models like ChatGPT or Gemini,\nassessing their understanding and interpretative accuracy.\n To ensure data integrity, our benchmark dataset is generated automatically,\nmaking it entirely new and free from prior exposure to the models being tested.\nThis strategy allows us to evaluate the models' ability to truly interpret and\nunderstand the data, eliminating possibility of pre-learned responses, and\nallowing for an unbiased evaluation of the models' capabilities. We also\nintroduce quantitative metrics to assess the performance of the models,\nproviding a robust and comprehensive evaluation tool.\n Benchmarking several state-of-the-art LLMs with this dataset reveals varying\ndegrees of success, highlighting specific strengths and weaknesses in\ninterpreting diverse types of visual data. The results provide valuable\ninsights into the current capabilities of LLMs and identify key areas for\nimprovement. This work establishes a foundational benchmark for future research\nand development aimed at enhancing the visual interpretative abilities of\nlanguage models. 
In the future, improved LLMs with robust visual interpretation\nskills can significantly aid in automated data analysis, scientific research,\neducational tools, and business intelligence applications.\n","authors":["Aneta Pawelec","Victoria Sara Wesołowska","Zuzanna Bączek","Piotr Sankowski"],"pdf_url":"https://arxiv.org/pdf/2409.02617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00016v2","updated":"2024-09-04T10:42:35Z","published":"2024-07-28T15:45:08Z","title":"Towards a Universal Method for Meaningful Signal Detection","summary":" It is known that human speech and certain animal vocalizations can convey\nmeaningful content because we can decipher the content that a given utterance\ndoes convey. This paper explores an alternative approach to determining whether\na signal is meaningful, one that analyzes only the signal itself and is\nindependent of what the conveyed meaning might be. We devise a method that\ntakes a waveform as input and outputs a score indicating its degree of\n`meaningfulness`. We cluster contiguous portions of the input to minimize the\ntotal description length, and then take the length of the code of the assigned\ncluster labels as the meaningfulness score. We evaluate our method empirically,\nagainst several baselines, and show that it is the only one to give a high\nscore to human speech in various languages and with various speakers, a\nmoderate score to animal vocalizations from birds and orcas, and a low score to\nambient noise from various sources.\n","authors":["Louis Mahon"],"pdf_url":"https://arxiv.org/pdf/2408.00016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02596v1","updated":"2024-09-04T10:27:07Z","published":"2024-09-04T10:27:07Z","title":"An Analysis of Linear Complexity Attention Substitutes with BEST-RQ","summary":" Self-Supervised Learning (SSL) has proven to be effective in various domains,\nincluding speech processing. However, SSL is computationally and memory\nexpensive. This is in part due to the quadratic complexity of multi-head\nself-attention (MHSA). Alternatives for MHSA have been proposed and used in the\nspeech domain, but have yet to be investigated properly in an SSL setting. In\nthis work, we study the effects of replacing MHSA with recent state-of-the-art\nalternatives that have linear complexity, namely, HyperMixing, Fastformer,\nSummaryMixing, and Mamba. We evaluate these methods by looking at the speed,\nthe amount of VRAM consumed, and the performance on the SSL MP3S benchmark.\nResults show that these linear alternatives maintain competitive performance\ncompared to MHSA while, on average, decreasing VRAM consumption by around 20%\nto 60% and increasing speed from 7% to 65% for input sequences ranging from 20\nto 80 seconds.\n","authors":["Ryan Whetten","Titouan Parcollet","Adel Moumen","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2409.02596v1.pdf","comment":"Accepted in the IEEE Spoken Language Technology Workshop 2024"},{"id":"http://arxiv.org/abs/2405.04296v2","updated":"2024-09-04T10:23:04Z","published":"2024-05-07T13:11:37Z","title":"Open Implementation and Study of BEST-RQ for Speech Processing","summary":" Self-Supervised Learning (SSL) has proven to be useful in various speech\ntasks. However, these methods are generally very demanding in terms of data,\nmemory, and computational resources. 
BERT-based Speech pre-Training with\nRandom-projection Quantizer (BEST-RQ), is an SSL method that has shown great\nperformance on Automatic Speech Recognition (ASR) while being simpler than\nother SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,\ndetails are lacking in the original paper, such as the amount of GPU/TPU hours\nused in pre-training, and there is no official easy-to-use open-source\nimplementation. Furthermore, BEST-RQ has not been evaluated on other downstream\ntasks aside from ASR and speech translation. In this work, we describe a\nre-implementation of a Random-projection quantizer and perform a preliminary\nstudy with a comparison to wav2vec 2.0 on four downstream tasks. We discuss the\ndetails and differences of our implementation. We show that a random projection\nquantizer can achieve similar downstream performance as wav2vec 2.0 while\ndecreasing training time by over a factor of two.\n","authors":["Ryan Whetten","Titouan Parcollet","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2405.04296v2.pdf","comment":"Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio,\n Speech and Beyond (SASB 2024)"},{"id":"http://arxiv.org/abs/2409.01227v2","updated":"2024-09-04T10:20:59Z","published":"2024-09-02T13:02:51Z","title":"Prompt Compression with Context-Aware Sentence Encoding for Fast and\n Improved LLM Inference","summary":" Large language models (LLMs) have triggered a new stream of research focusing\non compressing the context length to reduce the computational cost while\nensuring the retention of helpful information for LLMs to answer the given\nquestion. Token-based removal methods are one of the most prominent approaches\nin this direction, but risk losing the semantics of the context caused by\nintermediate token removal, especially under high compression ratios, while\nalso facing challenges in computational efficiency. In this work, we propose\ncontext-aware prompt compression (CPC), a sentence-level prompt compression\ntechnique where its key innovation is a novel context-aware sentence encoder\nthat provides a relevance score for each sentence for a given question. To\ntrain this encoder, we generate a new dataset consisting of questions,\npositives, and negative pairs where positives are sentences relevant to the\nquestion, while negatives are irrelevant context sentences. We train the\nencoder in a contrastive setup to learn context-aware sentence representations.\nOur method considerably outperforms prior works on prompt compression on\nbenchmark datasets and is up to 10.93x faster at inference compared to the best\ntoken-level compression method. We also find better improvement for shorter\nlength constraints in most benchmarks, showing the effectiveness of our\nproposed solution in the compression of relevant information in a shorter\ncontext. 
Finally, we release the code and the dataset for quick reproducibility\nand further development: https://github.com/Workday/cpc.\n","authors":["Barys Liskavets","Maxim Ushakov","Shuvendu Roy","Mark Klibanov","Ali Etemad","Shane Luke"],"pdf_url":"https://arxiv.org/pdf/2409.01227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06294v3","updated":"2024-09-04T10:16:57Z","published":"2023-05-10T16:31:35Z","title":"CADGE: Context-Aware Dialogue Generation Enhanced with Graph-Structured\n Knowledge Aggregation","summary":" Commonsense knowledge is crucial to many natural language processing tasks.\nExisting works usually incorporate graph knowledge with conventional graph\nneural networks (GNNs), leading to the text and graph knowledge encoding\nprocesses being separated in a serial pipeline. We argue that these separate\nrepresentation learning stages may be suboptimal for neural networks to learn\nthe overall context contained in both types of input knowledge. In this paper,\nwe propose a novel context-aware graph-attention model (Context-aware GAT),\nwhich can effectively incorporate global features of relevant knowledge graphs\nbased on a context-enhanced knowledge aggregation process. Specifically, our\nframework leverages a novel representation learning approach to process\nheterogeneous features - combining flattened graph knowledge with text. To the\nbest of our knowledge, this is the first attempt at hierarchically applying\ngraph knowledge aggregation on a connected subgraph in addition to contextual\ninformation to support commonsense dialogue generation. This framework shows\nsuperior performance compared to conventional GNN-based language frameworks.\nBoth automatic and human evaluation demonstrates that our proposed model has\nsignificant performance uplifts over state-of-the-art baselines.\n","authors":["Hongbo Zhang","Chen Tang","Tyler Loakman","Chenghua Lin","Stefan Goetze"],"pdf_url":"https://arxiv.org/pdf/2305.06294v3.pdf","comment":"Accepted by INLG 2024"},{"id":"http://arxiv.org/abs/2012.15079v2","updated":"2024-09-04T09:44:38Z","published":"2020-12-30T08:31:31Z","title":"Enhancing Sindhi Word Segmentation using Subword Representation Learning\n and Position-aware Self-attention","summary":" Sindhi word segmentation is a challenging task due to space omission and\ninsertion issues. The Sindhi language itself adds to this complexity. It's\ncursive and consists of characters with inherent joining and non-joining\nproperties, independent of word boundaries. Existing Sindhi word segmentation\nmethods rely on designing and combining hand-crafted features. However, these\nmethods have limitations, such as difficulty handling out-of-vocabulary words,\nlimited robustness for other languages, and inefficiency with large amounts of\nnoisy or raw text. Neural network-based models, in contrast, can automatically\ncapture word boundary information without requiring prior knowledge. In this\npaper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses\nword segmentation as a sequence labeling task. The SGNWS model incorporates\nsubword representation learning through a bidirectional long short-term memory\nencoder, position-aware self-attention, and a conditional random field. 
Our\nempirical results demonstrate that the SGNWS model achieves state-of-the-art\nperformance in Sindhi word segmentation on six datasets.\n","authors":["Wazir Ali","Jay Kumar","Saifullah Tumrani","Redhwan Nour","Adeeb Noor","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2012.15079v2.pdf","comment":"Journal Paper, 14 pages"},{"id":"http://arxiv.org/abs/2409.02569v1","updated":"2024-09-04T09:39:07Z","published":"2024-09-04T09:39:07Z","title":"More is More: Addition Bias in Large Language Models","summary":" In this paper, we investigate the presence of additive bias in Large Language\nModels (LLMs), drawing a parallel to the cognitive bias observed in humans\nwhere individuals tend to favor additive over subtractive changes. Using a\nseries of controlled experiments, we tested various LLMs, including GPT-3.5\nTurbo, Claude 3.5 Sonnet, Mistral, Math$\\Sigma$tral, and Llama 3.1, on tasks\ndesigned to measure their propensity for additive versus subtractive\nmodifications. Our findings demonstrate a significant preference for additive\nchanges across all tested models. For example, in a palindrome creation task,\nLlama 3.1 favored adding letters 97.85% of the time over removing them.\nSimilarly, in a Lego tower balancing task, GPT-3.5 Turbo chose to add a brick\n76.38% of the time rather than remove one. In a text summarization task,\nMistral 7B produced longer summaries in 59.40% to 75.10% of cases when asked to\nimprove its own or others' writing. These results indicate that, similar to\nhumans, LLMs exhibit a marked additive bias, which might have implications when\nLLMs are used on a large scale. Additive bias might increase resource use and\nenvironmental impact, leading to higher economic costs due to overconsumption\nand waste. This bias should be considered in the development and application of\nLLMs to ensure balanced and efficient problem-solving approaches.\n","authors":["Luca Santagata","Cristiano De Nobili"],"pdf_url":"https://arxiv.org/pdf/2409.02569v1.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.00109v2","updated":"2024-09-04T09:27:05Z","published":"2023-07-26T18:58:53Z","title":"A Sentence is Worth a Thousand Pictures: Can Large Language Models\n Understand Hum4n L4ngu4ge and the W0rld behind W0rds?","summary":" Modern Artificial Intelligence applications show great potential for\nlanguage-related tasks that rely on next-word prediction. The current\ngeneration of Large Language Models (LLMs) has been linked to claims about\nhuman-like linguistic performance and their applications are hailed both as a\nstep towards artificial general intelligence and as a major advance in\nunderstanding the cognitive, and even neural, basis of human language. To assess\nthese claims, first we analyze the contribution of LLMs as theoretically\ninformative representations of a target cognitive system vs. atheoretical\nmechanistic tools. Second, we evaluate the models' ability to see the bigger\npicture, through top-down feedback from higher levels of processing, which\nrequires grounding in previous expectations and past world experience. We\nhypothesize that since models lack grounded cognition, they cannot take\nadvantage of these features and instead solely rely on fixed associations\nbetween represented words and word vectors. To assess this, we designed and ran\na novel 'leet task' (l33t t4sk), which requires decoding sentences in which\nletters are systematically replaced by numbers. 
The results suggest that humans\nexcel in this task whereas models struggle, confirming our hypothesis. We\ninterpret the results by identifying the key abilities that are still missing\nfrom the current state of development of these models, which require solutions\nthat go beyond increased system scaling.\n","authors":["Evelina Leivada","Gary Marcus","Fritz Günther","Elliot Murphy"],"pdf_url":"https://arxiv.org/pdf/2308.00109v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09580v2","updated":"2024-09-04T09:20:04Z","published":"2022-12-19T16:13:52Z","title":"Exploring Interpretability of Independent Components of Word Embeddings\n with Automated Word Intruder Test","summary":" Independent Component Analysis (ICA) is an algorithm originally developed for\nfinding separate sources in a mixed signal, such as a recording of multiple\npeople in the same room speaking at the same time. Unlike Principal Component\nAnalysis (PCA), ICA permits the representation of a word as an unstructured set\nof features, without any particular feature being deemed more significant than\nthe others. In this paper, we used ICA to analyze word embeddings. We have\nfound that ICA can be used to find semantic features of the words, and these\nfeatures can easily be combined to search for words that satisfy the\ncombination. We show that most of the independent components represent such\nfeatures. To quantify the interpretability of the components, we use the word\nintruder test, performed both by humans and by large language models. We\npropose to use the automated version of the word intruder test as a fast and\ninexpensive way of quantifying vector interpretability without the need for\nhuman effort.\n","authors":["Tomáš Musil","David Mareček"],"pdf_url":"https://arxiv.org/pdf/2212.09580v2.pdf","comment":"Presented at LREC-COLING 2024, cite this version please:\n https://aclanthology.org/2024.lrec-main.605/"},{"id":"http://arxiv.org/abs/2409.02519v1","updated":"2024-09-04T08:27:43Z","published":"2024-09-04T08:27:43Z","title":"Language is Scary when Over-Analyzed: Unpacking Implied Misogynistic\n Reasoning with Argumentation Theory-Driven Prompts","summary":" We propose misogyny detection as an Argumentative Reasoning task and we\ninvestigate the capacity of large language models (LLMs) to understand the\nimplicit reasoning used to convey misogyny in both Italian and English. The\ncentral aim is to generate the missing reasoning link between a message and the\nimplied meanings encoding the misogyny. Our study uses argumentation theory as\na foundation to form a collection of prompts in both zero-shot and few-shot\nsettings. These prompts integrate different techniques, including\nchain-of-thought reasoning and augmented knowledge. 
Our findings show that LLMs\nfall short on reasoning capabilities about misogynistic comments and that they\nmostly rely on their implicit knowledge derived from internalized common\nstereotypes about women to generate implied assumptions, rather than on\ninductive reasoning.\n","authors":["Arianna Muti","Federico Ruggeri","Khalid Al-Khatib","Alberto Barrón-Cedeño","Tommaso Caselli"],"pdf_url":"https://arxiv.org/pdf/2409.02519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00084v2","updated":"2024-09-04T08:22:28Z","published":"2024-08-25T14:50:47Z","title":"Vision-Language and Large Language Model Performance in\n Gastroenterology: GPT, Claude, Llama, Phi, Mistral, Gemma, and Quantized\n Models","summary":" Background and Aims: This study evaluates the medical reasoning performance\nof large language models (LLMs) and vision language models (VLMs) in\ngastroenterology.\n Methods: We used 300 gastroenterology board exam-style multiple-choice\nquestions, 138 of which contain images to systematically assess the impact of\nmodel configurations and parameters and prompt engineering strategies utilizing\nGPT-3.5. Next, we assessed the performance of proprietary and open-source LLMs\n(versions), including GPT (3.5, 4, 4o, 4omini), Claude (3, 3.5), Gemini (1.0),\nMistral, Llama (2, 3, 3.1), Mixtral, and Phi (3), across different interfaces\n(web and API), computing environments (cloud and local), and model precisions\n(with and without quantization). Finally, we assessed accuracy using a\nsemiautomated pipeline.\n Results: Among the proprietary models, GPT-4o (73.7%) and Claude3.5-Sonnet\n(74.0%) achieved the highest accuracy, outperforming the top open-source\nmodels: Llama3.1-405b (64%), Llama3.1-70b (58.3%), and Mixtral-8x7b (54.3%).\nAmong the quantized open-source models, the 6-bit quantized Phi3-14b (48.7%)\nperformed best. The scores of the quantized models were comparable to those of\nthe full-precision models Llama2-7b, Llama2--13b, and Gemma2-9b. Notably, VLM\nperformance on image-containing questions did not improve when the images were\nprovided and worsened when LLM-generated captions were provided. In contrast, a\n10% increase in accuracy was observed when images were accompanied by\nhuman-crafted image descriptions.\n Conclusion: In conclusion, while LLMs exhibit robust zero-shot performance in\nmedical reasoning, the integration of visual data remains a challenge for VLMs.\nEffective deployment involves carefully determining optimal model\nconfigurations, encouraging users to consider either the high performance of\nproprietary models or the flexible adaptability of open-source models.\n","authors":["Seyed Amir Ahmad Safavi-Naini","Shuhaib Ali","Omer Shahab","Zahra Shahhoseini","Thomas Savage","Sara Rafiee","Jamil S Samaan","Reem Al Shabeeb","Farah Ladak","Jamie O Yang","Juan Echavarria","Sumbal Babar","Aasma Shaukat","Samuel Margolis","Nicholas P Tatonetti","Girish Nadkarni","Bara El Kurdi","Ali Soroush"],"pdf_url":"https://arxiv.org/pdf/2409.00084v2.pdf","comment":"Manuscript Pages: 34, Figures: 7, Tables: 2, Supplementary File\n Pages: 35, Data Transparency Statement: Code is available at:\n https://github.com/Sdamirsa/LLM-VLM-in-Gastroenterology . Study data from\n American College of Gastroenterology (ACG) are restricted and available upon\n request with ACG permission. 
Correction: updated abstract considering\n Llama3.1 results"},{"id":"http://arxiv.org/abs/2409.02481v1","updated":"2024-09-04T07:13:30Z","published":"2024-09-04T07:13:30Z","title":"Word and Phrase Features in Graph Convolutional Network for Automatic\n Question Classification","summary":" Effective question classification is crucial for AI-driven educational tools,\nenabling adaptive learning systems to categorize questions by skill area,\ndifficulty level, and competence. This classification not only supports\neducational diagnostics and analytics but also enhances complex tasks like\ninformation retrieval and question answering by associating questions with\nrelevant categories. Traditional methods, often based on word embeddings and\nconventional classifiers, struggle to capture the nuanced relationships in\nnatural language, leading to suboptimal performance. To address this, we\npropose a novel approach leveraging graph convolutional networks (GCNs), named\nPhrase Question-Graph Convolutional Network (PQ-GCN) to better model the\ninherent structure of questions. By representing questions as graphs -- where\nnodes signify words or phrases and edges denote syntactic or semantic\nrelationships -- our method allows GCNs to learn from the interconnected nature\nof language more effectively. Additionally, we explore the incorporation of\nphrase-based features to enhance classification accuracy, especially in\nlow-resource settings. Our findings demonstrate that GCNs, augmented with these\nfeatures, offer a promising solution for more accurate and context-aware\nquestion classification, bridging the gap between graph neural network research\nand practical educational applications.\n","authors":["Junyoung Lee","Ninad Dixit","Kaustav Chakrabarti","S. Supraja"],"pdf_url":"https://arxiv.org/pdf/2409.02481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02474v1","updated":"2024-09-04T06:46:31Z","published":"2024-09-04T06:46:31Z","title":"A Comparative Study on Large Language Models for Log Parsing","summary":" Background: Log messages provide valuable information about the status of\nsoftware systems. This information is provided in an unstructured fashion and\nautomated approaches are applied to extract relevant parameters. To ease this\nprocess, log parsing can be applied, which transforms log messages into\nstructured log templates. Recent advances in language models have led to\nseveral studies that apply ChatGPT to the task of log parsing with promising\nresults. However, the performance of other state-of-the-art large language\nmodels (LLMs) on the log parsing task remains unclear.\n Aims: In this study, we investigate the current capability of\nstate-of-the-art LLMs to perform log parsing.\n Method: We select six recent LLMs, including both paid proprietary (GPT-3.5,\nClaude 2.1) and four free-to-use open models, and compare their performance on\nsystem logs obtained from a selection of mature open-source projects. We design\ntwo different prompting approaches and apply the LLMs on 1, 354 log templates\nacross 16 different projects. We evaluate their effectiveness, in the number of\ncorrectly identified templates, and the syntactic similarity between the\ngenerated templates and the ground truth.\n Results: We found that free-to-use models are able to compete with paid\nmodels, with CodeLlama extracting 10% more log templates correctly than\nGPT-3.5. 
Moreover, we provide qualitative insights into the usability of\nlanguage models (e.g., how easy it is to use their responses).\n Conclusions: Our results reveal that some of the smaller, free-to-use LLMs\ncan considerably assist log parsing compared to their paid proprietary\ncompetitors, especially code-specialized models.\n","authors":["Merve Astekin","Max Hort","Leon Moonen"],"pdf_url":"https://arxiv.org/pdf/2409.02474v1.pdf","comment":"Accepted for publication in the 18th ACM/IEEE International Symposium\n on Empirical Software Engineering and Measurement (ESEM '24)"},{"id":"http://arxiv.org/abs/2409.02465v1","updated":"2024-09-04T06:28:22Z","published":"2024-09-04T06:28:22Z","title":"DetectiveQA: Evaluating Long-Context Reasoning on Detective Novels","summary":" With the rapid advancement of Large Language Models (LLMs), long-context\ninformation understanding and processing have become a hot topic in academia\nand industry. However, benchmarks for evaluating the ability of LLMs to handle\nlong-context information do not seem to have kept pace with the development of\nLLMs. Despite the emergence of various long-context evaluation benchmarks, the\ntypes of capability assessed are still limited, without new capability\ndimensions. In this paper, we introduce DetectiveQA, a narrative reasoning\nbenchmark featuring an average context length of over 100K tokens.\nDetectiveQA focuses on evaluating the long-context reasoning ability of LLMs,\nwhich not only requires a full understanding of context but also requires\nextracting important evidence from the context and reasoning according to the\nextracted evidence to answer the given questions. This is a new dimension of\ncapability evaluation, which is more in line with the current intelligence\nlevel of LLMs. We use detective novels as data sources, which naturally have\nvarious reasoning elements. Finally, we manually annotated 600 questions in\nChinese and then also provided an English edition of the context information\nand questions. We evaluate many long-context LLMs on DetectiveQA, including\ncommercial and open-sourced models, and the results indicate that existing\nlong-context LLMs still require significant advancements to effectively process\ntrue long-context dependency questions.\n","authors":["Zhe Xu","Jiasheng Ye","Xiangyang Liu","Tianxiang Sun","Xiaoran Liu","Qipeng Guo","Linlin Li","Qun Liu","Xuanjing Huang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2409.02465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15412v5","updated":"2024-09-04T05:12:54Z","published":"2024-03-05T08:29:36Z","title":"Towards Measuring and Modeling \"Culture\" in LLMs: A Survey","summary":" We present a survey of more than 90 recent papers that aim to study cultural\nrepresentation and inclusion in large language models (LLMs). We observe that\nnone of the studies explicitly define \"culture\", which is a complex,\nmultifaceted concept; instead, they probe the models on some specially designed\ndatasets which represent certain aspects of \"culture\". We call these aspects\nthe proxies of culture, and organize them across two dimensions of demographic\nand semantic proxies. We also categorize the probing methods employed. Our\nanalysis indicates that only certain aspects of ``culture,'' such as values and\nobjectives, have been studied, leaving several other interesting and important\nfacets, especially the multitude of semantic domains (Thompson et al., 2020)\nand aboutness (Hershcovich et al., 2022), unexplored. 
Two other crucial gaps\nare the lack of robustness of probing techniques and situated studies on the\nimpact of cultural mis- and under-representation in LLM-based applications.\n","authors":["Muhammad Farid Adilazuarda","Sagnik Mukherjee","Pradhyumna Lavania","Siddhant Singh","Alham Fikri Aji","Jacki O'Neill","Ashutosh Modi","Monojit Choudhury"],"pdf_url":"https://arxiv.org/pdf/2403.15412v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16672v3","updated":"2024-09-04T05:09:00Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. In\nthis paper, we introduce a novel architecture and a training framework to\nsupport long context windows and multilingual retrieval. Our new model,\nJina-ColBERT-v2, demonstrates strong performance across a range of English and\nmultilingual retrieval tasks.\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Georgios Mastrapas","Saba Sturua","Isabelle Mohr","Andreas Koukounas","Mohammad Kalim Akram","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v3.pdf","comment":"8 pages, references at pp7,8; EMNLP workshop submission"},{"id":"http://arxiv.org/abs/2409.02449v1","updated":"2024-09-04T05:08:23Z","published":"2024-09-04T05:08:23Z","title":"What is lost in Normalization? Exploring Pitfalls in Multilingual ASR\n Model Evaluations","summary":" This paper explores the pitfalls in evaluating multilingual automatic speech\nrecognition (ASR) models, with a particular focus on Indic language scripts. We\ninvestigate the text normalization routine employed by leading ASR models,\nincluding OpenAI Whisper, Meta's MMS, Seamless, and Assembly AI's Conformer,\nand their unintended consequences on performance metrics. Our research reveals\nthat current text normalization practices, while aiming to standardize ASR\noutputs for fair comparison by removing inconsistencies such as variations in\nspelling, punctuation, and special characters, are fundamentally flawed when\napplied to Indic scripts. Through empirical analysis using text similarity\nscores and in-depth linguistic examination, we demonstrate that these flaws\nlead to artificially inflated performance metrics for Indic languages. We\nconclude by proposing a shift towards developing normalization routines that\nleverage native linguistic expertise, ensuring more robust and accurate\nevaluations of multilingual ASR models.\n","authors":["Kavya Manohar","Leena G Pillai"],"pdf_url":"https://arxiv.org/pdf/2409.02449v1.pdf","comment":"Submitted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.02428v1","updated":"2024-09-04T04:15:14Z","published":"2024-09-04T04:15:14Z","title":"Large Language Models as Efficient Reward Function Searchers for\n Custom-Environment Multi-Objective Reinforcement Learning","summary":" Leveraging large language models (LLMs) for designing reward functions\ndemonstrates significant potential. However, achieving effective design and\nimprovement of reward functions in reinforcement learning (RL) tasks with\ncomplex custom environments and multiple requirements presents considerable\nchallenges. 
In this paper, we enable LLMs to be effective white-box searchers,\nhighlighting their advanced semantic understanding capabilities. Specifically,\nwe generate reward components for each explicit user requirement and employ the\nreward critic to identify the correct code form. Then, LLMs assign weights to\nthe reward components to balance their values and iteratively search and\noptimize these weights based on the context provided by the training log\nanalyzer, while adaptively determining the search step size. We applied the\nframework to an underwater information collection RL task without direct human\nfeedback or reward examples (zero-shot). The reward critic successfully corrects\nthe reward code with only one round of feedback for each requirement, effectively\npreventing irreparable errors that can occur when reward function feedback is\nprovided in aggregate. The effective initialization of weights enables the\nacquisition of different reward functions within the Pareto solution set\nwithout weight search. Even in the case where a weight is 100 times off, fewer\nthan four iterations are needed to obtain solutions that meet user\nrequirements. The framework also works well with most prompts utilizing GPT-3.5\nTurbo, since it does not require advanced numerical understanding or\ncalculation.\n","authors":["Guanwen Xie","Jingzehua Xu","Yiyuan Yang","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00369v2","updated":"2024-09-04T03:50:38Z","published":"2024-08-31T07:10:16Z","title":"An Empirical Study on Information Extraction using Large Language Models","summary":" Human-like large language models (LLMs), especially the most powerful and\npopular ones in OpenAI's GPT family, have proven to be very helpful for many\nnatural language processing (NLP) related tasks. Therefore, various attempts\nhave been made to apply LLMs to information extraction (IE), which is a\nfundamental NLP task that involves extracting information from unstructured\nplain text. To demonstrate the latest representative progress in LLMs'\ninformation extraction ability, we assess the information extraction ability of\nGPT-4 (the latest version of GPT at the time of writing this paper) from four\nperspectives: Performance, Evaluation Criteria, Robustness, and Error Types.\nOur results suggest a visible performance gap between GPT-4 and\nstate-of-the-art (SOTA) IE methods. To alleviate this problem, considering the\nLLMs' human-like characteristics, we propose and analyze the effects of a\nseries of simple prompt-based methods, which can be generalized to other LLMs\nand NLP tasks. Rich experiments show our methods' effectiveness and some of\ntheir remaining issues in improving GPT-4's information extraction ability.\n","authors":["Ridong Han","Chaohao Yang","Tao Peng","Prayag Tiwari","Xiang Wan","Lu Liu","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00369v2.pdf","comment":"This article has an original arxiv version entitled \"Is Information\n Extraction Solved by ChatGPT? 
An Analysis of Performance, Evaluation\n Criteria, Robustness and Errors\", whose url link is arXiv/2305.14450"},{"id":"http://arxiv.org/abs/2409.02413v1","updated":"2024-09-04T03:39:23Z","published":"2024-09-04T03:39:23Z","title":"Abstractive Text Summarization: State of the Art, Challenges, and\n Improvements","summary":" Specifically focusing on the landscape of abstractive text summarization, as\nopposed to extractive techniques, this survey presents a comprehensive\noverview, delving into state-of-the-art techniques, prevailing challenges, and\nprospective research directions. We categorize the techniques into traditional\nsequence-to-sequence models, pre-trained large language models, reinforcement\nlearning, hierarchical methods, and multi-modal summarization. Unlike prior\nworks that did not examine complexities, scalability and comparisons of\ntechniques in detail, this review takes a comprehensive approach encompassing\nstate-of-the-art methods, challenges, solutions, comparisons, limitations and\ncharts out future improvements - providing researchers an extensive overview to\nadvance abstractive summarization research. We provide vital comparison tables\nacross techniques categorized - offering insights into model complexity,\nscalability and appropriate applications. The paper highlights challenges such\nas inadequate meaning representation, factual consistency, controllable text\nsummarization, cross-lingual summarization, and evaluation metrics, among\nothers. Solutions leveraging knowledge incorporation and other innovative\nstrategies are proposed to address these challenges. The paper concludes by\nhighlighting emerging research areas like factual inconsistency,\ndomain-specific, cross-lingual, multilingual, and long-document summarization,\nas well as handling noisy data. Our objective is to provide researchers and\npractitioners with a structured overview of the domain, enabling them to better\nunderstand the current landscape and identify potential areas for further\nresearch and improvement.\n","authors":["Hassan Shakil","Ahmad Farooq","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2409.02413v1.pdf","comment":"9 Tables, 7 Figures"},{"id":"http://arxiv.org/abs/2409.00128v2","updated":"2024-09-04T03:21:07Z","published":"2024-08-29T05:18:50Z","title":"Can AI Replace Human Subjects? A Large-Scale Replication of\n Psychological Experiments with LLMs","summary":" Artificial Intelligence (AI) is increasingly being integrated into scientific\nresearch, particularly in the social sciences, where understanding human\nbehavior is critical. Large Language Models (LLMs) like GPT-4 have shown\npromise in replicating human-like responses in various psychological\nexperiments. However, the extent to which LLMs can effectively replace human\nsubjects across diverse experimental contexts remains unclear. Here, we conduct\na large-scale study replicating 154 psychological experiments from top social\nscience journals with 618 main effects and 138 interaction effects using GPT-4\nas a simulated participant. We find that GPT-4 successfully replicates 76.0\npercent of main effects and 47.0 percent of interaction effects observed in the\noriginal studies, closely mirroring human responses in both direction and\nsignificance. However, only 19.44 percent of GPT-4's replicated confidence\nintervals contain the original effect sizes, with the majority of replicated\neffect sizes exceeding the 95 percent confidence interval of the original\nstudies. 
Additionally, there is a 71.6 percent rate of unexpected significant\nresults where the original studies reported null findings, suggesting potential\noverestimation or false positives. Our results demonstrate the potential of\nLLMs as powerful tools in psychological research but also emphasize the need\nfor caution in interpreting AI-driven findings. While LLMs can complement human\nstudies, they cannot yet fully replace the nuanced insights provided by human\nsubjects.\n","authors":["Ziyan Cui","Ning Li","Huaikang Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.00128v2.pdf","comment":"5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.02393v1","updated":"2024-09-04T02:41:44Z","published":"2024-09-04T02:41:44Z","title":"Determination of language families using deep learning","summary":" We use a c-GAN (convolutional generative adversarial) neural network to\nanalyze transliterated text fragments of extant, dead comprehensible, and one\ndead non-deciphered (Cypro-Minoan) language to establish linguistic affinities.\nThe paper is agnostic with respect to translation and/or deciphering. However,\nthere is hope that the proposed approach can be useful for decipherment with\nmore sophisticated neural network techniques.\n","authors":["Peter B. Lerner"],"pdf_url":"https://arxiv.org/pdf/2409.02393v1.pdf","comment":"First draft. Comments are welcome"},{"id":"http://arxiv.org/abs/2409.02387v1","updated":"2024-09-04T02:30:12Z","published":"2024-09-04T02:30:12Z","title":"Large Language Models and Cognitive Science: A Comprehensive Review of\n Similarities, Differences, and Challenges","summary":" This comprehensive review explores the intersection of Large Language Models\n(LLMs) and cognitive science, examining similarities and differences between\nLLMs and human cognitive processes. We analyze methods for evaluating LLMs\ncognitive abilities and discuss their potential as cognitive models. The review\ncovers applications of LLMs in various cognitive fields, highlighting insights\ngained for cognitive science research. We assess cognitive biases and\nlimitations of LLMs, along with proposed methods for improving their\nperformance. The integration of LLMs with cognitive architectures is examined,\nrevealing promising avenues for enhancing artificial intelligence (AI)\ncapabilities. Key challenges and future research directions are identified,\nemphasizing the need for continued refinement of LLMs to better align with\nhuman cognition. This review provides a balanced perspective on the current\nstate and future potential of LLMs in advancing our understanding of both\nartificial and human intelligence.\n","authors":["Qian Niu","Junyu Liu","Ziqian Bi","Pohsun Feng","Benji Peng","Keyu Chen"],"pdf_url":"https://arxiv.org/pdf/2409.02387v1.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.16586v2","updated":"2024-09-04T02:24:08Z","published":"2024-08-29T14:49:13Z","title":"Enhancing Dialogue Generation in Werewolf Game Through Situation\n Analysis and Persuasion Strategies","summary":" Recent advancements in natural language processing, particularly with large\nlanguage models (LLMs) like GPT-4, have significantly enhanced dialogue\nsystems, enabling them to generate more natural and fluent conversations.\nDespite these improvements, challenges persist, such as managing continuous\ndialogues, memory retention, and minimizing hallucinations. 
The AIWolfDial2024\naddresses these challenges by employing the Werewolf Game, an incomplete\ninformation game, to test the capabilities of LLMs in complex interactive\nenvironments. This paper introduces a LLM-based Werewolf Game AI, where each\nrole is supported by situation analysis to aid response generation.\nAdditionally, for the werewolf role, various persuasion strategies, including\nlogical appeal, credibility appeal, and emotional appeal, are employed to\neffectively persuade other players to align with its actions.\n","authors":["Zhiyang Qi","Michimasa Inaba"],"pdf_url":"https://arxiv.org/pdf/2408.16586v2.pdf","comment":"Accepted to the AIWolfDial2024 workshop at INLG 2024"},{"id":"http://arxiv.org/abs/2409.02384v1","updated":"2024-09-04T02:20:59Z","published":"2024-09-04T02:20:59Z","title":"STAB: Speech Tokenizer Assessment Benchmark","summary":" Representing speech as discrete tokens provides a framework for transforming\nspeech into a format that closely resembles text, thus enabling the use of\nspeech as an input to the widely successful large language models (LLMs).\nCurrently, while several speech tokenizers have been proposed, there is\nambiguity regarding the properties that are desired from a tokenizer for\nspecific downstream tasks and its overall generalizability. Evaluating the\nperformance of tokenizers across different downstream tasks is a\ncomputationally intensive effort that poses challenges for scalability. To\ncircumvent this requirement, we present STAB (Speech Tokenizer Assessment\nBenchmark), a systematic evaluation framework designed to assess speech\ntokenizers comprehensively and shed light on their inherent characteristics.\nThis framework provides a deeper understanding of the underlying mechanisms of\nspeech tokenization, thereby offering a valuable resource for expediting the\nadvancement of future tokenizer models and enabling comparative analysis using\na standardized benchmark. We evaluate the STAB metrics and correlate this with\ndownstream task performance across a range of speech tasks and tokenizer\nchoices.\n","authors":["Shikhar Vashishth","Harman Singh","Shikhar Bharadwaj","Sriram Ganapathy","Chulayuth Asawaroengchai","Kartik Audhkhasi","Andrew Rosenberg","Ankur Bapna","Bhuvana Ramabhadran"],"pdf_url":"https://arxiv.org/pdf/2409.02384v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.04298v2","updated":"2024-09-04T02:00:58Z","published":"2024-04-04T20:27:37Z","title":"SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated\n Responses","summary":" Can LLMs consistently improve their previous outputs for better results? For\nthis to be true, LLMs would need to be better at discriminating among\npreviously-generated alternatives, than generating initial responses. We\nexplore the validity of this hypothesis in practice. We first formulate a\nunified framework that allows us to compare the generative and discriminative\ncapability of any model on any task. In our resulting experimental analysis of\nseveral open-source and industrial LLMs, we observe that models are not\nreliably better at discriminating among previously-generated alternatives than\ngenerating initial responses. 
This finding challenges the notion that LLMs may\nbe able to enhance their performance only through their own judgment.\n","authors":["Dongwei Jiang","Jingyu Zhang","Orion Weller","Nathaniel Weir","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.04298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02375v1","updated":"2024-09-04T01:51:37Z","published":"2024-09-04T01:51:37Z","title":"How Privacy-Savvy Are Large Language Models? A Case Study on Compliance\n and Privacy Technical Review","summary":" The recent advances in large language models (LLMs) have significantly\nexpanded their applications across various fields such as language generation,\nsummarization, and complex question answering. However, their application to\nprivacy compliance and technical privacy reviews remains under-explored,\nraising critical concerns about their ability to adhere to global privacy\nstandards and protect sensitive user data. This paper seeks to address this gap\nby providing a comprehensive case study evaluating LLMs' performance in\nprivacy-related tasks such as privacy information extraction (PIE), legal and\nregulatory key point detection (KPD), and question answering (QA) with respect\nto privacy policies and data protection regulations. We introduce a Privacy\nTechnical Review (PTR) framework, highlighting its role in mitigating privacy\nrisks during the software development life-cycle. Through an empirical\nassessment, we investigate the capacity of several prominent LLMs, including\nBERT, GPT-3.5, GPT-4, and custom models, in executing privacy compliance checks\nand technical privacy reviews. Our experiments benchmark the models across\nmultiple dimensions, focusing on their precision, recall, and F1-scores in\nextracting privacy-sensitive information and detecting key regulatory\ncompliance points. While LLMs show promise in automating privacy reviews and\nidentifying regulatory discrepancies, significant gaps persist in their ability\nto fully comply with evolving legal standards. We provide actionable\nrecommendations for enhancing LLMs' capabilities in privacy compliance,\nemphasizing the need for robust model improvements and better integration with\nlegal and regulatory requirements. This study underscores the growing\nimportance of developing privacy-aware LLMs that can both support businesses in\ncompliance efforts and safeguard user privacy rights.\n","authors":["Xichou Zhu","Yang Liu","Zhou Shen","Yi Liu","Min Li","Yujun Chen","Benzi John","Zhenzhen Ma","Tao Hu","Bolong Yang","Manman Wang","Zongxing Xie","Peng Liu","Dan Cai","Junhui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02375v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.02370v1","updated":"2024-09-04T01:40:20Z","published":"2024-09-04T01:40:20Z","title":"Do Large Language Models Possess Sensitive to Sentiment?","summary":" Large Language Models (LLMs) have recently displayed their extraordinary\ncapabilities in language understanding. However, how to comprehensively assess\nthe sentiment capabilities of LLMs continues to be a challenge. This paper\ninvestigates the ability of LLMs to detect and react to sentiment in text\nmodal. As the integration of LLMs into diverse applications is on the rise, it\nbecomes highly critical to comprehend their sensitivity to emotional tone, as\nit can influence the user experience and the efficacy of sentiment-driven\ntasks. 
We conduct a series of experiments to evaluate the performance of\nseveral prominent LLMs in identifying and responding appropriately to\nsentiments like positive, negative, and neutral emotions. The models' outputs\nare analyzed across various sentiment benchmarks, and their responses are\ncompared with human evaluations. Our discoveries indicate that although LLMs\nshow a basic sensitivity to sentiment, there are substantial variations in\ntheir accuracy and consistency, emphasizing the requirement for further\nenhancements in their training processes to better capture subtle emotional\ncues. For example, in some cases, the models might wrongly\nclassify a strongly positive sentiment as neutral, or fail to recognize sarcasm\nor irony in the text. Such misclassifications highlight the complexity of\nsentiment analysis and the areas where the models need to be refined. Another\naspect is that different LLMs might perform differently on the same set of\ndata, depending on their architecture and training datasets. This variance\ncalls for a more in-depth study of the factors that contribute to the\nperformance differences and how they can be optimized.\n","authors":["Yang Liu","Xichou Zhu","Zhou Shen","Yi Liu","Min Li","Yujun Chen","Benzi John","Zhenzhen Ma","Tao Hu","Zhiyang Xu","Wei Luo","Junhui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02370v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.02361v1","updated":"2024-09-04T01:14:04Z","published":"2024-09-04T01:14:04Z","title":"Diversify-verify-adapt: Efficient and Robust Retrieval-Augmented\n Ambiguous Question Answering","summary":" The retrieval augmented generation (RAG) framework addresses an ambiguity in\nuser queries in QA systems by retrieving passages that cover all plausible\ninterpretations and generating comprehensive responses based on the passages.\nHowever, our preliminary studies reveal that a single retrieval process often\nsuffers from low quality results, as the retrieved passages frequently fail to\ncapture all plausible interpretations. Although the iterative RAG approach has\nbeen proposed to address this problem, it comes at the cost of significantly\nreduced efficiency. To address these issues, we propose the\ndiversify-verify-adapt (DIVA) framework. DIVA first diversifies the retrieved\npassages to encompass diverse interpretations. Subsequently, DIVA verifies the\nquality of the passages and adapts the most suitable approach tailored to their\nquality. This approach improves the QA system's accuracy and robustness by\nhandling the low-quality retrieval issue in ambiguous questions, while enhancing\nefficiency.\n","authors":["Yeonjun In","Sungchul Kim","Ryan A. Rossi","Md Mehrab Tanjim","Tong Yu","Ritwik Sinha","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2409.02361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15221v2","updated":"2024-09-04T00:58:59Z","published":"2024-08-27T17:33:30Z","title":"LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet","summary":" Recent large language model (LLM) defenses have greatly improved models'\nability to refuse harmful queries, even when adversarially attacked. However,\nLLM defenses are primarily evaluated against automated adversarial attacks in a\nsingle turn of conversation, an insufficient threat model for real-world\nmalicious use. 
We demonstrate that multi-turn human jailbreaks uncover\nsignificant vulnerabilities, exceeding 70% attack success rate (ASR) on\nHarmBench against defenses that report single-digit ASRs with automated\nsingle-turn attacks. Human jailbreaks also reveal vulnerabilities in machine\nunlearning defenses, successfully recovering dual-use biosecurity knowledge\nfrom unlearned models. We compile these results into Multi-Turn Human\nJailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks.\nWe publicly release MHJ alongside a compendium of jailbreak tactics developed\nacross dozens of commercial red teaming engagements, supporting research\ntowards stronger LLM defenses.\n","authors":["Nathaniel Li","Ziwen Han","Ian Steneker","Willow Primack","Riley Goodside","Hugh Zhang","Zifan Wang","Cristina Menghini","Summer Yue"],"pdf_url":"https://arxiv.org/pdf/2408.15221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06266v4","updated":"2024-09-04T00:22:45Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02343v1","updated":"2024-09-04T00:10:36Z","published":"2024-09-04T00:10:36Z","title":"NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for\n Retrieval","summary":" $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval)\nfrom pre-trained embedding models is the predominant retrieval method for text\nand images, as well as Retrieval-Augmented Generation (RAG) pipelines. In\npractice, application developers often fine-tune the embeddings to improve\ntheir accuracy on the dataset and query workload in hand. Existing approaches\neither fine-tune the pre-trained model itself or, more efficiently, but at the\ncost of accuracy, train adaptor models to transform the output of the\npre-trained model. 
We present NUDGE, a family of novel non-parametric embedding\nfine-tuning approaches that are significantly more accurate and efficient than\nboth sets of existing approaches. NUDGE directly modifies the embeddings of\ndata records to maximize the accuracy of $k$-NN retrieval. We present a\nthorough theoretical and experimental study of NUDGE's non-parametric approach.\nWe show that even though the underlying problem is NP-Hard, constrained\nvariations can be solved efficiently. These constraints additionally ensure\nthat the changes to the embeddings are modest, avoiding large distortions to\nthe semantics learned during pre-training. In experiments across five\npre-trained models and nine standard text and image retrieval datasets, NUDGE\nruns in minutes and often improves NDCG@10 by more than 10% over existing\nfine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase\nin accuracy and runs 200x and 3x faster, respectively, over fine-tuning the\npre-trained model and training adaptors.\n","authors":["Sepanta Zeighami","Zac Wellmer","Aditya Parameswaran"],"pdf_url":"https://arxiv.org/pdf/2409.02343v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.02919v1","updated":"2024-09-04T17:58:08Z","published":"2024-09-04T17:58:08Z","title":"HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical\n MLLM Prompts","summary":" The potential for higher-resolution image generation using pretrained\ndiffusion models is immense, yet these models often struggle with issues of\nobject repetition and structural artifacts, especially when scaling to 4K\nresolution and higher. We find that the problem is caused by the fact that a\nsingle prompt for the generation of multiple scales provides insufficient\nefficacy. In response, we propose HiPrompt, a new tuning-free solution that\ntackles the above problems by introducing hierarchical prompts. The\nhierarchical prompts offer both global and local guidance. Specifically, the\nglobal guidance comes from the user input that describes the overall content,\nwhile the local guidance utilizes patch-wise descriptions from MLLMs to\nelaborately guide the regional structure and texture generation. Furthermore,\nduring the inverse denoising process, the generated noise is decomposed into\nlow- and high-frequency spatial components. These components are conditioned on\nmultiple prompt levels, including detailed patch-wise descriptions and broader\nimage-level prompts, facilitating prompt-guided denoising under hierarchical\nsemantic guidance. It further allows the generation to focus more on local\nspatial regions and ensures the generated images maintain coherent local and\nglobal semantics, structures, and textures with high definition. 
Extensive\nexperiments demonstrate that HiPrompt outperforms state-of-the-art works in\nhigher-resolution image generation, significantly reducing object repetition\nand enhancing structural quality.\n","authors":["Xinyu Liu","Yingqing He","Lanqing Guo","Xiang Li","Bu Jin","Peng Li","Yan Li","Chi-Min Chan","Qifeng Chen","Wei Xue","Wenhan Luo","Qingfeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2409.02919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02917v1","updated":"2024-09-04T17:53:42Z","published":"2024-09-04T17:53:42Z","title":"UC-NeRF: Uncertainty-aware Conditional Neural Radiance Fields from\n Endoscopic Sparse Views","summary":" Visualizing surgical scenes is crucial for revealing internal anatomical\nstructures during minimally invasive procedures. Novel View Synthesis is a\nvital technique that offers geometry and appearance reconstruction, enhancing\nunderstanding, planning, and decision-making in surgical scenes. Despite the\nimpressive achievements of Neural Radiance Field (NeRF), its direct application\nto surgical scenes produces unsatisfying results due to two challenges:\nendoscopic sparse views and significant photometric inconsistencies. In this\npaper, we propose uncertainty-aware conditional NeRF for novel view synthesis\nto tackle the severe shape-radiance ambiguity from sparse surgical views. The\ncore of UC-NeRF is to incorporate the multi-view uncertainty estimation to\ncondition the neural radiance field for modeling the severe photometric\ninconsistencies adaptively. Specifically, our UC-NeRF first builds a\nconsistency learner in the form of multi-view stereo network, to establish the\ngeometric correspondence from sparse views and generate uncertainty estimation\nand feature priors. In neural rendering, we design a base-adaptive NeRF network\nto exploit the uncertainty estimation for explicitly handling the photometric\ninconsistencies. Furthermore, an uncertainty-guided geometry distillation is\nemployed to enhance geometry learning. Experiments on the SCARED and Hamlyn\ndatasets demonstrate our superior performance in rendering appearance and\ngeometry, consistently outperforming the current state-of-the-art approaches.\nOur code will be released at \\url{https://github.com/wrld/UC-NeRF}.\n","authors":["Jiaxin Guo","Jiangliu Wang","Ruofeng Wei","Di Kang","Qi Dou","Yun-hui Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02914v1","updated":"2024-09-04T17:52:43Z","published":"2024-09-04T17:52:43Z","title":"Can LVLMs Obtain a Driver's License? A Benchmark Towards Reliable AGI\n for Autonomous Driving","summary":" Large Vision-Language Models (LVLMs) have recently garnered significant\nattention, with many efforts aimed at harnessing their general knowledge to\nenhance the interpretability and robustness of autonomous driving models.\nHowever, LVLMs typically rely on large, general-purpose datasets and lack the\nspecialized expertise required for professional and safe driving. Existing\nvision-language driving datasets focus primarily on scene understanding and\ndecision-making, without providing explicit guidance on traffic rules and\ndriving skills, which are critical aspects directly related to driving safety.\nTo bridge this gap, we propose IDKB, a large-scale dataset containing over one\nmillion data items collected from various countries, including driving\nhandbooks, theory test data, and simulated road test data. 
Much like the\nprocess of obtaining a driver's license, IDKB encompasses nearly all the\nexplicit knowledge needed for driving from theory to practice. In particular,\nwe conducted comprehensive tests on 15 LVLMs using IDKB to assess their\nreliability in the context of autonomous driving and provided extensive\nanalysis. We also fine-tuned popular models, achieving notable performance\nimprovements, which further validate the significance of our dataset. The\nproject page can be found at:\n\\url{https://4dvlab.github.io/project_page/idkb.html}\n","authors":["Yuhang Lu","Yichen Yao","Jiadong Tu","Jiangnan Shao","Yuexin Ma","Xinge Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.02914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02910v1","updated":"2024-09-04T17:49:54Z","published":"2024-09-04T17:49:54Z","title":"SITAR: Semi-supervised Image Transformer for Action Recognition","summary":" Recognizing actions from a limited set of labeled videos remains a challenge\nas annotating visual data is not only tedious but also can be expensive due to\nclassified nature. Moreover, handling spatio-temporal data using deep $3$D\ntransformers for this can introduce significant computational complexity. In\nthis paper, our objective is to address video action recognition in a\nsemi-supervised setting by leveraging only a handful of labeled videos along\nwith a collection of unlabeled videos in a compute efficient manner.\nSpecifically, we rearrange multiple frames from the input videos in row-column\nform to construct super images. Subsequently, we capitalize on the vast pool of\nunlabeled samples and employ contrastive learning on the encoded super images.\nOur proposed approach employs two pathways to generate representations for\ntemporally augmented super images originating from the same video.\nSpecifically, we utilize a 2D image-transformer to generate representations and\napply a contrastive loss function to minimize the similarity between\nrepresentations from different videos while maximizing the representations of\nidentical videos. Our method demonstrates superior performance compared to\nexisting state-of-the-art approaches for semi-supervised action recognition\nacross various benchmark datasets, all while significantly reducing\ncomputational costs.\n","authors":["Owais Iqbal","Omprakash Chakraborty","Aftab Hussain","Rameswar Panda","Abir Das"],"pdf_url":"https://arxiv.org/pdf/2409.02910v1.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2408.07832v3","updated":"2024-09-04T17:31:00Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. 
This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets. The code is\navailable (https://github.com/batmanlab/Ladder).\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13113v2","updated":"2024-09-04T17:29:04Z","published":"2024-03-19T19:36:48Z","title":"Quantifying uncertainty in lung cancer segmentation with foundation\n models applied to mixed-domain datasets","summary":" Medical image foundation models have shown the ability to segment organs and\ntumors with minimal fine-tuning. These models are typically evaluated on\ntask-specific in-distribution (ID) datasets. However, reliable performance on\nID dataset does not guarantee robust generalization on out-of-distribution\n(OOD) datasets. Importantly, once deployed for clinical use, it is impractical\nto have `ground truth' delineations to assess ongoing performance drifts,\nespecially when images fall into OOD category due to different imaging\nprotocols. Hence, we introduced a comprehensive set of computationally fast\nmetrics to evaluate the performance of multiple foundation models (Swin UNETR,\nSimMIM, iBOT, SMIT) trained with self-supervised learning (SSL). SSL\npretraining was selected as this approach is applicable for large, diverse, and\nunlabeled image sets. All models were fine-tuned on identical datasets for lung\ntumor segmentation from computed tomography (CT) scans. SimMIM, iBOT, and SMIT\nused identical architecture, pretraining, and fine-tuning datasets to assess\nperformance variations with the choice of pretext tasks used in SSL. Evaluation\nwas performed on two public lung cancer datasets (LRAD: n = 140, 5Rater: n =\n21) with different image acquisitions and tumor stage compared to training data\n(n = 317 public resource with stage III-IV lung cancers) and a public\nnon-cancer dataset containing volumetric CT scans of patients with pulmonary\nembolism (n = 120). All models produced similarly accurate tumor segmentation\non the lung cancer testing datasets. SMIT produced a highest F1-score (LRAD:\n0.60, 5Rater: 0.64) and lowest entropy (LRAD: 0.06, 5Rater: 0.12), indicating\nhigher tumor detection rate and confident segmentations. In the OOD dataset,\nSMIT misdetected least number of tumors, indicated by median volume occupancy\nof 5.67 cc compared to second best method SimMIM of 9.97 cc.\n","authors":["Aneesh Rangnekar","Nishant Nadkarni","Jue Jiang","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2403.13113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02889v1","updated":"2024-09-04T17:25:21Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. 
This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness. LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v1.pdf","comment":"19 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.02885v1","updated":"2024-09-04T17:15:44Z","published":"2024-09-04T17:15:44Z","title":"CanvOI, an Oncology Intelligence Foundation Model: Scaling FLOPS\n Differently","summary":" The rapidly evolving field of digital oncopathology faces significant\nchallenges, including the need to address diverse and complex clinical\nquestions, often involving rare conditions, with limited availability of\nlabeled data. These limitations hinder the development of robust AI-driven\ntools in the biomedical space, where accuracy in probabilistic determinations\nis of utmost importance. To address this, digital pathology foundation models\nhave begun to emerge, typically developed with the size and diversity of the\npre-training dataset and model parameters in mind. Here, we present CanvOI, a\nViT-g/10-based foundation model designed to enhance the capabilities of digital\npathology by addressing these challenges through a different approach.\nConsidering the unique nature of oncologic histopathological images and the\nrequirements from the embeddings to provide meaningful representations for\nMultiple Instance Learning (MIL) downstream models, we chose to modify the\ninput image characteristics. By introducing larger tile sizes (380 x 380\npixels) and smaller patch sizes (10 x 10 pixels), we were able to optimize the\nmodel's performance, pushing computational resources in a new direction and\nachieving state-of-the-art performance on cancer-related benchmarks. CanvOI\ndemonstrated a 1.5-7.4% improvement in averaged AUC compared to other leading\nfoundation models built for digital pathology. Moreover, our results\ndemonstrate that CanvOI significantly outperformed the other models, with the\nperformance gap widening substantially when trained on just 10% of the initial\ncohort. This work highlights an alternative approach that, if integrated with\ntraditional development approaches, has the potential to advance Oncology\nIntelligence (OI), overcome some of the current barriers and ultimately improve\nthe clinical outcome of cancer patients.\n","authors":["Jonathan Zalach","Inbal Gazy","Assaf Avinoam","Ron Sinai","Eran Shmuel","Inbar Gilboa","Christine Swisher","Naim Matasci","Reva Basho","David B. 
Agus"],"pdf_url":"https://arxiv.org/pdf/2409.02885v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02883v1","updated":"2024-09-04T17:08:04Z","published":"2024-09-04T17:08:04Z","title":"Multi-stream deep learning framework to predict mild cognitive\n impairment with Rey Complex Figure Test","summary":" Drawing tests like the Rey Complex Figure Test (RCFT) are widely used to\nassess cognitive functions such as visuospatial skills and memory, making them\nvaluable tools for detecting mild cognitive impairment (MCI). Despite their\nutility, existing predictive models based on these tests often suffer from\nlimitations like small sample sizes and lack of external validation, which\nundermine their reliability. We developed a multi-stream deep learning\nframework that integrates two distinct processing streams: a multi-head\nself-attention based spatial stream using raw RCFT images and a scoring stream\nemploying a previously developed automated scoring system. Our model was\ntrained on data from 1,740 subjects in the Korean cohort and validated on an\nexternal hospital dataset of 222 subjects from Korea. The proposed multi-stream\nmodel demonstrated superior performance over baseline models (AUC = 0.872,\nAccuracy = 0.781) in external validation. The integration of both spatial and\nscoring streams enables the model to capture intricate visual details from the\nraw images while also incorporating structured scoring data, which together\nenhance its ability to detect subtle cognitive impairments. This dual approach\nnot only improves predictive accuracy but also increases the robustness of the\nmodel, making it more reliable in diverse clinical settings. Our model has\npractical implications for clinical settings, where it could serve as a\ncost-effective tool for early MCI screening.\n","authors":["Junyoung Park","Eun Hyun Seo","Sunjun Kim","SangHak Yi","Kun Ho Lee","Sungho Won"],"pdf_url":"https://arxiv.org/pdf/2409.02883v1.pdf","comment":"20 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.02882v1","updated":"2024-09-04T17:07:46Z","published":"2024-09-04T17:07:46Z","title":"Benchmarking Spurious Bias in Few-Shot Image Classifiers","summary":" Few-shot image classifiers are designed to recognize and classify new data\nwith minimal supervision and limited data but often show reliance on spurious\ncorrelations between classes and spurious attributes, known as spurious bias.\nSpurious correlations commonly hold in certain samples and few-shot classifiers\ncan suffer from spurious bias induced from them. There is an absence of an\nautomatic benchmarking system to assess the robustness of few-shot classifiers\nagainst spurious bias. In this paper, we propose a systematic and rigorous\nbenchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied\ndegrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates\nfew-shot evaluation tasks with biased attributes so that using them for\npredictions can demonstrate poor performance. To construct these tasks, we\npropose attribute-based sample selection strategies based on a pre-trained\nvision-language model, eliminating the need for manual dataset curation. This\nallows FewSTAB to automatically benchmark spurious bias using any existing test\ndata. FewSTAB offers evaluation results in a new dimension along with a new\ndesign guideline for building robust classifiers. Moreover, it can benchmark\nspurious bias in varied degrees and enable designs for varied degrees of\nrobustness. 
Its effectiveness is demonstrated through experiments on ten\nfew-shot learning methods across three datasets. We hope our framework can\ninspire new designs of robust few-shot classifiers. Our code is available at\nhttps://github.com/gtzheng/FewSTAB.\n","authors":["Guangtao Zheng","Wenqian Ye","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02882v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02867v1","updated":"2024-09-04T16:50:48Z","published":"2024-09-04T16:50:48Z","title":"The Impact of Balancing Real and Synthetic Data on Accuracy and Fairness\n in Face Recognition","summary":" Over the recent years, the advancements in deep face recognition have fueled\nan increasing demand for large and diverse datasets. Nevertheless, the\nauthentic data acquired to create those datasets is typically sourced from the\nweb, which, in many cases, can lead to significant privacy issues due to the\nlack of explicit user consent. Furthermore, obtaining a demographically\nbalanced, large dataset is even more difficult because of the natural imbalance\nin the distribution of images from different demographic groups. In this paper,\nwe investigate the impact of demographically balanced authentic and synthetic\ndata, both individually and in combination, on the accuracy and fairness of\nface recognition models. Initially, several generative methods were used to\nbalance the demographic representations of the corresponding synthetic\ndatasets. Then a state-of-the-art face encoder was trained and evaluated using\n(combinations of) synthetic and authentic images. Our findings emphasized two\nmain points: (i) the increased effectiveness of training data generated by\ndiffusion-based models in enhancing accuracy, whether used alone or combined\nwith subsets of authentic data, and (ii) the minimal impact of incorporating\nbalanced data from pre-trained generative methods on fairness (in nearly all\ntested scenarios using combined datasets, fairness scores remained either\nunchanged or worsened, even when compared to unbalanced authentic datasets).\nSource code and data are available at \\url{https://cutt.ly/AeQy1K5G} for\nreproducibility.\n","authors":["Andrea Atzori","Pietro Cosseddu","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2409.02867v1.pdf","comment":"Accepted at Synthetic Data for Computer Vision Workshop - Side Event\n at ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02866v1","updated":"2024-09-04T16:47:16Z","published":"2024-09-04T16:47:16Z","title":"Hybrid-Segmentor: A Hybrid Approach to Automated Fine-Grained Crack\n Segmentation in Civil Infrastructure","summary":" Detecting and segmenting cracks in infrastructure, such as roads and\nbuildings, is crucial for safety and cost-effective maintenance. In spite of\nthe potential of deep learning, there are challenges in achieving precise\nresults and handling diverse crack types. With the proposed dataset and model,\nwe aim to enhance crack detection and infrastructure maintenance. We introduce\nHybrid-Segmentor, an encoder-decoder based approach that is capable of\nextracting both fine-grained local and global crack features. This allows the\nmodel to improve its generalization capabilities in distinguishing various\ntypes of shapes, surfaces, and sizes of cracks. 
To keep the computational performances\nlow for practical purposes, while maintaining the high the generalization\ncapabilities of the model, we incorporate a self-attention model at the encoder\nlevel, while reducing the complexity of the decoder component. The proposed\nmodel outperforms existing benchmark models across 5 quantitative metrics\n(accuracy 0.971, precision 0.804, recall 0.744, F1-score 0.770, and IoU score\n0.630), achieving state-of-the-art status.\n","authors":["June Moh Goo","Xenios Milidonis","Alessandro Artusi","Jan Boehm","Carlo Ciliberto"],"pdf_url":"https://arxiv.org/pdf/2409.02866v1.pdf","comment":"25 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.02851v1","updated":"2024-09-04T16:21:33Z","published":"2024-09-04T16:21:33Z","title":"Human-VDM: Learning Single-Image 3D Human Gaussian Splatting from Video\n Diffusion Models","summary":" Generating lifelike 3D humans from a single RGB image remains a challenging\ntask in computer vision, as it requires accurate modeling of geometry,\nhigh-quality texture, and plausible unseen parts. Existing methods typically\nuse multi-view diffusion models for 3D generation, but they often face\ninconsistent view issues, which hinder high-quality 3D human generation. To\naddress this, we propose Human-VDM, a novel method for generating 3D human from\na single RGB image using Video Diffusion Models. Human-VDM provides temporally\nconsistent views for 3D human generation using Gaussian Splatting. It consists\nof three modules: a view-consistent human video diffusion module, a video\naugmentation module, and a Gaussian Splatting module. First, a single image is\nfed into a human video diffusion module to generate a coherent human video.\nNext, the video augmentation module applies super-resolution and video\ninterpolation to enhance the textures and geometric smoothness of the generated\nvideo. Finally, the 3D Human Gaussian Splatting module learns lifelike humans\nunder the guidance of these high-resolution and view-consistent images.\nExperiments demonstrate that Human-VDM achieves high-quality 3D human from a\nsingle image, outperforming state-of-the-art methods in both generation quality\nand quantity. Project page: https://human-vdm.github.io/Human-VDM/\n","authors":["Zhibin Liu","Haoye Dong","Aviral Chharia","Hefeng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.02851v1.pdf","comment":"14 Pages, 8 figures, Project page:\n https://human-vdm.github.io/Human-VDM/"},{"id":"http://arxiv.org/abs/2409.02846v1","updated":"2024-09-04T16:17:45Z","published":"2024-09-04T16:17:45Z","title":"MaDis-Stereo: Enhanced Stereo Matching via Distilled Masked Image\n Modeling","summary":" In stereo matching, CNNs have traditionally served as the predominant\narchitectures. Although Transformer-based stereo models have been studied\nrecently, their performance still lags behind CNN-based stereo models due to\nthe inherent data scarcity issue in the stereo matching task. In this paper, we\npropose Masked Image Modeling Distilled Stereo matching model, termed\nMaDis-Stereo, that enhances locality inductive bias by leveraging Masked Image\nModeling (MIM) in training Transformer-based stereo model. Given randomly\nmasked stereo images as inputs, our method attempts to conduct both image\nreconstruction and depth prediction tasks. 
While this strategy is beneficial to\nresolving the data scarcity issue, the dual challenge of reconstructing masked\ntokens and subsequently performing stereo matching poses significant\nchallenges, particularly in terms of training stability. To address this, we\npropose to use an auxiliary network (teacher), updated via Exponential Moving\nAverage (EMA), along with the original stereo model (student), where teacher\npredictions serve as pseudo supervisory signals to effectively distill\nknowledge into the student model. State-of-the-arts performance is achieved\nwith the proposed method on several stereo matching such as ETH3D and KITTI\n2015. Additionally, to demonstrate that our model effectively leverages\nlocality inductive bias, we provide the attention distance measurement.\n","authors":["Jihye Ahn","Hyesong Choi","Soomin Kim","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02838v1","updated":"2024-09-04T16:06:23Z","published":"2024-09-04T16:06:23Z","title":"iConFormer: Dynamic Parameter-Efficient Tuning with Input-Conditioned\n Adaptation","summary":" Transfer learning based on full fine-tuning (FFT) of the pre-trained encoder\nand task-specific decoder becomes increasingly complex as deep models grow\nexponentially. Parameter efficient fine-tuning (PEFT) approaches using adapters\nconsisting of small learnable layers have emerged as an alternative to FFT,\nachieving comparable performance while maintaining high training efficiency.\nHowever, the inflexibility of the adapter with respect to input instances\nlimits its capability of learning task-specific information in diverse\ndownstream tasks. In this paper, we propose a novel PEFT approach,\ninput-Conditioned transFormer, termed iConFormer, that leverages a dynamic\nadapter conditioned on the input instances. To secure flexible learning ability\non input instances in various downstream tasks, we introduce an\ninput-Conditioned Network (iCoN) in the dynamic adapter that enables\ninstance-level feature transformation. To be specific, iCoN generates\nchannel-wise convolutional kernels for each feature and transform it using\nadaptive convolution process to effectively capture task-specific and\nfine-grained details tailor to downstream tasks. Experimental results\ndemonstrate that by tuning just 1.6% to 2.8% of the Transformer backbone\nparameters, iConFormer achieves performance comparable to FFT in monocular\ndepth estimation and semantic segmentation, while outperforming it in image\nclassification and instance segmentation. Also, the proposed method\nconsistently outperforms recent PEFT methods for all the tasks mentioned above.\n","authors":["Hayeon Jo","Hyesong Choi","Minhee Cho","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08784v2","updated":"2024-09-04T15:52:08Z","published":"2024-08-16T14:56:17Z","title":"Multi-task Learning Approach for Intracranial Hemorrhage Prognosis","summary":" Prognosis after intracranial hemorrhage (ICH) is influenced by a complex\ninterplay between imaging and tabular data. Rapid and reliable prognosis are\ncrucial for effective patient stratification and informed treatment\ndecision-making. In this study, we aim to enhance image-based prognosis by\nlearning a robust feature representation shared between prognosis and the\nclinical and demographic variables most highly correlated with it. 
Our approach\nmimics clinical decision-making by reinforcing the model to learn valuable\nprognostic data embedded in the image. We propose a 3D multi-task image model\nto predict prognosis, Glasgow Coma Scale and age, improving accuracy and\ninterpretability. Our method outperforms current state-of-the-art baseline\nimage models, and demonstrates superior performance in ICH prognosis compared\nto four board-certified neuroradiologists using only CT scans as input. We\nfurther validate our model with interpretability saliency maps. Code is\navailable at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git.\n","authors":["Miriam Cobo","Amaia Pérez del Barrio","Pablo Menéndez Fernández-Miranda","Pablo Sanz Bellón","Lara Lloret Iglesias","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2408.08784v2.pdf","comment":"16 pages. Accepted at Machine Learning in Medical Imaging Workshop @\n MICCAI 2024 (MLMI2024). This is the submitted manuscript with added link to\n github repo, funding acknowledgements and authors' names and affiliations. No\n further post submission improvements or corrections were integrated. Final\n version not published yet"},{"id":"http://arxiv.org/abs/2409.02828v1","updated":"2024-09-04T15:50:16Z","published":"2024-09-04T15:50:16Z","title":"ExpLLM: Towards Chain of Thought for Facial Expression Recognition","summary":" Facial expression recognition (FER) is a critical task in multimedia with\nsignificant implications across various domains. However, analyzing the causes\nof facial expressions is essential for accurately recognizing them. Current\napproaches, such as those based on facial action units (AUs), typically provide\nAU names and intensities but lack insight into the interactions and\nrelationships between AUs and the overall expression. In this paper, we propose\na novel method called ExpLLM, which leverages large language models to generate\nan accurate chain of thought (CoT) for facial expression recognition.\nSpecifically, we have designed the CoT mechanism from three key perspectives:\nkey observations, overall emotional interpretation, and conclusion. The key\nobservations describe the AU's name, intensity, and associated emotions. The\noverall emotional interpretation provides an analysis based on multiple AUs and\ntheir interactions, identifying the dominant emotions and their relationships.\nFinally, the conclusion presents the final expression label derived from the\npreceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed\nto construct this expression CoT and generate instruction-description data for\ntraining our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets\ndemonstrate that ExpLLM outperforms current state-of-the-art FER methods.\nExpLLM also surpasses the latest GPT-4o in expression CoT generation,\nparticularly in recognizing micro-expressions where GPT-4o frequently fails.\n","authors":["Xing Lan","Jian Xue","Ji Qi","Dongmei Jiang","Ke Lu","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2409.02828v1.pdf","comment":"project page: https://starhiking.github.io/ExpLLM_Page/"},{"id":"http://arxiv.org/abs/2409.02826v1","updated":"2024-09-04T15:45:32Z","published":"2024-09-04T15:45:32Z","title":"Automatic facial axes standardization of 3D fetal ultrasound images","summary":" Craniofacial anomalies indicate early developmental disturbances and are\nusually linked to many genetic syndromes. Early diagnosis is critical, yet\nultrasound (US) examinations often fail to identify these features. 
This study\npresents an AI-driven tool to assist clinicians in standardizing fetal facial\naxes/planes in 3D US, reducing sonographer workload and facilitating the facial\nevaluation. Our network, structured into three blocks-feature extractor,\nrotation and translation regression, and spatial transformer-processes three\northogonal 2D slices to estimate the necessary transformations for\nstandardizing the facial planes in the 3D US. These transformations are applied\nto the original 3D US using a differentiable module (the spatial transformer\nblock), yielding a standardized 3D US and the corresponding 2D facial standard\nplanes. The dataset used consists of 1180 fetal facial 3D US images acquired\nbetween weeks 20 and 35 of gestation. Results show that our network\nconsiderably reduces inter-observer rotation variability in the test set, with\na mean geodesic angle difference of 14.12$^{\\circ}$ $\\pm$ 18.27$^{\\circ}$ and\nan Euclidean angle error of 7.45$^{\\circ}$ $\\pm$ 14.88$^{\\circ}$. These\nfindings demonstrate the network's ability to effectively standardize facial\naxes, crucial for consistent fetal facial assessments. In conclusion, the\nproposed network demonstrates potential for improving the consistency and\naccuracy of fetal facial assessments in clinical settings, facilitating early\nevaluation of craniofacial anomalies.\n","authors":["Antonia Alomar","Ricardo Rubio","Laura Salort","Gerard Albaiges","Antoni Payà","Gemma Piella","Federico Sukno"],"pdf_url":"https://arxiv.org/pdf/2409.02826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02825v1","updated":"2024-09-04T15:43:10Z","published":"2024-09-04T15:43:10Z","title":"Deep Learning Meets Satellite Images -- An Evaluation on Handcrafted and\n Learning-based Features for Multi-date Satellite Stereo Images","summary":" A critical step in the digital surface models(DSM) generation is feature\nmatching. Off-track (or multi-date) satellite stereo images, in particular, can\nchallenge the performance of feature matching due to spectral distortions\nbetween images, long baseline, and wide intersection angles. Feature matching\nmethods have evolved over the years from handcrafted methods (e.g., SIFT) to\nlearning-based methods (e.g., SuperPoint and SuperGlue). In this paper, we\ncompare the performance of different features, also known as feature extraction\nand matching methods, applied to satellite imagery. A wide range of stereo\npairs(~500) covering two separate study sites are used. SIFT, as a widely used\nclassic feature extraction and matching algorithm, is compared with seven\ndeep-learning matching methods: SuperGlue, LightGlue, LoFTR, ASpanFormer, DKM,\nGIM-LightGlue, and GIM-DKM. 
Results demonstrate that traditional matching\nmethods are still competitive in this age of deep learning, although for\nparticular scenarios learning-based methods are very promising.\n","authors":["Shuang Song","Luca Morelli","Xinyi Wu","Rongjun Qin","Hessah Albanwan","Fabio Remondino"],"pdf_url":"https://arxiv.org/pdf/2409.02825v1.pdf","comment":"ECCV2024 Workshop - TradiCV"},{"id":"http://arxiv.org/abs/2408.10283v2","updated":"2024-09-04T15:36:52Z","published":"2024-08-19T00:31:05Z","title":"SDE-based Multiplicative Noise Removal","summary":" Multiplicative noise, also known as speckle or pepper noise, commonly affects\nimages produced by synthetic aperture radar (SAR), lasers, or optical lenses.\nUnlike additive noise, which typically arises from thermal processes or\nexternal factors, multiplicative noise is inherent to the system, originating\nfrom the fluctuation in diffuse reflections. These fluctuations result in\nmultiple copies of the same signal with varying magnitudes being combined.\nConsequently, despeckling, or removing multiplicative noise, necessitates\ndifferent techniques compared to those used for additive noise removal.\n In this paper, we propose a novel approach using Stochastic Differential\nEquations based diffusion models to address multiplicative noise. We\ndemonstrate that multiplicative noise can be effectively modeled as a Geometric\nBrownian Motion process in the logarithmic domain. Utilizing the Fokker-Planck\nequation, we derive the corresponding reverse process for image denoising. To\nvalidate our method, we conduct extensive experiments on two different\ndatasets, comparing our approach to both classical signal processing techniques\nand contemporary CNN-based noise removal models. Our results indicate that the\nproposed method significantly outperforms existing methods on perception-based\nmetrics such as FID and LPIPS, while maintaining competitive performance on\ntraditional metrics like PSNR and SSIM.\n","authors":["An Vuong","Thinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.10283v2.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.14279v2","updated":"2024-09-04T15:33:15Z","published":"2024-08-26T13:55:42Z","title":"Learning Local Pattern Modularization for Point Cloud Reconstruction\n from Unseen Classes","summary":" It is challenging to reconstruct 3D point clouds in unseen classes from\nsingle 2D images. Instead of object-centered coordinate system, current methods\ngeneralized global priors learned in seen classes to reconstruct 3D shapes from\nunseen classes in viewer-centered coordinate system. However, the\nreconstruction accuracy and interpretability are still eager to get improved.\nTo resolve this issue, we introduce to learn local pattern modularization for\nreconstructing 3D shapes in unseen classes, which achieves both good\ngeneralization ability and high reconstruction accuracy. Our insight is to\nlearn a local prior which is class-agnostic and easy to generalize in\nobject-centered coordinate system. Specifically, the local prior is learned via\na process of learning and customizing local pattern modularization in seen\nclasses. During this process, we first learn a set of patterns in local\nregions, which is the basis in the object-centered coordinate system to\nrepresent an arbitrary region on shapes across different classes. Then, we\nmodularize each region on an initially reconstructed shape using the learned\nlocal patterns. 
Based on that, we customize the local pattern modularization\nusing the input image by refining the reconstruction with more details. Our\nmethod enables to reconstruct high fidelity point clouds from unseen classes in\nobject-centered coordinate system without requiring a large number of patterns\nor any additional information, such as segmentation supervision or camera\nposes. Our experimental results under widely used benchmarks show that our\nmethod achieves the state-of-the-art reconstruction accuracy for shapes from\nunseen classes. The code is available at https://github.com/chenchao15/Unseen.\n","authors":["Chao Chen","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2408.14279v2.pdf","comment":"14pages, 11figures, accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02813v1","updated":"2024-09-04T15:31:26Z","published":"2024-09-04T15:31:26Z","title":"MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding\n Benchmark","summary":" This paper introduces MMMU-Pro, a robust version of the Massive\nMulti-discipline Multimodal Understanding and Reasoning (MMMU) benchmark.\nMMMU-Pro rigorously assesses multimodal models' true understanding and\nreasoning capabilities through a three-step process based on MMMU: (1)\nfiltering out questions answerable by text-only models, (2) augmenting\ncandidate options, and (3) introducing a vision-only input setting where\nquestions are embedded within images. This setting challenges AI to truly \"see\"\nand \"read\" simultaneously, testing a fundamental human cognitive skill of\nseamlessly integrating visual and textual information. Results show that model\nperformance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8%\nto 26.9% across models. We explore the impact of OCR prompts and Chain of\nThought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT\ngenerally improves performance. MMMU-Pro provides a more rigorous evaluation\ntool, closely mimicking real-world scenarios and offering valuable directions\nfor future research in multimodal AI.\n","authors":["Xiang Yue","Tianyu Zheng","Yuansheng Ni","Yubo Wang","Kai Zhang","Shengbang Tong","Yuxuan Sun","Ming Yin","Botao Yu","Ge Zhang","Huan Sun","Yu Su","Wenhu Chen","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2409.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01021v2","updated":"2024-09-04T15:25:27Z","published":"2024-09-02T08:01:32Z","title":"CONDA: Condensed Deep Association Learning for Co-Salient Object\n Detection","summary":" Inter-image association modeling is crucial for co-salient object detection.\nDespite satisfactory performance, previous methods still have limitations on\nsufficient inter-image association modeling. Because most of them focus on\nimage feature optimization under the guidance of heuristically calculated raw\ninter-image associations. They directly rely on raw associations which are not\nreliable in complex scenarios, and their image feature optimization approach is\nnot explicit for inter-image association modeling. To alleviate these\nlimitations, this paper proposes a deep association learning strategy that\ndeploys deep networks on raw associations to explicitly transform them into\ndeep association features. Specifically, we first create hyperassociations to\ncollect dense pixel-pair-wise raw associations and then deploys deep\naggregation networks on them. 
We design a progressive association generation\nmodule for this purpose with additional enhancement of the hyperassociation\ncalculation. More importantly, we propose a correspondence-induced association\ncondensation module that introduces a pretext task, i.e. semantic\ncorrespondence estimation, to condense the hyperassociations for computational\nburden reduction and noise elimination. We also design an object-aware cycle\nconsistency loss for high-quality correspondence estimations. Experimental\nresults in three benchmark datasets demonstrate the remarkable effectiveness of\nour proposed method with various training settings.\n","authors":["Long Li","Nian Liu","Dingwen Zhang","Zhongyu Li","Salman Khan","Rao Anwer","Hisham Cholakkal","Junwei Han","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2409.01021v2.pdf","comment":"There is an error. In Sec 4.1, the number of images in some dataset\n is incorrect and needs to be revised"},{"id":"http://arxiv.org/abs/2308.13495v3","updated":"2024-09-04T15:12:03Z","published":"2023-08-25T17:10:22Z","title":"Open Gaze: Open Source eye tracker for smartphone devices using Deep\n Learning","summary":" Eye tracking has been a pivotal tool in diverse fields such as vision\nresearch, language analysis, and usability assessment. The majority of prior\ninvestigations, however, have concentrated on expansive desktop displays\nemploying specialized, costly eye tracking hardware that lacks scalability.\nRemarkably little insight exists into ocular movement patterns on smartphones,\ndespite their widespread adoption and significant usage. In this manuscript, we\npresent an open-source implementation of a smartphone-based gaze tracker that\nemulates the methodology proposed by a GooglePaper (whose source code remains\nproprietary). Our focus is on attaining accuracy comparable to that attained\nthrough the GooglePaper's methodology, without the necessity for supplementary\nhardware. Through the integration of machine learning techniques, we unveil an\naccurate eye tracking solution that is native to smartphones. Our approach\ndemonstrates precision akin to the state-of-the-art mobile eye trackers, which\nare characterized by a cost that is two orders of magnitude higher. Leveraging\nthe vast MIT GazeCapture dataset, which is available through registration on\nthe dataset's website, we successfully replicate crucial findings from previous\nstudies concerning ocular motion behavior in oculomotor tasks and saliency\nanalyses during natural image observation. Furthermore, we emphasize the\napplicability of smartphone-based gaze tracking in discerning reading\ncomprehension challenges. Our findings exhibit the inherent potential to\namplify eye movement research by significant proportions, accommodating\nparticipation from thousands of subjects with explicit consent. This\nscalability not only fosters advancements in vision research, but also extends\nits benefits to domains such as accessibility enhancement and healthcare\napplications.\n","authors":["Sushmanth reddy","Jyothi Swaroop Reddy"],"pdf_url":"https://arxiv.org/pdf/2308.13495v3.pdf","comment":"This paper results are incorrectly reported. 
The paper is not\n authentic and conclusions are not correct"},{"id":"http://arxiv.org/abs/2311.12912v3","updated":"2024-09-04T15:09:32Z","published":"2023-11-21T17:27:20Z","title":"Q-Seg: Quantum Annealing-Based Unsupervised Image Segmentation","summary":" We present Q-Seg, a novel unsupervised image segmentation method based on\nquantum annealing, tailored for existing quantum hardware. We formulate the\npixel-wise segmentation problem, which assimilates spectral and spatial\ninformation of the image, as a graph-cut optimization task. Our method\nefficiently leverages the interconnected qubit topology of the D-Wave Advantage\ndevice, offering superior scalability over existing quantum approaches and\noutperforming several tested state-of-the-art classical methods. Empirical\nevaluations on synthetic datasets have shown that Q-Seg has better runtime\nperformance than the state-of-the-art classical optimizer Gurobi. The method\nhas also been tested on earth observation image segmentation, a critical area\nwith noisy and unreliable annotations. In the era of noisy intermediate-scale\nquantum, Q-Seg emerges as a reliable contender for real-world applications in\ncomparison to advanced techniques like Segment Anything. Consequently, Q-Seg\noffers a promising solution using available quantum hardware, especially in\nsituations constrained by limited labeled data and the need for efficient\ncomputational runtime.\n","authors":["Supreeth Mysore Venkatesh","Antonio Macaluso","Marlon Nuske","Matthias Klusch","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2311.12912v3.pdf","comment":"12 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.02792v1","updated":"2024-09-04T15:06:44Z","published":"2024-09-04T15:06:44Z","title":"UnLearning from Experience to Avoid Spurious Correlations","summary":" While deep neural networks can achieve state-of-the-art performance in many\ntasks, these models are more fragile than they appear. They are prone to\nlearning spurious correlations in their training data, leading to surprising\nfailure cases. In this paper, we propose a new approach that addresses the\nissue of spurious correlations: UnLearning from Experience (ULE). Our method is\nbased on using two classification models trained in parallel: student and\nteacher models. Both models receive the same batches of training data. The\nstudent model is trained with no constraints and pursues the spurious\ncorrelations in the data. The teacher model is trained to solve the same\nclassification problem while avoiding the mistakes of the student model. As\ntraining is done in parallel, the better the student model learns the spurious\ncorrelations, the more robust the teacher model becomes. The teacher model uses\nthe gradient of the student's output with respect to its input to unlearn\nmistakes made by the student. 
We show that our method is effective on the\nWaterbirds, CelebA, Spawrious and UrbanCars datasets.\n","authors":["Jeff Mitchell","Jesús Martínez del Rincón","Niall McLaughlin"],"pdf_url":"https://arxiv.org/pdf/2409.02792v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2401.11421v2","updated":"2024-09-04T15:01:15Z","published":"2024-01-21T07:57:04Z","title":"Enhancing the vision-language foundation model with key semantic\n knowledge-emphasized report refinement","summary":" Recently, vision-language representation learning has made remarkable\nadvancements in building up medical foundation models, holding immense\npotential for transforming the landscape of clinical research and medical care.\nThe underlying hypothesis is that the rich knowledge embedded in radiology\nreports can effectively assist and guide the learning process, reducing the\nneed for additional labels. However, these reports tend to be complex and\nsometimes even consist of redundant descriptions that make the representation\nlearning too challenging to capture the key semantic information. This paper\ndevelops a novel iterative vision-language representation learning framework by\nproposing a key semantic knowledge-emphasized report refinement method.\nParticularly, raw radiology reports are refined to highlight the key\ninformation according to a constructed clinical dictionary and two\nmodel-optimized knowledge-enhancement metrics. The iterative framework is\ndesigned to progressively learn, starting from gaining a general understanding\nof the patient's condition based on raw reports and gradually refines and\nextracts critical information essential to the fine-grained analysis tasks. The\neffectiveness of the proposed framework is validated on various downstream\nmedical image analysis tasks, including disease classification,\nregion-of-interest segmentation, and phrase grounding. Our framework surpasses\nseven state-of-the-art methods in both fine-tuning and zero-shot settings,\ndemonstrating its encouraging potential for different clinical applications.\n","authors":["Weijian Huang","Cheng Li","Hao Yang","Jiarun Liu","Yong Liang","Hairong Zheng","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08381v2","updated":"2024-09-04T14:52:59Z","published":"2024-08-15T18:54:31Z","title":"Pre-processing and Compression: Understanding Hidden Representation\n Refinement Across Imaging Domains via Intrinsic Dimension","summary":" In recent years, there has been interest in how geometric properties such as\nintrinsic dimension (ID) of a neural network's hidden representations change\nthrough its layers, and how such properties are predictive of important model\nbehavior such as generalization ability. However, evidence has begun to emerge\nthat such behavior can change significantly depending on the domain of the\nnetwork's training data, such as natural versus medical images. Here, we\nfurther this inquiry by exploring how the ID of a network's learned\nrepresentations changes through its layers, in essence, characterizing how the\nnetwork successively refines the information content of input data to be used\nfor predictions. Analyzing eleven natural and medical image datasets across six\nnetwork architectures, we find that how ID changes through the network differs\nnoticeably between natural and medical image models. 
Specifically, medical\nimage models peak in representation ID earlier in the network, implying a\ndifference in the image features and their abstractness that are typically used\nfor downstream tasks in these domains. Additionally, we discover a strong\ncorrelation of this peak representation ID with the ID of the data in its input\nspace, implying that the intrinsic information content of a model's learned\nrepresentations is guided by that of the data it was trained on. Overall, our\nfindings emphasize notable discrepancies in network behavior between natural\nand non-natural imaging domains regarding hidden representation information\ncontent, and provide further insights into how a network's learned features are\nshaped by its training data.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.08381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02770v1","updated":"2024-09-04T14:49:35Z","published":"2024-09-04T14:49:35Z","title":"Validation of musculoskeletal segmentation model with uncertainty\n estimation for bone and muscle assessment in hip-to-knee clinical CT images","summary":" Deep learning-based image segmentation has allowed for the fully automated,\naccurate, and rapid analysis of musculoskeletal (MSK) structures from medical\nimages. However, current approaches were either applied only to 2D\ncross-sectional images, addressed few structures, or were validated on small\ndatasets, which limit the application in large-scale databases. This study\naimed to validate an improved deep learning model for volumetric MSK\nsegmentation of the hip and thigh with uncertainty estimation from clinical\ncomputed tomography (CT) images. Databases of CT images from multiple\nmanufacturers/scanners, disease status, and patient positioning were used. The\nsegmentation accuracy, and accuracy in estimating the structures volume and\ndensity, i.e., mean HU, were evaluated. An approach for segmentation failure\ndetection based on predictive uncertainty was also investigated. The model has\nshown an overall improvement with respect to all segmentation accuracy and\nstructure volume/density evaluation metrics. The predictive uncertainty yielded\nlarge areas under the receiver operating characteristic (AUROC) curves\n(AUROCs>=.95) in detecting inaccurate and failed segmentations. The high\nsegmentation and muscle volume/density estimation accuracy, along with the high\naccuracy in failure detection based on the predictive uncertainty, exhibited\nthe model's reliability for analyzing individual MSK structures in large-scale\nCT databases.\n","authors":["Mazen Soufi","Yoshito Otake","Makoto Iwasa","Keisuke Uemura","Tomoki Hakotani","Masahiro Hashimoto","Yoshitake Yamada","Minoru Yamada","Yoichi Yokoyama","Masahiro Jinzaki","Suzushi Kusano","Masaki Takao","Seiji Okada","Nobuhiko Sugano","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2409.02770v1.pdf","comment":"29 pages, 7+10supp figures, 8 tables"},{"id":"http://arxiv.org/abs/2408.11571v2","updated":"2024-09-04T14:38:03Z","published":"2024-08-21T12:27:36Z","title":"CHOTA: A Higher Order Accuracy Metric for Cell Tracking","summary":" The evaluation of cell tracking results steers the development of tracking\nmethods, significantly impacting biomedical research. This is quantitatively\nachieved by means of evaluation metrics. Unfortunately, current metrics favor\nlocal correctness and weakly reward global coherence, impeding high-level\nbiological analysis. 
To also foster global coherence, we propose the CHOTA\nmetric (Cell-specific Higher Order Tracking Accuracy) which unifies the\nevaluation of all relevant aspects of cell tracking: cell detections and local\nassociations, global coherence, and lineage tracking. We achieve this by\nintroducing a new definition of the term 'trajectory' that includes the entire\ncell lineage and by including this into the well-established HOTA metric from\ngeneral multiple object tracking. Furthermore, we provide a detailed survey of\ncontemporary cell tracking metrics to compare our novel CHOTA metric and to\nshow its advantages. All metrics are extensively evaluated on state-of-the-art\nreal-data cell tracking results and synthetic results that simulate specific\ntracking errors. We show that CHOTA is sensitive to all tracking errors and\ngives a good indication of the biologically relevant capability of a method to\nreconstruct the full lineage of cells. It introduces a robust and comprehensive\nalternative to the currently used metrics in cell tracking. Python code is\navailable at https://github.com/CellTrackingChallenge/py-ctcmetrics .\n","authors":["Timo Kaiser","Vladimir Ulman","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2408.11571v2.pdf","comment":"Accepted at BIC Workshop at European Conference on Computer Vision\n 2024, 14 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.02699v1","updated":"2024-09-04T13:35:15Z","published":"2024-09-04T13:35:15Z","title":"CLDA: Collaborative Learning for Enhanced Unsupervised Domain Adaptation","summary":" Unsupervised Domain Adaptation (UDA) endeavors to bridge the gap between a\nmodel trained on a labeled source domain and its deployment in an unlabeled\ntarget domain. However, current high-performance models demand significant\nresources, resulting in prohibitive deployment costs and highlighting the need\nfor small yet effective models. For UDA of lightweight models, Knowledge\nDistillation (KD) in a Teacher-Student framework can be a common approach, but\nwe find that domain shift in UDA leads to a significant increase in non-salient\nparameters in the teacher model, degrading model's generalization ability and\ntransferring misleading information to the student model. Interestingly, we\nobserved that this phenomenon occurs considerably less in the student model.\nDriven by this insight, we introduce Collaborative Learning, a method that\nupdates the teacher's non-salient parameters using the student model and at the\nsame time enhance the student's performance using the updated teacher model.\nExperiments across various tasks and datasets show consistent performance\nimprovements for both student and teacher models. For example, in semantic\nsegmentation, CLDA achieves an improvement of +0.7% mIoU for teacher and +1.4%\nmIoU for student compared to the baseline model in the GTA to Cityscapes. In\nthe Synthia to Cityscapes, it achieves an improvement of +0.8% mIoU for teacher\nand +2.0% mIoU for student.\n","authors":["Minhee Cho","Hyesong Choi","Hayeon Jo","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02683v1","updated":"2024-09-04T13:15:10Z","published":"2024-09-04T13:15:10Z","title":"Rethinking HTG Evaluation: Bridging Generation and Recognition","summary":" The evaluation of generative models for natural image tasks has been\nextensively studied. 
Similar protocols and metrics are used in cases with\nunique particularities, such as Handwriting Generation, even if they might not\nbe completely appropriate. In this work, we introduce three measures tailored\nfor HTG evaluation, $ \\text{HTG}_{\\text{HTR}} $, $ \\text{HTG}_{\\text{style}} $,\nand $ \\text{HTG}_{\\text{OOV}} $, and argue that they are more expedient to\nevaluate the quality of generated handwritten images. The metrics rely on the\nrecognition error/accuracy of Handwriting Text Recognition and Writer\nIdentification models and emphasize writing style, textual content, and\ndiversity as the main aspects that adhere to the content of handwritten images.\nWe conduct comprehensive experiments on the IAM handwriting database,\nshowcasing that widely used metrics such as FID fail to properly quantify the\ndiversity and the practical utility of generated handwriting samples. Our\nfindings show that our metrics are richer in information and underscore the\nnecessity of standardized evaluation protocols in HTG. The proposed metrics\nprovide a more robust and informative protocol for assessing HTG quality,\ncontributing to improved performance in HTR. Code for the evaluation protocol\nis available at: https://github.com/koninik/HTG_evaluation.\n","authors":["Konstantina Nikolaidou","George Retsinas","Giorgos Sfikas","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2409.02683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02676v1","updated":"2024-09-04T13:06:40Z","published":"2024-09-04T13:06:40Z","title":"Improved Single Camera BEV Perception Using Multi-Camera Training","summary":" Bird's Eye View (BEV) map prediction is essential for downstream autonomous\ndriving tasks like trajectory prediction. In the past, this was accomplished\nthrough the use of a sophisticated sensor configuration that captured a\nsurround view from multiple cameras. However, in large-scale production, cost\nefficiency is an optimization goal, so that using fewer cameras becomes more\nrelevant. But the consequence of fewer input images correlates with a\nperformance drop. This raises the problem of developing a BEV perception model\nthat provides a sufficient performance on a low-cost sensor setup. Although,\nprimarily relevant for inference time on production cars, this cost restriction\nis less problematic on a test vehicle during training. Therefore, the objective\nof our approach is to reduce the aforementioned performance drop as much as\npossible using a modern multi-camera surround view model reduced for\nsingle-camera inference. The approach includes three features, a modern masking\ntechnique, a cyclic Learning Rate (LR) schedule, and a feature reconstruction\nloss for supervising the transition from six-camera inputs to one-camera input\nduring training. 
Our method outperforms versions trained strictly with one\ncamera or strictly with six-camera surround view for single-camera inference\nresulting in reduced hallucination and better quality of the BEV map.\n","authors":["Daniel Busch","Ido Freeman","Richard Meyes","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.02676v1.pdf","comment":"This Paper has been accepted to the 27th IEEE International\n Conference on Intelligent Transportation Systems (ITSC 2024)"},{"id":"http://arxiv.org/abs/2409.02675v1","updated":"2024-09-04T13:05:00Z","published":"2024-09-04T13:05:00Z","title":"Multi-Head Attention Residual Unfolded Network for Model-Based\n Pansharpening","summary":" The objective of pansharpening and hypersharpening is to accurately combine a\nhigh-resolution panchromatic (PAN) image with a low-resolution multispectral\n(MS) or hyperspectral (HS) image, respectively. Unfolding fusion methods\nintegrate the powerful representation capabilities of deep learning with the\nrobustness of model-based approaches. These techniques involve unrolling the\nsteps of the optimization scheme derived from the minimization of an energy\ninto a deep learning framework, resulting in efficient and highly interpretable\narchitectures. In this paper, we propose a model-based deep unfolded method for\nsatellite image fusion. Our approach is based on a variational formulation that\nincorporates the classic observation model for MS/HS data, a high-frequency\ninjection constraint based on the PAN image, and an arbitrary convex prior. For\nthe unfolding stage, we introduce upsampling and downsampling layers that use\ngeometric information encoded in the PAN image through residual networks. The\nbackbone of our method is a multi-head attention residual network (MARNet),\nwhich replaces the proximity operator in the optimization scheme and combines\nmultiple head attentions with residual learning to exploit image\nself-similarities via nonlocal operators defined in terms of patches.\nAdditionally, we incorporate a post-processing module based on the MARNet\narchitecture to further enhance the quality of the fused images. Experimental\nresults on PRISMA, Quickbird, and WorldView2 datasets demonstrate the superior\nperformance of our method and its ability to generalize across different sensor\nconfigurations and varying spatial and spectral resolutions. The source code\nwill be available at https://github.com/TAMI-UIB/MARNet.\n","authors":["Ivan Pereira-Sánchez","Eloi Sans","Julia Navarro","Joan Duran"],"pdf_url":"https://arxiv.org/pdf/2409.02675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00917v2","updated":"2024-09-04T13:04:03Z","published":"2024-09-02T03:15:19Z","title":"Large Scale Unsupervised Brain MRI Image Registration Solution for\n Learn2Reg 2024","summary":" In this paper, we summarize the methods and experimental results we proposed\nfor Task 2 in the learn2reg 2024 Challenge. This task focuses on unsupervised\nregistration of anatomical structures in brain MRI images between different\npatients. The difficulty lies in: (1) without segmentation labels, and (2) a\nlarge amount of data. To address these challenges, we built an efficient\nbackbone network and explored several schemes to further enhance registration\naccuracy. Under the guidance of the NCC loss function and smoothness\nregularization loss function, we obtained a smooth and reasonable deformation\nfield. According to the leaderboard, our method achieved a Dice coefficient of\n77.34%, which is 1.4% higher than the TransMorph. 
Overall, we won second place\non the leaderboard for Task 2.\n","authors":["Yuxi Zhang","Xiang Chen","Jiazheng Wang","Min Liu","Yaonan Wang","Dongdong Liu","Renjiu Hu","Hang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.00917v2.pdf","comment":"MICCAI Learn2Reg 2024 Challenge & WBIR 2024 Workshop on Biomedical\n Imaging Registration"},{"id":"http://arxiv.org/abs/2405.11614v2","updated":"2024-09-04T13:02:15Z","published":"2024-05-19T17:09:43Z","title":"Nickel and Diming Your GAN: A Dual-Method Approach to Enhancing GAN\n Efficiency via Knowledge Distillation","summary":" In this paper, we address the challenge of compressing generative adversarial\nnetworks (GANs) for deployment in resource-constrained environments by\nproposing two novel methodologies: Distribution Matching for Efficient\ncompression (DiME) and Network Interactive Compression via Knowledge Exchange\nand Learning (NICKEL). DiME employs foundation models as embedding kernels for\nefficient distribution matching, leveraging maximum mean discrepancy to\nfacilitate effective knowledge distillation. Simultaneously, NICKEL employs an\ninteractive compression method that enhances the communication between the\nstudent generator and discriminator, achieving a balanced and stable\ncompression process. Our comprehensive evaluation on the StyleGAN2 architecture\nwith the FFHQ dataset shows the effectiveness of our approach, with NICKEL &\nDiME achieving FID scores of 10.45 and 15.93 at compression rates of 95.73% and\n98.92%, respectively. Remarkably, our methods sustain generative quality even\nat an extreme compression rate of 99.69%, surpassing the previous\nstate-of-the-art performance by a large margin. These findings not only\ndemonstrate our methodologies' capacity to significantly lower GANs'\ncomputational demands but also pave the way for deploying high-quality GAN\nmodels in settings with limited resources. Our code will be released soon.\n","authors":["Sangyeop Yeo","Yoojin Jang","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2405.11614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01821v2","updated":"2024-09-04T12:58:11Z","published":"2024-09-03T12:03:45Z","title":"When Does Visual Prompting Outperform Linear Probing for Vision-Language\n Models? A Likelihood Perspective","summary":" Adapting pre-trained models to new tasks can exhibit varying effectiveness\nacross datasets. Visual prompting, a state-of-the-art parameter-efficient\ntransfer learning method, can significantly improve the performance of\nout-of-distribution tasks. On the other hand, linear probing, a standard\ntransfer learning method, can sometimes become the best approach. We propose a\nlog-likelihood ratio (LLR) approach to analyze the comparative benefits of\nvisual prompting and linear probing. By employing the LLR score alongside\nresource-efficient visual prompts approximations, our cost-effective measure\nattains up to a 100-fold reduction in run time compared to full training, while\nachieving prediction accuracies up to 91%. 
The source code is available at\nhttps://github.com/IBM/VP-LLR.\n","authors":["Hsi-Ai Tsao","Lei Hsiung","Pin-Yu Chen","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2409.01821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02664v1","updated":"2024-09-04T12:46:30Z","published":"2024-09-04T12:46:30Z","title":"Standing on the Shoulders of Giants: Reprogramming Visual-Language Model\n for General Deepfake Detection","summary":" The proliferation of deepfake faces poses huge potential negative impacts on\nour daily lives. Despite substantial advancements in deepfake detection over\nthese years, the generalizability of existing methods against forgeries from\nunseen datasets or created by emerging generative models remains constrained.\nIn this paper, inspired by the zero-shot advantages of Vision-Language Models\n(VLMs), we propose a novel approach that repurposes a well-trained VLM for\ngeneral deepfake detection. Motivated by the model reprogramming paradigm that\nmanipulates the model prediction via data perturbations, our method can\nreprogram a pretrained VLM model (e.g., CLIP) solely based on manipulating its\ninput without tuning the inner parameters. Furthermore, we insert a pseudo-word\nguided by facial identity into the text prompt. Extensive experiments on\nseveral popular benchmarks demonstrate that (1) the cross-dataset and\ncross-manipulation performances of deepfake detection can be significantly and\nconsistently improved (e.g., over 88% AUC in cross-dataset setting from FF++ to\nWildDeepfake) using a pre-trained CLIP model with our proposed reprogramming\nmethod; (2) our superior performances are at less cost of trainable parameters,\nmaking it a promising approach for real-world applications.\n","authors":["Kaiqing Lin","Yuzhen Lin","Weixiang Li","Taiping Yao","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2409.02664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02657v1","updated":"2024-09-04T12:30:25Z","published":"2024-09-04T12:30:25Z","title":"PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for\n One-Shot Talking Head Generation","summary":" While previous audio-driven talking head generation (THG) methods generate\nhead poses from driving audio, the generated poses or lips cannot match the\naudio well or are not editable. In this study, we propose \\textbf{PoseTalk}, a\nTHG system that can freely generate lip-synchronized talking head videos with\nfree head poses conditioned on text prompts and audio. The core insight of our\nmethod is using head pose to connect visual, linguistic, and audio signals.\nFirst, we propose to generate poses from both audio and text prompts, where the\naudio offers short-term variations and rhythm correspondence of the head\nmovements and the text prompts describe the long-term semantics of head\nmotions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to\ngenerate motion latent from text prompts and audio cues in a pose latent space.\nSecond, we observe a loss-imbalance problem: the loss for the lip region\ncontributes less than 4\\% of the total reconstruction loss caused by both pose\nand lip, making optimization lean towards head movements rather than lip\nshapes. To address this issue, we propose a refinement-based learning strategy\nto synthesize natural talking videos using two cascaded networks, i.e.,\nCoarseNet, and RefineNet. 
The CoarseNet estimates coarse motions to produce\nanimated images in novel poses and the RefineNet focuses on learning finer lip\nmotions by progressively estimating lip motions from low-to-high resolutions,\nyielding improved lip-synchronization performance. Experiments demonstrate our\npose prediction strategy achieves better pose diversity and realness compared\nto text-only or audio-only, and our video generator model outperforms\nstate-of-the-art methods in synthesizing talking videos with natural head\nmotions. Project: https://junleen.github.io/projects/posetalk.\n","authors":["Jun Ling","Yiwen Wang","Han Xue","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2409.02657v1.pdf","comment":"7+5 pages, 15 figures"},{"id":"http://arxiv.org/abs/2409.02653v1","updated":"2024-09-04T12:28:44Z","published":"2024-09-04T12:28:44Z","title":"Skip-and-Play: Depth-Driven Pose-Preserved Image Generation for Any\n Objects","summary":" The emergence of diffusion models has enabled the generation of diverse\nhigh-quality images solely from text, prompting subsequent efforts to enhance\nthe controllability of these models. Despite the improvement in\ncontrollability, pose control remains limited to specific objects (e.g.,\nhumans) or poses (e.g., frontal view) due to the fact that pose is generally\ncontrolled via camera parameters (e.g., rotation angle) or keypoints (e.g.,\neyes, nose). Specifically, camera parameters-conditional pose control models\ngenerate unrealistic images depending on the object, owing to the small size of\n3D datasets for training. Also, keypoint-based approaches encounter challenges\nin acquiring reliable keypoints for various objects (e.g., church) or poses\n(e.g., back view). To address these limitations, we propose depth-based pose\ncontrol, as depth maps are easily obtainable from a single depth estimation\nmodel regardless of objects and poses, unlike camera parameters and keypoints.\nHowever, depth-based pose control confronts issues of shape dependency, as\ndepth maps influence not only the pose but also the shape of the generated\nimages. To tackle this issue, we propose Skip-and-Play (SnP), designed via\nanalysis of the impact of three components of depth-conditional ControlNet on\nthe pose and the shape of the generated images. To be specific, based on the\nanalysis, we selectively skip parts of the components to mitigate shape\ndependency on the depth map while preserving the pose. Through various\nexperiments, we demonstrate the superiority of SnP over baselines and showcase\nthe ability of SnP to generate images of diverse objects and poses. Remarkably,\nSnP exhibits the ability to generate images even when the objects in the\ncondition (e.g., a horse) and the prompt (e.g., a hedgehog) differ from each\nother.\n","authors":["Kyungmin Jo","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2409.02653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02648v1","updated":"2024-09-04T12:26:19Z","published":"2024-09-04T12:26:19Z","title":"Creating a Microstructure Latent Space with Rich Material Information\n for Multiphase Alloy Design","summary":" The intricate microstructure serves as the cornerstone for the\ncomposition/processing-structure-property (CPSP) connection in multiphase\nalloys. 
Traditional alloy design methods often overlook microstructural\ndetails, which diminishes the reliability and effectiveness of the outcomes.\nThis study introduces an improved alloy design algorithm that integrates\nauthentic microstructural information to establish precise CPSP relationships.\nThe approach utilizes a deep-learning framework based on a variational\nautoencoder to map real microstructural data to a latent space, enabling the\nprediction of composition, processing steps, and material properties from the\nlatent space vector. By integrating this deep learning model with a specific\nsampling strategy in the latent space, a novel, microstructure-centered\nalgorithm for multiphase alloy design is developed. This algorithm is\ndemonstrated through the design of a unified dual-phase steel, and the results\nare assessed at three performance levels. Moreover, an exploration into the\nlatent vector space of the model highlights its seamless interpolation ability\nand its rich material information content. Notably, the current configuration\nof the latent space is particularly advantageous for alloy design, offering an\nexhaustive representation of microstructure, composition, processing, and\nproperty variations essential for multiphase alloys.\n","authors":["Xudong Ma","Yuqi Zhang","Chenchong Wang","Ming Wang","Mingxin Huang","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2409.02648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02647v1","updated":"2024-09-04T12:23:47Z","published":"2024-09-04T12:23:47Z","title":"Learning-Based Error Detection System for Advanced Vehicle Instrument\n Cluster Rendering","summary":" The automotive industry is currently expanding digital display options with\nevery new model that comes onto the market. This entails not just an expansion\nin dimensions, resolution, and customization choices, but also the capability\nto employ novel display effects like overlays while assembling the content of\nthe display cluster. Unfortunately, this raises the need for appropriate\nmonitoring systems that can detect rendering errors and apply appropriate\ncountermeasures when required. Classical solutions such as Cyclic Redundancy\nChecks (CRC) will soon be no longer viable as any sort of alpha blending,\nwarping of scaling of content can cause unwanted CRC violations. Therefore, we\npropose a novel monitoring approach to verify correctness of displayed content\nusing telltales (e.g. warning signs) as example. It uses a learning-based\napproach to separate \"good\" telltales, i.e. those that a human driver will\nunderstand correctly, and \"corrupted\" telltales, i.e. those that will not be\nvisible or perceived correctly. As a result, it possesses inherent resilience\nagainst individual pixel errors and implicitly supports changing backgrounds,\noverlay or scaling effects. This is underlined by our experimental study where\nall \"corrupted\" test patterns were correctly classified, while no false alarms\nwere triggered.\n","authors":["Cornelius Bürkle","Fabian Oboril","Kay-Ulrich Scholl"],"pdf_url":"https://arxiv.org/pdf/2409.02647v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.02638v1","updated":"2024-09-04T12:06:33Z","published":"2024-09-04T12:06:33Z","title":"MADiff: Motion-Aware Mamba Diffusion Models for Hand Trajectory\n Prediction on Egocentric Videos","summary":" Understanding human intentions and actions through egocentric videos is\nimportant on the path to embodied artificial intelligence. 
As a branch of\negocentric vision techniques, hand trajectory prediction plays a vital role in\ncomprehending human motion patterns, benefiting downstream tasks in extended\nreality and robot manipulation. However, capturing high-level human intentions\nconsistent with reasonable temporal causality is challenging when only\negocentric videos are available. This difficulty is exacerbated under camera\negomotion interference and the absence of affordance labels to explicitly guide\nthe optimization of hand waypoint distribution. In this work, we propose a\nnovel hand trajectory prediction method dubbed MADiff, which forecasts future\nhand waypoints with diffusion models. The devised denoising operation in the\nlatent space is achieved by our proposed motion-aware Mamba, where the camera\nwearer's egomotion is integrated to achieve motion-driven selective scan\n(MDSS). To discern the relationship between hands and scenarios without\nexplicit affordance supervision, we leverage a foundation model that fuses\nvisual and language features to capture high-level semantics from video clips.\nComprehensive experiments conducted on five public datasets with the existing\nand our proposed new evaluation metrics demonstrate that MADiff predicts\ncomparably reasonable hand trajectories compared to the state-of-the-art\nbaselines, and achieves real-time performance. We will release our code and\npretrained models of MADiff at the project page:\nhttps://irmvlab.github.io/madiff.github.io.\n","authors":["Junyi Ma","Xieyuanli Chen","Wentao Bao","Jingyi Xu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00180v4","updated":"2024-09-04T11:56:13Z","published":"2023-03-01T02:14:20Z","title":"MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN\n for Precise Facial Expression Intensity Estimation","summary":" This paper presents MMA-MRNNet, a novel deep learning architecture for\ndynamic multi-output Facial Expression Intensity Estimation (FEIE) from video\ndata. Traditional approaches to this task often rely on complex 3-D CNNs, which\nrequire extensive pre-training and assume that facial expressions are uniformly\ndistributed across all frames of a video. These methods struggle to handle\nvideos of varying lengths, often resorting to ad-hoc strategies that either\ndiscard valuable information or introduce bias. MMA-MRNNet addresses these\nchallenges through a two-stage process. First, the Multiple Models of Affect\n(MMA) extractor component is a Multi-Task Learning CNN that concurrently\nestimates valence-arousal, recognizes basic facial expressions, and detects\naction units in each frame. These representations are then processed by a\nMasked RNN component, which captures temporal dependencies and dynamically\nupdates weights according to the true length of the input video, ensuring that\nonly the most relevant features are used for the final prediction. The proposed\nunimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction\ndataset and demonstrated significantly superior performance, surpassing\nstate-of-the-art methods by a wide margin, regardless of whether they were\nunimodal, multimodal, or ensemble approaches. 
Finally, we demonstrated the\neffectiveness of the MMA component of our proposed method across multiple\nin-the-wild datasets, where it consistently outperformed all state-of-the-art\nmethods across various metrics.\n","authors":["Dimitrios Kollias","Andreas Psaroudakis","Anastasios Arsenos","Paraskevi Theofilou","Chunchang Shao","Guanyu Hu","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2303.00180v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02634v1","updated":"2024-09-04T11:55:14Z","published":"2024-09-04T11:55:14Z","title":"Loopy: Taming Audio-Driven Portrait Avatar with Long-Term Motion\n Dependency","summary":" With the introduction of diffusion-based video generation techniques,\naudio-conditioned human video generation has recently achieved significant\nbreakthroughs in both the naturalness of motion and the synthesis of portrait\ndetails. Due to the limited control of audio signals in driving human motion,\nexisting methods often add auxiliary spatial signals to stabilize movements,\nwhich may compromise the naturalness and freedom of motion. In this paper, we\npropose an end-to-end audio-only conditioned video diffusion model named Loopy.\nSpecifically, we designed an inter- and intra-clip temporal module and an\naudio-to-latents module, enabling the model to leverage long-term motion\ninformation from the data to learn natural motion patterns and improving\naudio-portrait movement correlation. This method removes the need for manually\nspecified spatial motion templates used in existing methods to constrain motion\nduring inference. Extensive experiments show that Loopy outperforms recent\naudio-driven portrait diffusion models, delivering more lifelike and\nhigh-quality results across various scenarios.\n","authors":["Jianwen Jiang","Chao Liang","Jiaqi Yang","Gaojie Lin","Tianyun Zhong","Yanbo Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.02634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02629v1","updated":"2024-09-04T11:47:00Z","published":"2024-09-04T11:47:00Z","title":"AdvSecureNet: A Python Toolkit for Adversarial Machine Learning","summary":" Machine learning models are vulnerable to adversarial attacks. Several tools\nhave been developed to research these vulnerabilities, but they often lack\ncomprehensive features and flexibility. We introduce AdvSecureNet, a PyTorch\nbased toolkit for adversarial machine learning that is the first to natively\nsupport multi-GPU setups for attacks, defenses, and evaluation. It is the first\ntoolkit that supports both CLI and API interfaces and external YAML\nconfiguration files to enhance versatility and reproducibility. The toolkit\nincludes multiple attacks, defenses and evaluation metrics. Rigiorous software\nengineering practices are followed to ensure high code quality and\nmaintainability. The project is available as an open-source project on GitHub\nat https://github.com/melihcatal/advsecurenet and installable via PyPI.\n","authors":["Melih Catal","Manuel Günther"],"pdf_url":"https://arxiv.org/pdf/2409.02629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16582v2","updated":"2024-09-04T11:14:18Z","published":"2024-03-25T09:49:42Z","title":"In the Search for Optimal Multi-view Learning Models for Crop\n Classification with Global Remote Sensing Data","summary":" Studying and analyzing cropland is a difficult task due to its dynamic and\nheterogeneous growth behavior. Usually, diverse data sources can be collected\nfor its estimation. 
Although deep learning models have proven to excel in the\ncrop classification task, they face substantial challenges when dealing with\nmultiple inputs, named Multi-View Learning (MVL). The methods used in the MVL\nscenario can be structured based on the encoder architecture, the fusion\nstrategy, and the optimization technique. The literature has primarily focused\non using specific encoder architectures for local regions, lacking a deeper\nexploration of other components in the MVL methodology. In contrast, we\ninvestigate the simultaneous selection of the fusion strategy and encoder\narchitecture, assessing global-scale cropland and crop-type classifications. We\nuse a range of five fusion strategies (Input, Feature, Decision, Ensemble,\nHybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible\nconfigurations in the MVL method. We use the CropHarvest dataset for\nvalidation, which provides optical, radar, weather time series, and topographic\ninformation as input data. We found that in scenarios with a limited number of\nlabeled samples, a unique configuration is insufficient for all the cases.\nInstead, a specialized combination should be meticulously sought, including an\nencoder and fusion strategy. To streamline this search process, we suggest\nidentifying the optimal encoder architecture tailored for a particular fusion\nstrategy, and then determining the most suitable fusion strategy for the\nclassification task. We provide a methodological framework for researchers\nexploring crop classification through an MVL methodology.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.16582v2.pdf","comment":"submitted to journal"},{"id":"http://arxiv.org/abs/2407.15512v2","updated":"2024-09-04T11:01:47Z","published":"2024-07-22T09:58:29Z","title":"Increasing the Robustness of Model Predictions to Missing Sensors in\n Earth Observation","summary":" Multi-sensor ML models for EO aim to enhance prediction accuracy by\nintegrating data from various sources. However, the presence of missing data\nposes a significant challenge, particularly in non-persistent sensors that can\nbe affected by external factors. Existing literature has explored strategies\nlike temporal dropout and sensor-invariant models to address the generalization\nto missing data issues. Inspired by these works, we study two novel methods\ntailored for multi-sensor scenarios, namely Input Sensor Dropout (ISensD) and\nEnsemble Sensor Invariant (ESensI). Through experimentation on three\nmulti-sensor temporal EO datasets, we demonstrate that these methods\neffectively increase the robustness of model predictions to missing sensors.\nParticularly, we focus on how the predictive performance of models drops when\nsensors are missing at different levels. We observe that ensemble multi-sensor\nmodels are the most robust to the lack of sensors. In addition, the sensor\ndropout component in ISensD shows promising robustness results.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2407.15512v2.pdf","comment":"Accepted at the MACLEAN workshop in the ECML/PKDD 2024"},{"id":"http://arxiv.org/abs/2401.15113v3","updated":"2024-09-04T10:59:10Z","published":"2024-01-25T20:41:17Z","title":"Scalable Glacier Mapping using Deep Learning and Open Earth Observation\n Data Matches the Accuracy of Manual Delineation","summary":" Accurate global glacier mapping is critical for understanding climate change\nimpacts. 
Despite its importance, automated glacier mapping at a global scale\nremains largely unexplored. Here we address this gap and propose\nGlacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep\nlearning model, and five strategies for multitemporal global-scale glacier\nmapping using open satellite imagery. Assessing the spatial, temporal and\ncross-sensor generalisation shows that our best strategy achieves intersection\nover union >0.85 on previously unobserved images in most cases, which drops to\n>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90\nfor regions dominated by clean ice. A comparative validation against human\nexpert uncertainties in terms of area and distance deviations underscores\nGlaViTU performance, approaching or matching expert-level delineation. Adding\nsynthetic aperture radar data, namely, backscatter and interferometric\ncoherence, increases the accuracy in all regions where available. The\ncalibrated confidence for glacier extents is reported making the predictions\nmore reliable and interpretable. We also release a benchmark dataset that\ncovers 9% of glaciers worldwide. Our results support efforts towards automated\nmultitemporal and global glacier mapping.\n","authors":["Konstantin A. Maslov","Claudio Persello","Thomas Schellenberger","Alfred Stein"],"pdf_url":"https://arxiv.org/pdf/2401.15113v3.pdf","comment":"after major revision, expanded validation"},{"id":"http://arxiv.org/abs/2409.02611v1","updated":"2024-09-04T10:56:05Z","published":"2024-09-04T10:56:05Z","title":"GoT-CQA: Graph-of-Thought Guided Compositional Reasoning for Chart\n Question Answering","summary":" Chart Question Answering (CQA) aims at answering questions based on the\nvisual chart content, which plays an important role in chart sumarization,\nbusiness data analysis, and data report generation. CQA is a challenging\nmulti-modal task because of the strong context dependence and complex reasoning\nrequirement. The former refers to answering this question strictly based on the\nanalysis of the visual content or internal data of the given chart, while the\nlatter emphasizes the various logical and numerical reasoning involved in\nanswer prediction process. In this paper, we pay more attention on the complex\nreasoning in CQA task, and propose a novel Graph-of-Thought (GoT) guided\ncompositional reasoning model called GoT-CQA to overcome this problem. At\nfirst, we transform the chart-oriented question into a directed acyclic GoT\ncomposed of multiple operator nodes, including localization, numerical and\nlogical operator. It intuitively reflects the human brain's solution process to\nthis question. After that, we design an efficient auto-compositional reasoning\nframework guided by the GoT, to excute the multi-step reasoning operations in\nvarious types of questions. 
Comprehensive experiments on ChartQA and PlotQA-D\ndatasets show that GoT-CQA achieves outstanding performance, especially in\ncomplex human-written and reasoning questions, comparing with the latest\npopular baselines.\n","authors":["Lingling Zhang","Muye Huang","QianYing Wang","Yaxian Wang","Wenjun Wu","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02608v1","updated":"2024-09-04T10:45:33Z","published":"2024-09-04T10:45:33Z","title":"A Medical Multimodal Large Language Model for Pediatric Pneumonia","summary":" Pediatric pneumonia is the leading cause of death among children under five\nyears worldwide, imposing a substantial burden on affected families. Currently,\nthere are three significant hurdles in diagnosing and treating pediatric\npneumonia. Firstly, pediatric pneumonia shares similar symptoms with other\nrespiratory diseases, making rapid and accurate differential diagnosis\nchallenging. Secondly, primary hospitals often lack sufficient medical\nresources and experienced doctors. Lastly, providing personalized diagnostic\nreports and treatment recommendations is labor-intensive and time-consuming. To\ntackle these challenges, we proposed a Medical Multimodal Large Language Model\nfor Pediatric Pneumonia (P2Med-MLLM). It was capable of handling diverse\nclinical tasks, such as generating free-text radiology reports and medical\nrecords within a unified framework. Specifically, P2Med-MLLM can process both\npure text and image-text data, trained on an extensive and large-scale dataset\n(P2Med-MD), including real clinical information from 163,999 outpatient and\n8,684 inpatient cases. This dataset comprised 2D chest X-ray images, 3D chest\nCT images, corresponding radiology reports, and outpatient and inpatient\nrecords. We designed a three-stage training strategy to enable P2Med-MLLM to\ncomprehend medical knowledge and follow instructions for various clinical\ntasks. To rigorously evaluate P2Med-MLLM's performance, we developed\nP2Med-MBench, a benchmark consisting of 642 meticulously verified samples by\npediatric pulmonology specialists, covering six clinical decision-support tasks\nand a balanced variety of diseases. The automated scoring results demonstrated\nthe superiority of P2Med-MLLM. This work plays a crucial role in assisting\nprimary care doctors with prompt disease diagnosis and treatment planning,\nreducing severe symptom mortality rates, and optimizing the allocation of\nmedical resources.\n","authors":["Weiwei Tian","Xinyu Huang","Tianhao Cheng","Wen He","Jinwu Fang","Rui Feng","Daoying Geng","Xiaobo Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02608v1.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.16766v2","updated":"2024-09-04T10:42:41Z","published":"2024-08-29T17:59:30Z","title":"CSGO: Content-Style Composition in Text-to-Image Generation","summary":" The diffusion model has shown exceptional capabilities in controlled image\ngeneration, which has further fueled interest in image style transfer. Existing\nworks mainly focus on training free-based methods (e.g., image inversion) due\nto the scarcity of specific data. In this study, we present a data construction\npipeline for content-style-stylized image triplets that generates and\nautomatically cleanses stylized data triplets. Based on this pipeline, we\nconstruct a dataset IMAGStyle, the first large-scale style transfer dataset\ncontaining 210k image triplets, available for the community to explore and\nresearch. 
Equipped with IMAGStyle, we propose CSGO, a style transfer model\nbased on end-to-end training, which explicitly decouples content and style\nfeatures employing independent feature injection. The unified CSGO implements\nimage-driven style transfer, text-driven stylized synthesis, and text\nediting-driven stylized synthesis. Extensive experiments demonstrate the\neffectiveness of our approach in enhancing style control capabilities in image\ngeneration. Additional visualization and access to the source code can be\nlocated on the project page: \\url{https://csgo-gen.github.io/}.\n","authors":["Peng Xing","Haofan Wang","Yanpeng Sun","Qixun Wang","Xu Bai","Hao Ai","Renyuan Huang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2408.16766v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01574v3","updated":"2024-09-04T10:42:23Z","published":"2023-09-04T12:53:54Z","title":"Object-Size-Driven Design of Convolutional Neural Networks: Virtual Axle\n Detection based on Raw Data","summary":" As infrastructure ages, the need for efficient monitoring methods becomes\nincreasingly critical. Bridge Weigh-In-Motion (BWIM) systems are crucial for\ncost-efficient load and thus residual service life determination of road and\nrailway infrastructure. However, conventional BWIM systems require additional\nsensors for axle detection, which have to be installed in potentially\ninaccessible locations or in locations that interfere with bridge operation.\nThis study addresses this challenge by replacing dedicated axle detectors with\na novel approach to real-time detection of train axles using sensors\narbitrarily placed on bridges. The proposed Virtual Axle Detector with Enhanced\nReceptive Field (VADER) has been validated on a single-track railway bridge,\ndemonstrating that it achieves to detect 99.9% of axles with a spatial error of\n3.69cm using only acceleration measurements. Using raw data as input\noutperforms the state-of-the-art spectrogram-based method in both speed and\nmemory usage by 99%, making real-time application feasible for the first time.\nAdditionally, we introduce the Maximum Receptive Field (MRF) rule, a novel\napproach to optimise hyperparameters of Convolutional Neural Networks (CNNs)\nbased on the size of objects, which in this case relates to the fundamental\nfrequency of a bridge. The MRF rule effectively narrows the hyperparameter\nsearch space, potentially replacing the need for extensive hyperparameter\ntuning. Since the MRF rule is theoretically applicable to all unstructured\ndata, it could have implications for a wide range of deep learning problems\nfrom earthquake prediction to object recognition.\n","authors":["Henik Riedel","Robert Steven Lorenzen","Clemens Hübler"],"pdf_url":"https://arxiv.org/pdf/2309.01574v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02599v1","updated":"2024-09-04T10:30:11Z","published":"2024-09-04T10:30:11Z","title":"A Fashion Item Recommendation Model in Hyperbolic Space","summary":" In this work, we propose a fashion item recommendation model that\nincorporates hyperbolic geometry into user and item representations. Using\nhyperbolic space, our model aims to capture implicit hierarchies among items\nbased on their visual data and users' purchase history. During training, we\napply a multi-task learning framework that considers both hyperbolic and\nEuclidean distances in the loss function. 
Our experiments on three data sets\nshow that our model performs better than previous models trained in Euclidean\nspace only, confirming the effectiveness of our model. Our ablation studies\nshow that multi-task learning plays a key role, and removing the Euclidean loss\nsubstantially deteriorates the model performance.\n","authors":["Ryotaro Shimizu","Yu Wang","Masanari Kimura","Yuki Hirakawa","Takashi Wada","Yuki Saito","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.02599v1.pdf","comment":"This work was presented at the CVFAD Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2409.02598v1","updated":"2024-09-04T10:29:59Z","published":"2024-09-04T10:29:59Z","title":"SurgTrack: CAD-Free 3D Tracking of Real-world Surgical Instruments","summary":" Vision-based surgical navigation has received increasing attention due to its\nnon-invasive, cost-effective, and flexible advantages. In particular, a\ncritical element of the vision-based navigation system is tracking surgical\ninstruments. Compared with 2D instrument tracking methods, 3D instrument\ntracking has broader value in clinical practice, but is also more challenging\ndue to weak texture, occlusion, and lack of Computer-Aided Design (CAD) models\nfor 3D registration. To solve these challenges, we propose the SurgTrack, a\ntwo-stage 3D instrument tracking method for CAD-free and robust real-world\napplications. In the first registration stage, we incorporate an Instrument\nSigned Distance Field (SDF) modeling the 3D representation of instruments,\nachieving CAD-freed 3D registration. Due to this, we can obtain the location\nand orientation of instruments in the 3D space by matching the video stream\nwith the registered SDF model. In the second tracking stage, we devise a\nposture graph optimization module, leveraging the historical tracking results\nof the posture memory pool to optimize the tracking results and improve the\nocclusion robustness. Furthermore, we collect the Instrument3D dataset to\ncomprehensively evaluate the 3D tracking of surgical instruments. The extensive\nexperiments validate the superiority and scalability of our SurgTrack, by\noutperforming the state-of-the-arts with a remarkable improvement. The code and\ndataset are available at https://github.com/wenwucode/SurgTrack.\n","authors":["Wenwu Guo","Jinlin Wu","Zhen Chen","Qingxiang Zhao","Miao Xu","Zhen Lei","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02584v1","updated":"2024-09-04T10:06:42Z","published":"2024-09-04T10:06:42Z","title":"BMI Prediction from Handwritten English Characters Using a Convolutional\n Neural Network","summary":" A person's Body Mass Index, or BMI, is the most widely used parameter for\nassessing their health. BMI is a crucial predictor of potential diseases that\nmay arise at higher body fat levels because it is correlated with body fat.\nConversely, a community's or an individual's nutritional status can be\ndetermined using the BMI. Although deep learning models are used in several\nstudies to estimate BMI from face photos and other data, no previous research\nestablished a clear connection between deep learning techniques for handwriting\nanalysis and BMI prediction. This article addresses this research gap with a\ndeep learning approach to estimating BMI from handwritten characters by\ndeveloping a convolutional neural network (CNN). 
A dataset containing samples\nfrom 48 people in lowercase English scripts is successfully captured for the\nBMI prediction task. The proposed CNN-based approach reports a commendable\naccuracy of 99.92%. Performance comparison with other popular CNN architectures\nreveals that AlexNet and InceptionV3 achieve the second and third-best\nperformance, with the accuracy of 99.69% and 99.53%, respectively.\n","authors":["N. T. Diba","N. Akter","S. A. H. Chowdhury","J. E. Giti"],"pdf_url":"https://arxiv.org/pdf/2409.02584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02581v1","updated":"2024-09-04T10:03:11Z","published":"2024-09-04T10:03:11Z","title":"Object Gaussian for Monocular 6D Pose Estimation from Sparse Views","summary":" Monocular object pose estimation, as a pivotal task in computer vision and\nrobotics, heavily depends on accurate 2D-3D correspondences, which often demand\ncostly CAD models that may not be readily available. Object 3D reconstruction\nmethods offer an alternative, among which recent advancements in 3D Gaussian\nSplatting (3DGS) afford a compelling potential. Yet its performance still\nsuffers and tends to overfit with fewer input views. Embracing this challenge,\nwe introduce SGPose, a novel framework for sparse view object pose estimation\nusing Gaussian-based methods. Given as few as ten views, SGPose generates a\ngeometric-aware representation by starting with a random cuboid initialization,\neschewing reliance on Structure-from-Motion (SfM) pipeline-derived geometry as\nrequired by traditional 3DGS methods. SGPose removes the dependence on CAD\nmodels by regressing dense 2D-3D correspondences between images and the\nreconstructed model from sparse input and random initialization, while the\ngeometric-consistent depth supervision and online synthetic view warping are\nkey to the success. Experiments on typical benchmarks, especially on the\nOcclusion LM-O dataset, demonstrate that SGPose outperforms existing methods\neven under sparse view constraints, under-scoring its potential in real-world\napplications.\n","authors":["Luqing Luo","Shichu Sun","Jiangang Yang","Linfang Zheng","Jinwei Du","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02574v1","updated":"2024-09-04T09:48:27Z","published":"2024-09-04T09:48:27Z","title":"Solving Video Inverse Problems Using Image Diffusion Models","summary":" Recently, diffusion model-based inverse problem solvers (DIS) have emerged as\nstate-of-the-art approaches for addressing inverse problems, including image\nsuper-resolution, deblurring, inpainting, etc. However, their application to\nvideo inverse problems arising from spatio-temporal degradation remains largely\nunexplored due to the challenges in training video diffusion models. To address\nthis issue, here we introduce an innovative video inverse solver that leverages\nonly image diffusion models. Specifically, by drawing inspiration from the\nsuccess of the recent decomposed diffusion sampler (DDS), our method treats the\ntime dimension of a video as the batch dimension of image diffusion models and\nsolves spatio-temporal optimization problems within denoised spatio-temporal\nbatches derived from each image diffusion model. Moreover, we introduce a\nbatch-consistent diffusion sampling strategy that encourages consistency across\nbatches by synchronizing the stochastic noise components in image diffusion\nmodels. 
Our approach synergistically combines batch-consistent sampling with\nsimultaneous optimization of denoised spatio-temporal batches at each reverse\ndiffusion step, resulting in a novel and efficient diffusion sampling strategy\nfor video inverse problems. Experimental results demonstrate that our method\neffectively addresses various spatio-temporal degradations in video inverse\nproblems, achieving state-of-the-art reconstructions. Project page:\nhttps://solving-video-inverse.github.io/main/\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2409.02574v1.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2312.06726v4","updated":"2024-09-04T09:45:08Z","published":"2023-12-11T05:57:09Z","title":"Filter & Align: Leveraging Human Knowledge to Curate Image-Text Data","summary":" The increasing availability of image-text pairs has largely fueled the rapid\nadvancement in vision-language foundation models. However, the vast scale of\nthese datasets inevitably introduces significant variability in data quality,\nwhich can adversely affect the model performance. This highlights the critical\nrole of data filtering, not only to enhance training efficiency but also to\nimprove overall data quality. Existing methods typically rely on metrics such\nas CLIP Score and BLIP Score, which are derived from pre-trained models.\nHowever, these models are often trained on uncurated, noisy datasets, which can\nperpetuate errors and misalignments in the filtered dataset. We present a novel\nalgorithm that incorporates human knowledge on image-text alignment to guide\nfiltering vast corpus of web-crawled image-text datasets into a compact and\nhigh-quality form. To systemically capture human preferences on image-text\nalignments, we collect a diverse image-text dataset where each image is\nassociated with multiple captions from various sources, and establish a\ncomprehensive set of both subjective and objective criteria for critically\nguiding the alignment assessment from labelers. Additionally, we train a reward\nmodel on these human-preference annotations to internalize the nuanced human\nunderstanding of image-text alignment. The resulting reward model thus can act\nas a human-like referee to filter image-text pairs. Extensive experiments\ndemonstrate that we can maintain, sometimes even improve, model performance\nwhile compressing the image-text datasets up to ~90%. An impressive example is\nthat, by aggressively reducing the total training sample from 130M to only\n15.5M, our BLIP-B/16 models consistently show an average improvement of 2.9% on\nretrieval tasks and 11.5% on captioning tasks compared to full-size-dataset\ncounterparts.\n","authors":["Lei Zhang","Fangxun Shu","Tianyang Liu","Sucheng Ren","Hao Jiang","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.06726v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10904v2","updated":"2024-09-04T09:42:07Z","published":"2024-04-16T20:51:36Z","title":"Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression\n Recognition","summary":" Human communication is multi-modal; e.g., face-to-face interaction involves\nauditory signals (speech) and visual signals (face movements and hand\ngestures). Hence, it is essential to exploit multiple modalities when designing\nmachine learning-based facial expression recognition systems. 
In addition,\ngiven the ever-growing quantities of video data that capture human facial\nexpressions, such systems should utilize raw unlabeled videos without requiring\nexpensive annotations. Therefore, in this work, we employ a multitask\nmulti-modal self-supervised learning method for facial expression recognition\nfrom in-the-wild video data. Our model combines three self-supervised objective\nfunctions: First, a multi-modal contrastive loss, that pulls diverse data\nmodalities of the same video together in the representation space. Second, a\nmulti-modal clustering loss that preserves the semantic structure of input data\nin the representation space. Finally, a multi-modal data reconstruction loss.\nWe conduct a comprehensive study on this multimodal multi-task self-supervised\nlearning method on three facial expression recognition benchmarks. To that end,\nwe examine the performance of learning through different combinations of\nself-supervised tasks on the facial expression recognition downstream task. Our\nmodel ConCluGen outperforms several multi-modal self-supervised and fully\nsupervised baselines on the CMU-MOSEI dataset. Our results generally show that\nmulti-modal self-supervision tasks offer large performance gains for\nchallenging tasks such as facial expression recognition, while also reducing\nthe amount of manual annotations required. We release our pre-trained models as\nwell as source code publicly\n","authors":["Marah Halawa","Florian Blume","Pia Bideau","Martin Maier","Rasha Abdel Rahman","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2404.10904v2.pdf","comment":"The paper will appear in the CVPR 2024 workshops proceedings"},{"id":"http://arxiv.org/abs/2409.02567v1","updated":"2024-09-04T09:35:09Z","published":"2024-09-04T09:35:09Z","title":"Evaluation Study on SAM 2 for Class-agnostic Instance-level Segmentation","summary":" Segment Anything Model (SAM) has demonstrated powerful zero-shot segmentation\nperformance in natural scenes. The recently released Segment Anything Model 2\n(SAM2) has further heightened researchers' expectations towards image\nsegmentation capabilities. To evaluate the performance of SAM2 on\nclass-agnostic instance-level segmentation tasks, we adopt different prompt\nstrategies for SAM2 to cope with instance-level tasks for three relevant\nscenarios: Salient Instance Segmentation (SIS), Camouflaged Instance\nSegmentation (CIS), and Shadow Instance Detection (SID). In addition, to\nfurther explore the effectiveness of SAM2 in segmenting granular object\nstructures, we also conduct detailed tests on the high-resolution Dichotomous\nImage Segmentation (DIS) benchmark to assess the fine-grained segmentation\ncapability. Qualitative and quantitative experimental results indicate that the\nperformance of SAM2 varies significantly across different scenarios. 
Besides,\nSAM2 is not particularly sensitive to segmenting high-resolution fine details.\nWe hope this technique report can drive the emergence of SAM2-based adapters,\naiming to enhance the performance ceiling of large vision models on\nclass-agnostic instance segmentation tasks.\n","authors":["Tiantian Zhang","Zhangjun Zhou","Jialun Pei"],"pdf_url":"https://arxiv.org/pdf/2409.02567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17472v2","updated":"2024-09-04T09:34:13Z","published":"2024-06-25T11:30:31Z","title":"UHD-IQA Benchmark Database: Pushing the Boundaries of Blind Photo\n Quality Assessment","summary":" We introduce a novel Image Quality Assessment (IQA) dataset comprising 6073\nUHD-1 (4K) images, annotated at a fixed width of 3840 pixels. Contrary to\nexisting No-Reference (NR) IQA datasets, ours focuses on highly aesthetic\nphotos of high technical quality, filling a gap in the literature. The images,\ncarefully curated to exclude synthetic content, are sufficiently diverse to\ntrain general NR-IQA models. Importantly, the dataset is annotated with\nperceptual quality ratings obtained through a crowdsourcing study. Ten expert\nraters, comprising photographers and graphics artists, assessed each image at\nleast twice in multiple sessions spanning several days, resulting in 20 highly\nreliable ratings per image. Annotators were rigorously selected based on\nseveral metrics, including self-consistency, to ensure their reliability. The\ndataset includes rich metadata with user and machine-generated tags from over\n5,000 categories and popularity indicators such as favorites, likes, downloads,\nand views. With its unique characteristics, such as its focus on high-quality\nimages, reliable crowdsourced annotations, and high annotation resolution, our\ndataset opens up new opportunities for advancing perceptual image quality\nassessment research and developing practical NR-IQA models that apply to modern\nphotos. Our dataset is available at\nhttps://database.mmsp-kn.de/uhd-iqa-benchmark-database.html\n","authors":["Vlad Hosu","Lorenzo Agnolucci","Oliver Wiedemann","Daisuke Iso","Dietmar Saupe"],"pdf_url":"https://arxiv.org/pdf/2406.17472v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02566v1","updated":"2024-09-04T09:32:40Z","published":"2024-09-04T09:32:40Z","title":"How Do You Perceive My Face? Recognizing Facial Expressions in\n Multi-Modal Context by Modeling Mental Representations","summary":" Facial expression perception in humans inherently relies on prior knowledge\nand contextual cues, contributing to efficient and flexible processing. For\ninstance, multi-modal emotional context (such as voice color, affective text,\nbody pose, etc.) can prompt people to perceive emotional expressions in\nobjectively neutral faces. Drawing inspiration from this, we introduce a novel\napproach for facial expression classification that goes beyond simple\nclassification tasks. Our model accurately classifies a perceived face and\nsynthesizes the corresponding mental representation perceived by a human when\nobserving a face in context. With this, our model offers visual insights into\nits internal decision-making process. We achieve this by learning two\nindependent representations of content and context using a VAE-GAN\narchitecture. Subsequently, we propose a novel attention mechanism for\ncontext-dependent feature adaptation. The adapted representation is used for\nclassification and to generate a context-augmented expression. 
We evaluate\nsynthesized expressions in a human study, showing that our model effectively\nproduces approximations of human mental representations. We achieve\nState-of-the-Art classification accuracies of 81.01% on the RAVDESS dataset and\n79.34% on the MEAD dataset. We make our code publicly available.\n","authors":["Florian Blume","Runfeng Qu","Pia Bideau","Martin Maier","Rasha Abdel Rahman","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2409.02566v1.pdf","comment":"GCPR 2024"},{"id":"http://arxiv.org/abs/2409.02562v1","updated":"2024-09-04T09:29:24Z","published":"2024-09-04T09:29:24Z","title":"Interacting Multiple Model-based Joint Homography Matrix and Multiple\n Object State Estimation","summary":" A novel MOT algorithm, IMM Joint Homography State Estimation (IMM-JHSE), is\nproposed. By jointly modelling the camera projection matrix as part of track\nstate vectors, IMM-JHSE removes the explicit influence of camera motion\ncompensation techniques on predicted track position states, which was prevalent\nin previous approaches. Expanding upon this, static and dynamic camera motion\nmodels are combined through the use of an IMM filter. A simple bounding box\nmotion model is used to predict bounding box positions to incorporate image\nplane information. In addition to applying an IMM to camera motion, a\nnon-standard IMM approach is applied where bounding-box-based BIoU scores are\nmixed with ground-plane-based Mahalanobis distances in an IMM-like fashion to\nperform association only. Finally, IMM-JHSE makes use of dynamic process and\nmeasurement noise estimation techniques. IMM-JHSE improves upon related\ntechniques on the DanceTrack and KITTI-car datasets, increasing HOTA by 2.64\nand 2.11, respectively, while offering competitive performance on the MOT17,\nMOT20 and KITTI-pedestrian datasets.\n","authors":["Paul Johannes Claasen","Johan Pieter de Villiers"],"pdf_url":"https://arxiv.org/pdf/2409.02562v1.pdf","comment":"Preprint submitted to Information Fusion"},{"id":"http://arxiv.org/abs/2303.17249v4","updated":"2024-09-04T09:27:35Z","published":"2023-03-30T09:29:03Z","title":"Model-agnostic explainable artificial intelligence for object detection\n in image data","summary":" In recent years, deep neural networks have been widely used for building\nhigh-performance Artificial Intelligence (AI) systems for computer vision\napplications. Object detection is a fundamental task in computer vision, which\nhas been greatly progressed through developing large and intricate AI models.\nHowever, the lack of transparency is a big challenge that may not allow the\nwidespread adoption of these models. Explainable artificial intelligence is a\nfield of research where methods are developed to help users understand the\nbehavior, decision logics, and vulnerabilities of AI systems. Previously, few\nexplanation methods were developed for object detection based on random\nmasking. However, random masks may raise some issues regarding the actual\nimportance of pixels within an image. In this paper, we design and implement a\nblack-box explanation method named Black-box Object Detection Explanation by\nMasking (BODEM) through adopting a hierarchical random masking approach for\nobject detection systems. We propose a hierarchical random masking framework in\nwhich coarse-grained masks are used in lower levels to find salient regions\nwithin an image, and fine-grained mask are used to refine the salient regions\nin higher levels. 
Experimentations on various object detection datasets and\nmodels showed that BODEM can effectively explain the behavior of object\ndetectors. Moreover, our method outperformed Detector Randomized Input Sampling\nfor Explanation (D-RISE) and Local Interpretable Model-agnostic Explanations\n(LIME) with respect to different quantitative measures of explanation\neffectiveness. The experimental results demonstrate that BODEM can be an\neffective method for explaining and validating object detection systems in\nblack-box testing scenarios.\n","authors":["Milad Moradi","Ke Yan","David Colwell","Matthias Samwald","Rhona Asgari"],"pdf_url":"https://arxiv.org/pdf/2303.17249v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02555v1","updated":"2024-09-04T09:21:13Z","published":"2024-09-04T09:21:13Z","title":"Low-Resolution Object Recognition with Cross-Resolution Relational\n Contrastive Distillation","summary":" Recognizing objects in low-resolution images is a challenging task due to the\nlack of informative details. Recent studies have shown that knowledge\ndistillation approaches can effectively transfer knowledge from a\nhigh-resolution teacher model to a low-resolution student model by aligning\ncross-resolution representations. However, these approaches still face\nlimitations in adapting to the situation where the recognized objects exhibit\nsignificant representation discrepancies between training and testing images.\nIn this study, we propose a cross-resolution relational contrastive\ndistillation approach to facilitate low-resolution object recognition. Our\napproach enables the student model to mimic the behavior of a well-trained\nteacher model which delivers high accuracy in identifying high-resolution\nobjects. To extract sufficient knowledge, the student learning is supervised\nwith contrastive relational distillation loss, which preserves the similarities\nin various relational structures in contrastive representation space. In this\nmanner, the capability of recovering missing details of familiar low-resolution\nobjects can be effectively enhanced, leading to a better knowledge transfer.\nExtensive experiments on low-resolution object classification and\nlow-resolution face recognition clearly demonstrate the effectiveness and\nadaptability of our approach.\n","authors":["Kangkai Zhang","Shiming Ge","Ruixin Shi","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02555v1.pdf","comment":"This paper is accepted by IEEE Transactions on Circuits and Systems\n for Video Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2409.02546v1","updated":"2024-09-04T09:03:47Z","published":"2024-09-04T09:03:47Z","title":"Real-Time Dynamic Scale-Aware Fusion Detection Network: Take Road Damage\n Detection as an example","summary":" Unmanned Aerial Vehicle (UAV)-based Road Damage Detection (RDD) is important\nfor daily maintenance and safety in cities, especially in terms of\nsignificantly reducing labor costs. However, current UAV-based RDD research is\nstill faces many challenges. For example, the damage with irregular size and\ndirection, the masking of damage by the background, and the difficulty of\ndistinguishing damage from the background significantly affect the ability of\nUAV to detect road damage in daily inspection. 
To solve these problems and\nimprove the performance of UAV in real-time road damage detection, we design\nand propose three corresponding modules: a feature extraction module that\nflexibly adapts to shape and background; a module that fuses multiscale\nperception and adapts to shape and background ; an efficient downsampling\nmodule. Based on these modules, we designed a multi-scale, adaptive road damage\ndetection model with the ability to automatically remove background\ninterference, called Dynamic Scale-Aware Fusion Detection Model (RT-DSAFDet).\nExperimental results on the UAV-PDD2023 public dataset show that our model\nRT-DSAFDet achieves a mAP50 of 54.2%, which is 11.1% higher than that of\nYOLOv10-m, an efficient variant of the latest real-time object detection model\nYOLOv10, while the amount of parameters is reduced to 1.8M and FLOPs to 4.6G,\nwith a decreased by 88% and 93%, respectively. Furthermore, on the large\ngeneralized object detection public dataset MS COCO2017 also shows the\nsuperiority of our model with mAP50-95 is the same as YOLOv9-t, but with 0.5%\nhigher mAP50, 10% less parameters volume, and 40% less FLOPs.\n","authors":["Weichao Pan","Xu Wang","Wenqing Huan"],"pdf_url":"https://arxiv.org/pdf/2409.02546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13085v2","updated":"2024-09-04T09:02:33Z","published":"2024-08-23T14:12:03Z","title":"Map-Free Visual Relocalization Enhanced by Instance Knowledge and Depth\n Knowledge","summary":" Map-free relocalization technology is crucial for applications in autonomous\nnavigation and augmented reality, but relying on pre-built maps is often\nimpractical. It faces significant challenges due to limitations in matching\nmethods and the inherent lack of scale in monocular images. These issues lead\nto substantial rotational and metric errors and even localization failures in\nreal-world scenarios. Large matching errors significantly impact the overall\nrelocalization process, affecting both rotational and translational accuracy.\nDue to the inherent limitations of the camera itself, recovering the metric\nscale from a single image is crucial, as this significantly impacts the\ntranslation error. To address these challenges, we propose a map-free\nrelocalization method enhanced by instance knowledge and depth knowledge. By\nleveraging instance-based matching information to improve global matching\nresults, our method significantly reduces the possibility of mismatching across\ndifferent objects. The robustness of instance knowledge across the scene helps\nthe feature point matching model focus on relevant regions and enhance matching\naccuracy. Additionally, we use estimated metric depth from a single image to\nreduce metric errors and improve scale recovery accuracy. By integrating\nmethods dedicated to mitigating large translational and rotational errors, our\napproach demonstrates superior performance in map-free relocalization\ntechniques.\n","authors":["Mingyu Xiao","Runze Chen","Haiyong Luo","Fang Zhao","Juan Wang","Xuepeng Ma"],"pdf_url":"https://arxiv.org/pdf/2408.13085v2.pdf","comment":"17 pages,6 figures"},{"id":"http://arxiv.org/abs/2409.02545v1","updated":"2024-09-04T09:02:01Z","published":"2024-09-04T09:02:01Z","title":"UniTT-Stereo: Unified Training of Transformer for Enhanced Stereo\n Matching","summary":" Unlike other vision tasks where Transformer-based approaches are becoming\nincreasingly common, stereo depth estimation is still dominated by\nconvolution-based approaches. 
This is mainly due to the limited availability of\nreal-world ground truth for stereo matching, which is a limiting factor in\nimproving the performance of Transformer-based stereo approaches. In this\npaper, we propose UniTT-Stereo, a method to maximize the potential of\nTransformer-based stereo architectures by unifying self-supervised learning\nused for pre-training with stereo matching framework based on supervised\nlearning. To be specific, we explore the effectiveness of reconstructing\nfeatures of masked portions in an input image and at the same time predicting\ncorresponding points in another image from the perspective of locality\ninductive bias, which is crucial in training models with limited training data.\nMoreover, to address these challenging tasks of reconstruction-and-prediction,\nwe present a new strategy to vary a masking ratio when training the stereo\nmodel with stereo-tailored losses. State-of-the-art performance of UniTT-Stereo\nis validated on various benchmarks such as ETH3D, KITTI 2012, and KITTI 2015\ndatasets. Lastly, to investigate the advantages of the proposed approach, we\nprovide a frequency analysis of feature maps and the analysis of locality\ninductive bias based on attention maps.\n","authors":["Soomin Kim","Hyesong Choi","Jihye Ahn","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02543v1","updated":"2024-09-04T09:01:21Z","published":"2024-09-04T09:01:21Z","title":"StyleTokenizer: Defining Image Style by a Single Instance for\n Controlling Diffusion Models","summary":" Despite the burst of innovative methods for controlling the diffusion\nprocess, effectively controlling image styles in text-to-image generation\nremains a challenging task. Many adapter-based methods impose image\nrepresentation conditions on the denoising process to accomplish image control.\nHowever these conditions are not aligned with the word embedding space, leading\nto interference between image and text control conditions and the potential\nloss of semantic information from the text prompt. Addressing this issue\ninvolves two key challenges. Firstly, how to inject the style representation\nwithout compromising the effectiveness of text representation in control.\nSecondly, how to obtain the accurate style representation from a single\nreference image. To tackle these challenges, we introduce StyleTokenizer, a\nzero-shot style control image generation method that aligns style\nrepresentation with text representation using a style tokenizer. This alignment\neffectively minimizes the impact on the effectiveness of text prompts.\nFurthermore, we collect a well-labeled style dataset named Style30k to train a\nstyle feature extractor capable of accurately representing style while\nexcluding other content information. Experimental results demonstrate that our\nmethod fully grasps the style characteristics of the reference image,\ngenerating appealing images that are consistent with both the target image\nstyle and text prompt. 
The code and dataset are available at\nhttps://github.com/alipay/style-tokenizer.\n","authors":["Wen Li","Muyuan Fang","Cheng Zou","Biao Gong","Ruobing Zheng","Meng Wang","Jingdong Chen","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02543v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2409.02529v1","updated":"2024-09-04T08:42:42Z","published":"2024-09-04T08:42:42Z","title":"Sample what you cant compress","summary":" For learned image representations, basic autoencoders often produce blurry\nresults. Reconstruction quality can be improved by incorporating additional\npenalties such as adversarial (GAN) and perceptual losses. Arguably, these\napproaches lack a principled interpretation. Concurrently, in generative\nsettings diffusion has demonstrated a remarkable ability to create crisp, high\nquality results and has solid theoretical underpinnings (from variational\ninference to direct study as the Fisher Divergence). Our work combines\nautoencoder representation learning with diffusion and is, to our knowledge,\nthe first to demonstrate the efficacy of jointly learning a continuous encoder\nand decoder under a diffusion-based loss. We demonstrate that this approach\nyields better reconstruction quality as compared to GAN-based autoencoders\nwhile being easier to tune. We also show that the resulting representation is\neasier to model with a latent diffusion model as compared to the representation\nobtained from a state-of-the-art GAN-based loss. Since our decoder is\nstochastic, it can generate details not encoded in the otherwise deterministic\nlatent representation; we therefore name our approach \"Sample what you can't\ncompress\", or SWYCC for short.\n","authors":["Vighnesh Birodkar","Gabriel Barcik","James Lyon","Sergey Ioffe","David Minnen","Joshua V. Dillon"],"pdf_url":"https://arxiv.org/pdf/2409.02529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11965v2","updated":"2024-09-04T08:35:32Z","published":"2024-08-21T19:36:27Z","title":"CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT\n Volumes","summary":" The rapid increase of computed tomography (CT) scans and their time-consuming\nmanual analysis have created an urgent need for robust automated analysis\ntechniques in clinical settings. These aim to assist radiologists and help them\nmanaging their growing workload. Existing methods typically generate entire\nreports directly from 3D CT images, without explicitly focusing on observed\nabnormalities. This unguided approach often results in repetitive content or\nincomplete reports, failing to prioritize anomaly-specific descriptions. We\npropose a new anomaly-guided report generation model, which first predicts\nabnormalities and then generates targeted descriptions for each. Evaluation on\na public dataset demonstrates significant improvements in report quality and\nclinical relevance. We extend our work by conducting an ablation study to\ndemonstrate its effectiveness.\n","authors":["Theo Di Piazza"],"pdf_url":"https://arxiv.org/pdf/2408.11965v2.pdf","comment":"15 pages, 9 figures, submitted to ISBI 2025"},{"id":"http://arxiv.org/abs/2409.02513v1","updated":"2024-09-04T08:24:53Z","published":"2024-09-04T08:24:53Z","title":"SG-MIM: Structured Knowledge Guided Efficient Pre-training for Dense\n Prediction","summary":" Masked Image Modeling (MIM) techniques have redefined the landscape of\ncomputer vision, enabling pre-trained models to achieve exceptional performance\nacross a broad spectrum of tasks. 
Despite their success, the full potential of\nMIM-based methods in dense prediction tasks, particularly in depth estimation,\nremains untapped. Existing MIM approaches primarily rely on single-image\ninputs, which makes it challenging to capture the crucial structured\ninformation, leading to suboptimal performance in tasks requiring fine-grained\nfeature representation. To address these limitations, we propose SG-MIM, a\nnovel Structured knowledge Guided Masked Image Modeling framework designed to\nenhance dense prediction tasks by utilizing structured knowledge alongside\nimages. SG-MIM employs a lightweight relational guidance framework, allowing it\nto guide structured knowledge individually at the feature level rather than\nnaively combining at the pixel level within the same architecture, as is common\nin traditional multi-modal pre-training methods. This approach enables the\nmodel to efficiently capture essential information while minimizing\ndiscrepancies between pre-training and downstream tasks. Furthermore, SG-MIM\nemploys a selective masking strategy to incorporate structured knowledge,\nmaximizing the synergy between general representation learning and structured\nknowledge-specific learning. Our method requires no additional annotations,\nmaking it a versatile and efficient solution for a wide range of applications.\nOur evaluations on the KITTI, NYU-v2, and ADE20k datasets demonstrate SG-MIM's\nsuperiority in monocular depth estimation and semantic segmentation.\n","authors":["Sumin Son","Hyesong Choi","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03651v2","updated":"2024-09-04T08:23:00Z","published":"2024-08-07T09:30:51Z","title":"Path-SAM2: Transfer SAM2 for digital pathology semantic segmentation","summary":" The semantic segmentation task in pathology plays an indispensable role in\nassisting physicians in determining the condition of tissue lesions. With the\nproposal of Segment Anything Model (SAM), more and more foundation models have\nseen rapid development in the field of image segmentation. Recently, SAM2 has\ngarnered widespread attention in both natural image and medical image\nsegmentation. Compared to SAM, it has significantly improved in terms of\nsegmentation accuracy and generalization performance. We compared the\nfoundational models based on SAM and found that their performance in semantic\nsegmentation of pathological images was hardly satisfactory. In this paper, we\npropose Path-SAM2, which for the first time adapts the SAM2 model to cater to\nthe task of pathological semantic segmentation. We integrate the largest\npretrained vision encoder for histopathology (UNI) with the original SAM2\nencoder, adding more pathology-based prior knowledge. Additionally, we\nintroduce a learnable Kolmogorov-Arnold Networks (KAN) classification module to\nreplace the manual prompt process. In three adenoma pathological datasets,\nPath-SAM2 has achieved state-of-the-art performance.This study demonstrates the\ngreat potential of adapting SAM2 to pathology image segmentation tasks. 
We plan\nto release the code and model weights for this paper at:\nhttps://github.com/simzhangbest/SAM2PATH\n","authors":["Mingya Zhang","Liang Wang","Zhihao Chen","Yiyuan Ge","Xianping Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03651v2.pdf","comment":"5 pages , 5 figures"},{"id":"http://arxiv.org/abs/2409.02508v1","updated":"2024-09-04T08:08:21Z","published":"2024-09-04T08:08:21Z","title":"TLD: A Vehicle Tail Light signal Dataset and Benchmark","summary":" Understanding other drivers' intentions is crucial for safe driving. The role\nof taillights in conveying these intentions is underemphasized in current\nautonomous driving systems. Accurately identifying taillight signals is\nessential for predicting vehicle behavior and preventing collisions.\nOpen-source taillight datasets are scarce, often small and inconsistently\nannotated. To address this gap, we introduce a new large-scale taillight\ndataset called TLD. Sourced globally, our dataset covers diverse traffic\nscenarios. To our knowledge, TLD is the first dataset to separately annotate\nbrake lights and turn signals in real driving scenarios. We collected 17.78\nhours of driving videos from the internet. This dataset consists of 152k\nlabeled image frames sampled at a rate of 2 Hz, along with 1.5 million\nunlabeled frames interspersed throughout. Additionally, we have developed a\ntwo-stage vehicle light detection model consisting of two primary modules: a\nvehicle detector and a taillight classifier. Initially, YOLOv10 and DeepSORT\ncaptured consecutive vehicle images over time. Subsequently, the two\nclassifiers work simultaneously to determine the states of the brake lights and\nturn signals. A post-processing procedure is then used to eliminate noise\ncaused by misidentifications and provide the taillight states of the vehicle\nwithin a given time frame. Our method shows exceptional performance on our\ndataset, establishing a benchmark for vehicle taillight detection. The dataset\nis available at https://huggingface.co/datasets/ChaiJohn/TLD/tree/main\n","authors":["Jinhao Chai","Shiyi Mu","Shugong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.02508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02497v1","updated":"2024-09-04T07:46:42Z","published":"2024-09-04T07:46:42Z","title":"A Learnable Color Correction Matrix for RAW Reconstruction","summary":" Autonomous driving algorithms usually employ sRGB images as model input due\nto their compatibility with the human visual system. However, visually pleasing\nsRGB images are possibly sub-optimal for downstream tasks when compared to RAW\nimages. The availability of RAW images is constrained by the difficulties in\ncollecting real-world driving data and the associated challenges of annotation.\nTo address this limitation and support research in RAW-domain driving\nperception, we design a novel and ultra-lightweight RAW reconstruction method.\nThe proposed model introduces a learnable color correction matrix (CCM), which\nuses only a single convolutional layer to approximate the complex inverse image\nsignal processor (ISP). 
Experimental results demonstrate that simulated RAW\n(simRAW) images generated by our method provide performance improvements\nequivalent to those produced by more complex inverse ISP methods when\npretraining RAW-domain object detectors, which highlights the effectiveness and\npracticality of our approach.\n","authors":["Anqi Liu","Shiyi Mu","Shugong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.02497v1.pdf","comment":"Accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2409.02494v1","updated":"2024-09-04T07:45:06Z","published":"2024-09-04T07:45:06Z","title":"Plane2Depth: Hierarchical Adaptive Plane Guidance for Monocular Depth\n Estimation","summary":" Monocular depth estimation aims to infer a dense depth map from a single\nimage, which is a fundamental and prevalent task in computer vision. Many\nprevious works have shown impressive depth estimation results through carefully\ndesigned network structures, but they usually ignore the planar information and\ntherefore perform poorly in low-texture areas of indoor scenes. In this paper,\nwe propose Plane2Depth, which adaptively utilizes plane information to improve\ndepth prediction within a hierarchical framework. Specifically, in the proposed\nplane guided depth generator (PGDG), we design a set of plane queries as\nprototypes to softly model planes in the scene and predict per-pixel plane\ncoefficients. Then the predicted plane coefficients can be converted into\nmetric depth values with the pinhole camera model. In the proposed adaptive\nplane query aggregation (APGA) module, we introduce a novel feature interaction\napproach to improve the aggregation of multi-scale plane features in a top-down\nmanner. Extensive experiments show that our method can achieve outstanding\nperformance, especially in low-texture or repetitive areas. Furthermore, under\nthe same backbone network, our method outperforms the state-of-the-art methods\non the NYU-Depth-v2 dataset, achieves competitive results with state-of-the-art\nmethods KITTI dataset and can be generalized to unseen scenes effectively.\n","authors":["Li Liu","Ruijie Zhu","Jiacheng Deng","Ziyang Song","Wenfei Yang","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02494v1.pdf","comment":"14 pages, 12 figures, 8 tables"},{"id":"http://arxiv.org/abs/2207.13137v2","updated":"2024-09-04T07:41:41Z","published":"2022-07-19T03:58:00Z","title":"Bayesian Evidential Learning for Few-Shot Classification","summary":" Few-Shot Classification(FSC) aims to generalize from base classes to novel\nclasses given very limited labeled samples, which is an important step on the\npath toward human-like machine learning. State-of-the-art solutions involve\nlearning to find a good metric and representation space to compute the distance\nbetween samples. Despite the promising accuracy performance, how to model\nuncertainty for metric-based FSC methods effectively is still a challenge. To\nmodel uncertainty, We place a distribution over class probability based on the\ntheory of evidence. As a result, uncertainty modeling and metric learning can\nbe decoupled. To reduce the uncertainty of classification, we propose a\nBayesian evidence fusion theorem. Given observed samples, the network learns to\nget posterior distribution parameters given the prior parameters produced by\nthe pre-trained network. Detailed gradient analysis shows that our method\nprovides a smooth optimization target and can capture the uncertainty. 
The\nproposed method is agnostic to metric learning strategies and can be\nimplemented as a plug-and-play module. We integrate our method into several\nof the newest FSC methods and demonstrate improved accuracy and uncertainty\nquantification on standard FSC benchmarks.\n","authors":["Xiongkun Linghu","Yan Bai","Yihang Lou","Shengsen Wu","Jinze Li","Jianzhong He","Tao Bai"],"pdf_url":"https://arxiv.org/pdf/2207.13137v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.02492v1","updated":"2024-09-04T07:35:12Z","published":"2024-09-04T07:35:12Z","title":"Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of\n Data-Driven Optimization Routine","summary":" Diffusion tensor imaging (DTI) holds significant importance in clinical\ndiagnosis and neuroscience research. However, conventional model-based fitting\nmethods often suffer from sensitivity to noise, leading to decreased accuracy\nin estimating DTI parameters. While traditional data-driven deep learning\nmethods have shown potential in terms of accuracy and efficiency, their limited\ngeneralization to out-of-training-distribution data impedes their broader\napplication due to the diverse scan protocols used across centers, scanners,\nand studies. This work aims to tackle these challenges and promote the use of\nDTI by introducing a data-driven optimization-based method termed DoDTI. DoDTI\ncombines the weighted linear least squares fitting algorithm and the regularization\nby denoising technique. The former fits DW images from diverse acquisition\nsettings into a diffusion tensor field, while the latter applies a deep\nlearning-based denoiser to regularize the diffusion tensor field instead of the\nDW images, which is free from the limitation of fixed-channel assignment of the\nnetwork. The optimization objective is solved using the alternating direction\nmethod of multipliers and then unrolled to construct a deep neural network,\nleveraging a data-driven strategy to learn network parameters. Extensive\nvalidation experiments are conducted utilizing both internally simulated\ndatasets and externally obtained in-vivo datasets. The results, encompassing\nboth qualitative and quantitative analyses, showcase that the proposed method\nattains state-of-the-art performance in DTI parameter estimation. Notably, it\ndemonstrates superior generalization, accuracy, and efficiency, rendering it\nhighly reliable for widespread application in the field.\n","authors":["Jialong Li","Zhicheng Zhang","Yunwei Chen","Qiqi Lu","Ye Wu","Xiaoming Liu","QianJin Feng","Yanqiu Feng","Xinyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02490v1","updated":"2024-09-04T07:33:09Z","published":"2024-09-04T07:33:09Z","title":"TP-GMOT: Tracking Generic Multiple Object by Textual Prompt with\n Motion-Appearance Cost (MAC) SORT","summary":" While Multi-Object Tracking (MOT) has made substantial advancements, it is\nlimited by heavy reliance on prior knowledge and limited to predefined\ncategories. In contrast, Generic Multiple Object Tracking (GMOT), tracking\nmultiple objects with similar appearance, requires less prior information about\nthe targets but faces challenges with variations in viewpoint, lighting,\nocclusion, and resolution. Our contributions commence with the introduction of\nthe \textbf{\text{Refer-GMOT dataset}}, a collection of videos, each accompanied\nby fine-grained textual descriptions of their attributes. 
Subsequently, we\nintroduce a novel text prompt-based open-vocabulary GMOT framework, called\n\textbf{\text{TP-GMOT}}, which can track never-seen object categories with zero\ntraining examples. Within the \text{TP-GMOT} framework, we introduce two novel\ncomponents: (i) \textbf{\text{TP-OD}}, an object detection module driven by a textual\nprompt, for accurately detecting unseen objects with specific characteristics;\n(ii) Motion-Appearance Cost SORT (\textbf{\text{MAC-SORT}}), a novel object\nassociation approach that adeptly integrates motion and appearance-based\nmatching strategies to tackle the complex task of tracking multiple generic\nobjects with high similarity. Our contributions are benchmarked on the\n\text{Refer-GMOT} dataset for the GMOT task. Additionally, to assess the\ngeneralizability of the proposed \text{TP-GMOT} framework and the effectiveness\nof the \text{MAC-SORT} tracker, we conduct ablation studies on the DanceTrack and\nMOT20 datasets for the MOT task. Our dataset, code, and models will be publicly\navailable at: https://fsoft-aic.github.io/TP-GMOT\n","authors":["Duy Le Dinh Anh","Kim Hoang Tran","Ngan Hoang Le"],"pdf_url":"https://arxiv.org/pdf/2409.02490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02486v1","updated":"2024-09-04T07:25:50Z","published":"2024-09-04T07:25:50Z","title":"Boosting Generalizability towards Zero-Shot Cross-Dataset Single-Image\n Indoor Depth by Meta-Initialization","summary":" Indoor robots rely on depth to perform tasks like navigation or obstacle\ndetection, and single-image depth estimation is widely used to assist\nperception. Most indoor single-image depth prediction focuses less on model\ngeneralizability to unseen datasets, which concerns in-the-wild robustness for\nsystem deployment. This work leverages gradient-based meta-learning to gain\nhigher generalizability on zero-shot cross-dataset inference. Unlike the\nmost-studied meta-learning of image classification associated with explicit\nclass labels, no explicit task boundaries exist for continuous depth values\ntied to highly varying indoor environments regarding object arrangement and\nscene composition. We propose a fine-grained task definition that treats each RGB-D\nmini-batch as a task in our meta-learning formulation. We first show that, on limited\ndata, our method induces a much better prior (up to 27.8% in RMSE). Then,\nfine-tuning on the meta-learned initialization consistently outperforms baselines\nwithout the meta approach. Aiming at generalization, we propose zero-shot\ncross-dataset protocols and validate the higher generalizability induced by our\nmeta-initialization, which serves as a simple and useful plugin to many existing depth\nestimation methods. This work at the intersection of depth estimation and meta-learning\npotentially drives both areas of research a step closer to practical robotic and\nmachine perception usage.\n","authors":["Cho-Ying Wu","Yiqi Zhong","Junying Wang","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2409.02486v1.pdf","comment":"IROS 2024. The version supersedes 2305.07269. arXiv admin note: text\n overlap with arXiv:2305.07269"},{"id":"http://arxiv.org/abs/2409.02483v1","updated":"2024-09-04T07:20:01Z","published":"2024-09-04T07:20:01Z","title":"TASAR: Transferable Attack on Skeletal Action Recognition","summary":" Skeletal sequences, as well-structured representations of human behaviors,\nare crucial in Human Activity Recognition (HAR). 
The transferability of\nadversarial skeletal sequences enables attacks in real-world HAR scenarios,\nsuch as autonomous driving, intelligent surveillance, and human-computer\ninteractions. However, existing Skeleton-based HAR (S-HAR) attacks exhibit weak\nadversarial transferability and, therefore, cannot be considered true\ntransfer-based S-HAR attacks. More importantly, the reason for this failure\nremains unclear. In this paper, we study this phenomenon through the lens of\nloss surface, and find that its sharpness contributes to the poor\ntransferability in S-HAR. Inspired by this observation, we assume and\nempirically validate that smoothening the rugged loss landscape could\npotentially improve adversarial transferability in S-HAR. To this end, we\npropose the first Transfer-based Attack on Skeletal Action Recognition, TASAR.\nTASAR explores the smoothed model posterior without re-training the pre-trained\nsurrogates, which is achieved by a new post-train Dual Bayesian optimization\nstrategy. Furthermore, unlike previous transfer-based attacks that treat each\nframe independently and overlook temporal coherence within sequences, TASAR\nincorporates motion dynamics into the Bayesian attack gradient, effectively\ndisrupting the spatial-temporal coherence of S-HARs. To exhaustively evaluate\nthe effectiveness of existing methods and our method, we build the first\nlarge-scale robust S-HAR benchmark, comprising 7 S-HAR models, 10 attack\nmethods, 3 S-HAR datasets and 2 defense models. Extensive results demonstrate\nthe superiority of TASAR. Our benchmark enables easy comparisons for future\nstudies, with the code available in the supplementary material.\n","authors":["Yunfeng Diao","Baiqi Wu","Ruixuan Zhang","Ajian Liu","Xingxing Wei","Meng Wang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02483v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.08572"},{"id":"http://arxiv.org/abs/2409.02482v1","updated":"2024-09-04T07:18:26Z","published":"2024-09-04T07:18:26Z","title":"Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes","summary":" High-quality real-time view synthesis methods are based on volume rendering,\nsplatting, or surface rendering. While surface-based methods generally are the\nfastest, they cannot faithfully model fuzzy geometry like hair. In turn,\nalpha-blending techniques excel at representing fuzzy materials but require an\nunbounded number of samples per ray (P1). Further overheads are induced by\nempty space skipping in volume rendering (P2) and sorting input primitives in\nsplatting (P3). These problems are exacerbated on low-performance graphics\nhardware, e.g. on mobile devices. We present a novel representation for\nreal-time view synthesis where the (P1) number of sampling locations is small\nand bounded, (P2) sampling locations are efficiently found via rasterization,\nand (P3) rendering is sorting-free. We achieve this by representing objects as\nsemi-transparent multi-layer meshes, rendered in fixed layer order from\noutermost to innermost. We model mesh layers as SDF shells with optimal spacing\nlearned during training. After baking, we fit UV textures to the corresponding\nmeshes. 
We show that our method can represent challenging fuzzy objects while\nachieving higher frame rates than volume-based and splatting-based methods on\nlow-end and mobile devices.\n","authors":["Stefano Esposito","Anpei Chen","Christian Reiser","Samuel Rota Bulò","Lorenzo Porzi","Katja Schwarz","Christian Richardt","Michael Zollhöfer","Peter Kontschieder","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2409.02482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11936v2","updated":"2024-09-04T07:17:15Z","published":"2024-03-18T16:34:38Z","title":"AI-Assisted Cervical Cancer Screening","summary":" Visual Inspection with Acetic Acid (VIA) remains the most feasible cervical\ncancer screening test in resource-constrained settings of low- and\nmiddle-income countries (LMICs), and it is often performed in screening camps or\nprimary/community health centers by nurses instead of the preferred but\nunavailable expert gynecologist. To address the highly subjective nature of the\ntest, various handheld devices integrating cameras or smartphones have been\nrecently explored to capture cervical images during VIA and aid decision-making\nvia telemedicine or AI models. Most studies proposing AI models retrospectively\nuse a relatively small number of already collected images from specific\ndevices, digital cameras, or smartphones; the challenges and protocol for\nquality image acquisition during VIA in resource-constrained camp settings,\nchallenges in obtaining gold-standard labels, data imbalance, etc., are often overlooked.\nWe present a novel approach and describe the end-to-end design process to build\na robust smartphone-based AI-assisted system that does not require buying a\nseparate integrated device: the proposed protocol for quality image acquisition\nin resource-constrained settings, a dataset collected from 1,430 women during VIA\nperformed by nurses in screening camps, a preprocessing pipeline, and the training\nand evaluation of a deep-learning-based classification model aimed at identifying\n(pre)cancerous lesions. Our work shows that readily available smartphones\nand a suitable protocol can capture cervix images with the required details\nfor the VIA test; that the deep-learning-based classification model provides\npromising results to assist nurses in VIA screening; and that it provides a direction\nfor large-scale data collection and validation in resource-constrained\nsettings.\n","authors":["Kanchan Poudel","Lisasha Poudel","Prabin Raj Shakya","Atit Poudel","Archana Shrestha","Bishesh Khanal"],"pdf_url":"https://arxiv.org/pdf/2403.11936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13393v3","updated":"2024-09-04T06:32:00Z","published":"2024-06-19T09:36:18Z","title":"Style-NeRF2NeRF: 3D Style Transfer From Style-Aligned Multi-View Images","summary":" We propose a simple yet effective pipeline for stylizing a 3D scene,\nharnessing the power of 2D image diffusion models. Given a NeRF model\nreconstructed from a set of multi-view images, we perform 3D style transfer by\nrefining the source NeRF model using stylized images generated by a\nstyle-aligned image-to-image diffusion model. Given a target style prompt, we\nfirst generate perceptually similar multi-view images by leveraging a\ndepth-conditioned diffusion model with an attention-sharing mechanism. Next,\nbased on the stylized multi-view images, we propose to guide the style transfer\nprocess with the sliced Wasserstein loss based on the feature maps extracted\nfrom a pre-trained CNN model. 
Our pipeline consists of decoupled steps,\nallowing users to test various prompt ideas and preview the stylized 3D result\nbefore proceeding to the NeRF fine-tuning stage. We demonstrate that our method\ncan transfer diverse artistic styles to real-world 3D scenes with competitive\nquality. Result videos are also available on our project page:\nhttps://haruolabs.github.io/style-n2n/\n","authors":["Haruo Fujiwara","Yusuke Mukuta","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2406.13393v3.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.09777v3","updated":"2024-09-04T06:09:15Z","published":"2024-05-16T02:46:19Z","title":"Rethinking Barely-Supervised Volumetric Medical Image Segmentation from\n an Unsupervised Domain Adaptation Perspective","summary":" This paper investigates an extremely challenging problem: barely-supervised\nvolumetric medical image segmentation (BSS). A BSS training dataset consists of\ntwo parts: 1) a barely-annotated labeled set, where each labeled image contains\nonly a single-slice annotation, and 2) an unlabeled set comprising numerous\nunlabeled volumetric images. State-of-the-art BSS methods employ a\nregistration-based paradigm, which uses inter-slice image registration to\npropagate single-slice annotations into volumetric pseudo labels, constructing\na completely annotated labeled set, to which a semi-supervised segmentation\nscheme can be applied. However, the paradigm has a critical limitation: the\npseudo-labels generated by image registration are unreliable and noisy.\nMotivated by this, we propose a new perspective: instead of solving BSS within\na semi-supervised learning scheme, this work formulates BSS as an unsupervised\ndomain adaptation problem. To this end, we propose a novel BSS framework,\n\\textbf{B}arely-supervised learning \\textbf{via} unsupervised domain\n\\textbf{A}daptation (BvA), as an alternative to the dominant registration\nparadigm. Specifically, we first design a novel noise-free labeled data\nconstruction algorithm (NFC) for slice-to-volume labeled data synthesis. Then,\nwe introduce a frequency and spatial Mix-Up strategy (FSX) to mitigate the\ndomain shifts. Extensive experiments demonstrate that our method provides a\npromising alternative for BSS. Remarkably, the proposed method, trained on the\nleft atrial segmentation dataset with \\textbf{only one} barely-labeled image,\nachieves a Dice score of 81.20%, outperforming the state-of-the-art by 61.71%.\nThe code is available at https://github.com/Senyh/BvA.\n","authors":["Zhiqiang Shen","Peng Cao","Junming Su","Jinzhu Yang","Osmar R. Zaiane"],"pdf_url":"https://arxiv.org/pdf/2405.09777v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06170v2","updated":"2024-09-04T05:53:17Z","published":"2024-08-12T14:16:10Z","title":"Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment\n Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging","summary":" Purpose:\n To evaluate the zero-shot performance of Segment Anything Model 2 (SAM 2) in\n3D segmentation of abdominal organs in CT scans, and to investigate the effects\nof prompt settings on segmentation results.\n Materials and Methods:\n Using a subset of the TotalSegmentator CT dataset (n = 123) from eight\ninstitutions, we assessed SAM 2's ability to segment eight abdominal organs.\nSegmentation was initiated from three different z-coordinate levels (caudal,\nmid, and cranial levels) of each organ. Performance was measured using the Dice\nsimilarity coefficient (DSC). 
We also analyzed the impact of \"negative\nprompts,\" which explicitly exclude certain regions from the segmentation\nprocess, on accuracy. Additionally, we analyzed organ volumes to contextualize\nthe segmentation performance.\n Results:\n As a zero-shot approach, larger organs with clear boundaries demonstrated\nhigh segmentation performance, with mean(median) DSCs as follows: liver\n0.821(0.898), left kidney 0.870(0.921), right kidney 0.862(0.935), and spleen\n0.891(0.932). Smaller organs showed lower performance: gallbladder\n0.531(0.590), pancreas 0.361(0.359), and adrenal glands, right 0.203(0.109),\nleft 0.308(0.231). The initial slice for segmentation and the use of negative\nprompts significantly influenced the results. By removing negative prompts from\nthe input, the DSCs significantly decreased for six organs. Moderate positive\ncorrelations were observed between volume sizes and DSCs.\n Conclusion:\n SAM 2 demonstrated promising zero-shot performance in segmenting certain\nabdominal organs in CT scans, particularly larger organs with clear boundaries.\nPerformance was significantly influenced by input negative prompts and initial\nslice selection, highlighting the importance of optimizing these factors for\neffective segmentation.\n","authors":["Yosuke Yamagishi","Shouhei Hanaoka","Tomohiro Kikuchi","Takahiro Nakao","Yuta Nakamura","Yukihiro Nomura","Soichiro Miki","Takeharu Yoshikawa","Osamu Abe"],"pdf_url":"https://arxiv.org/pdf/2408.06170v2.pdf","comment":"20 pages, 7 figures (including 2 supplemental figure), 4 tables"},{"id":"http://arxiv.org/abs/2405.08621v3","updated":"2024-09-04T05:28:51Z","published":"2024-05-14T14:01:15Z","title":"RMT-BVQA: Recurrent Memory Transformer-based Blind Video Quality\n Assessment for Enhanced Video Content","summary":" With recent advances in deep learning, numerous algorithms have been\ndeveloped to enhance video quality, reduce visual artifacts, and improve\nperceptual quality. However, little research has been reported on the quality\nassessment of enhanced content - the evaluation of enhancement methods is often\nbased on quality metrics that were designed for compression applications. In\nthis paper, we propose a novel blind deep video quality assessment (VQA) method\nspecifically for enhanced video content. It employs a new Recurrent Memory\nTransformer (RMT) based network architecture to obtain video quality\nrepresentations, which is optimized through a novel content-quality-aware\ncontrastive learning strategy based on a new database containing 13K training\npatches with enhanced content. The extracted quality representations are then\ncombined through linear regression to generate video-level quality indices. 
The\nproposed method, RMT-BVQA, has been evaluated on the VDPVE (VQA Dataset for\nPerceptual Video Enhancement) database through a five-fold cross validation.\nThe results show its superior correlation performance when compared to ten\nexisting no-reference quality metrics.\n","authors":["Tianhao Peng","Chen Feng","Duolikun Danier","Fan Zhang","Benoit Vallade","Alex Mackin","David Bull"],"pdf_url":"https://arxiv.org/pdf/2405.08621v3.pdf","comment":"This paper has been accepted by the ECCV 2024 AIM Advances in Image\n Manipulation workshop"},{"id":"http://arxiv.org/abs/2405.04274v2","updated":"2024-09-04T05:24:25Z","published":"2024-05-07T12:42:23Z","title":"Group-aware Parameter-efficient Updating for Content-Adaptive Neural\n Video Compression","summary":" Content-adaptive compression is crucial for enhancing the adaptability of the\npre-trained neural codec for various contents. Although these methods have been\nvery practical in neural image compression (NIC), their application in neural\nvideo compression (NVC) is still limited due to two main aspects: 1), video\ncompression relies heavily on temporal redundancy, therefore updating just one\nor a few frames can lead to significant errors accumulating over time; 2), NVC\nframeworks are generally more complex, with many large components that are not\neasy to update quickly during encoding. To address the previously mentioned\nchallenges, we have developed a content-adaptive NVC technique called\nGroup-aware Parameter-Efficient Updating (GPU). Initially, to minimize error\naccumulation, we adopt a group-aware approach for updating encoder parameters.\nThis involves adopting a patch-based Group of Pictures (GoP) training strategy\nto segment a video into patch-based GoPs, which will be updated to facilitate a\nglobally optimized domain-transferable solution. Subsequently, we introduce a\nparameter-efficient delta-tuning strategy, which is achieved by integrating\nseveral light-weight adapters into each coding component of the encoding\nprocess by both serial and parallel configuration. Such architecture-agnostic\nmodules stimulate the components with large parameters, thereby reducing both\nthe update cost and the encoding time. We incorporate our GPU into the latest\nNVC framework and conduct comprehensive experiments, whose results showcase\noutstanding video compression efficiency across four video benchmarks and\nadaptability of one medical image benchmark.\n","authors":["Zhenghao Chen","Luping Zhou","Zhihao Hu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2405.04274v2.pdf","comment":"Accepted by ACM MM 2024, Melbourne, Australia"},{"id":"http://arxiv.org/abs/2409.02453v1","updated":"2024-09-04T05:19:57Z","published":"2024-09-04T05:19:57Z","title":"FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video\n Reconstruction in Resource and Timing Constrained Network Settings","summary":" Despite the growing adoption of video processing via Internet of Things (IoT)\ndevices due to their cost-effectiveness, transmitting captured data to nearby\nservers poses challenges due to varying timing constraints and scarcity of\nnetwork bandwidth. Existing video compression methods face difficulties in\nrecovering compressed data when incomplete data is provided. 
Here, we introduce\n\emph{FrameCorr}, a deep-learning-based solution that utilizes previously\nreceived data to predict the missing segments of a frame, enabling the\nreconstruction of a frame from partially received data.\n","authors":["John Li","Shehab Sarar Ahmed","Deepak Nair"],"pdf_url":"https://arxiv.org/pdf/2409.02453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02448v1","updated":"2024-09-04T05:06:34Z","published":"2024-09-04T05:06:34Z","title":"Detecting Korean Food Using Image using Hierarchical Model","summary":" A solution was made available for Korean food lovers who have dietary\nrestrictions to identify Korean food before consuming it. Just by uploading a\nclear photo of the dish, people can get to know what they are eating. Image\nprocessing techniques, together with machine learning, helped to come up with\nthis solution.\n","authors":["Hoang Khanh Lam","Kahandakanaththage Maduni Pramuditha Perera"],"pdf_url":"https://arxiv.org/pdf/2409.02448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02438v1","updated":"2024-09-04T04:29:49Z","published":"2024-09-04T04:29:49Z","title":"Non-target Divergence Hypothesis: Toward Understanding Domain Gaps in\n Cross-Modal Knowledge Distillation","summary":" Compared to single-modal knowledge distillation, cross-modal knowledge\ndistillation faces more severe challenges due to domain gaps between\nmodalities. Although various methods have been proposed to\novercome these challenges, there is still limited research on how domain gaps\naffect cross-modal knowledge distillation. This paper provides an in-depth\nanalysis and evaluation of this issue. We first introduce the Non-Target\nDivergence Hypothesis (NTDH) to reveal the impact of domain gaps on cross-modal\nknowledge distillation. Our key finding is that domain gaps between modalities\nlead to distribution differences in non-target classes, and the smaller these\ndifferences, the better the performance of cross-modal knowledge distillation.\nSubsequently, based on Vapnik-Chervonenkis (VC) theory, we derive the upper and\nlower bounds of the approximation error for cross-modal knowledge distillation,\nthereby theoretically validating the NTDH. Finally, experiments on five\ncross-modal datasets further confirm the validity, generalisability, and\napplicability of the NTDH.\n","authors":["Yilong Chen","Zongyi Xu","Xiaoshui Huang","Shanshan Zhao","Xinqi Jiang","Xinyu Gao","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2409.02438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00359v2","updated":"2024-09-04T04:21:06Z","published":"2023-09-30T12:30:25Z","title":"CrossDF: Improving Cross-Domain Deepfake Detection with Deep Information\n Decomposition","summary":" Deepfake technology poses a significant threat to security and social trust.\nAlthough existing detection methods have shown high performance in identifying\nforgeries within datasets that use the same deepfake techniques for both\ntraining and testing, they suffer from sharp performance degradation when faced\nwith cross-dataset scenarios where unseen deepfake techniques are tested. To\naddress this challenge, we propose a Deep Information Decomposition (DID)\nframework to enhance the performance of Cross-dataset Deepfake Detection\n(CrossDF). 
Unlike most existing deepfake detection methods, our framework\nprioritizes high-level semantic features over specific visual artifacts.\nSpecifically, it adaptively decomposes facial features into deepfake-related\nand irrelevant information, only using the intrinsic deepfake-related\ninformation for real/fake discrimination. Moreover, it optimizes these two\nkinds of information to be independent with a de-correlation learning module,\nthereby enhancing the model's robustness against various irrelevant information\nchanges and generalization ability to unseen forgery methods. Our extensive\nexperimental evaluation and comparison with existing state-of-the-art detection\nmethods validate the effectiveness and superiority of the DID framework on\ncross-dataset deepfake detection.\n","authors":["Shanmin Yang","Hui Guo","Shu Hu","Bin Zhu","Ying Fu","Siwei Lyu","Xi Wu","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.00359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02429v1","updated":"2024-09-04T04:16:58Z","published":"2024-09-04T04:16:58Z","title":"Training-free Color-Style Disentanglement for Constrained Text-to-Image\n Synthesis","summary":" We consider the problem of independently, in a disentangled fashion,\ncontrolling the outputs of text-to-image diffusion models with color and style\nattributes of a user-supplied reference image. We present the first\ntraining-free, test-time-only method to disentangle and condition text-to-image\nmodels on color and style attributes from reference image. To realize this, we\npropose two key innovations. Our first contribution is to transform the latent\ncodes at inference time using feature transformations that make the covariance\nmatrix of current generation follow that of the reference image, helping\nmeaningfully transfer color. Next, we observe that there exists a natural\ndisentanglement between color and style in the LAB image space, which we\nexploit to transform the self-attention feature maps of the image being\ngenerated with respect to those of the reference computed from its L channel.\nBoth these operations happen purely at test time and can be done independently\nor merged. This results in a flexible method where color and style information\ncan come from the same reference image or two different sources, and a new\ngeneration can seamlessly fuse them in either scenario.\n","authors":["Aishwarya Agarwal","Srikrishna Karanam","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2409.02429v1.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2409.02426v1","updated":"2024-09-04T04:14:02Z","published":"2024-09-04T04:14:02Z","title":"Diffusion Models Learn Low-Dimensional Distributions via Subspace\n Clustering","summary":" Recent empirical studies have demonstrated that diffusion models can\neffectively learn the image distribution and generate new samples. Remarkably,\nthese models can achieve this even with a small number of training samples\ndespite a large image dimension, circumventing the curse of dimensionality. In\nthis work, we provide theoretical insights into this phenomenon by leveraging\nkey empirical observations: (i) the low intrinsic dimensionality of image data,\n(ii) a union of manifold structure of image data, and (iii) the low-rank\nproperty of the denoising autoencoder in trained diffusion models. 
These\nobservations motivate us to assume the underlying data distribution of image\ndata as a mixture of low-rank Gaussians and to parameterize the denoising\nautoencoder as a low-rank model according to the score function of the assumed\ndistribution. With these setups, we rigorously show that optimizing the\ntraining loss of diffusion models is equivalent to solving the canonical\nsubspace clustering problem over the training samples. Based on this\nequivalence, we further show that the minimal number of samples required to\nlearn the underlying distribution scales linearly with the intrinsic dimensions\nunder the above data and model assumptions. This insight sheds light on why\ndiffusion models can break the curse of dimensionality and exhibit the phase\ntransition in learning distributions. Moreover, we empirically establish a\ncorrespondence between the subspaces and the semantic representations of image\ndata, facilitating image editing. We validate these results with corroborated\nexperimental results on both simulated distributions and image datasets.\n","authors":["Peng Wang","Huijie Zhang","Zekai Zhang","Siyi Chen","Yi Ma","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02426v1.pdf","comment":"39 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.02418v1","updated":"2024-09-04T03:46:17Z","published":"2024-09-04T03:46:17Z","title":"MOSMOS: Multi-organ segmentation facilitated by medical report\n supervision","summary":" Owing to a large amount of multi-modal data in modern medical systems, such\nas medical images and reports, Medical Vision-Language Pre-training (Med-VLP)\nhas demonstrated incredible achievements in coarse-grained downstream tasks\n(i.e., medical classification, retrieval, and visual question answering).\nHowever, the problem of transferring knowledge learned from Med-VLP to\nfine-grained multi-organ segmentation tasks has barely been investigated.\nMulti-organ segmentation is challenging mainly due to the lack of large-scale\nfully annotated datasets and the wide variation in the shape and size of the\nsame organ between individuals with different diseases. In this paper, we\npropose a novel pre-training & fine-tuning framework for Multi-Organ\nSegmentation by harnessing Medical repOrt Supervision (MOSMOS). Specifically,\nwe first introduce global contrastive learning to maximally align the medical\nimage-report pairs in the pre-training stage. To remedy the granularity\ndiscrepancy, we further leverage multi-label recognition to implicitly learn\nthe semantic correspondence between image pixels and organ tags. More\nimportantly, our pre-trained models can be transferred to any segmentation\nmodel by introducing the pixel-tag attention maps. Different network settings,\ni.e., 2D U-Net and 3D UNETR, are utilized to validate the generalization. We\nhave extensively evaluated our approach using different diseases and modalities\non BTCV, AMOS, MMWHS, and BRATS datasets. Experimental results in various\nsettings demonstrate the effectiveness of our framework. 
This framework can\nserve as the foundation to facilitate future research on automatic annotation\ntasks under the supervision of medical reports.\n","authors":["Weiwei Tian","Xinyu Huang","Junlin Hou","Caiyue Ren","Longquan Jiang","Rui-Wei Zhao","Gang Jin","Yuejie Zhang","Daoying Geng"],"pdf_url":"https://arxiv.org/pdf/2409.02418v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.02415v1","updated":"2024-09-04T03:41:42Z","published":"2024-09-04T03:41:42Z","title":"Local map Construction Methods with SD map: A Novel Survey","summary":" In recent years, significant academic advancements have been made in the\nfield of autonomous vehicles, with Local maps emerging as a crucial component\nof autonomous driving technology. Local maps not only provide intricate details\nof road networks but also serve as fundamental inputs for critical tasks such\nas vehicle localization, navigation, and decision-making. Given the\ncharacteristics of SD map (Standard Definition Map), which include low cost,\nease of acquisition, and high versatility, perception methods that integrate SD\nmap as prior information have demonstrated significant potential in the field\nof Local map perception. The purpose of this paper is to provide researchers\nwith a comprehensive overview and summary of the latest advancements in the\nintegration of SD map as prior information for Local map perception methods.\nThis review begins by introducing the task definition and general pipeline of\nlocal map perception methods that incorporate SD maps as prior information,\nalong with relevant public datasets. And then it focuses on the representation\nand encoding methods of multi-source information, as well as the methods for\nfusing multi-source information. In response to this burgeoning trend, this\narticle presents a comprehensive and meticulous overview of the diverse\nresearch efforts in this particular field. Finally, the article addresses\npertinent issues and future challenges with the aim of guiding researchers in\nunderstanding the current trends and methodologies prevalent in the field.\n","authors":["Jiaqi Li","Pingfan Jia","Jiaxing Chen","Jiaxi Liu","Lei He"],"pdf_url":"https://arxiv.org/pdf/2409.02415v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.18820v4","updated":"2024-09-04T03:35:58Z","published":"2024-04-29T16:02:38Z","title":"Towards Extreme Image Compression with Latent Feature Guidance and\n Diffusion Prior","summary":" Image compression at extremely low bitrates (below 0.1 bits per pixel (bpp))\nis a significant challenge due to substantial information loss. In this work,\nwe propose a novel two-stage extreme image compression framework that exploits\nthe powerful generative capability of pre-trained diffusion models to achieve\nrealistic image reconstruction at extremely low bitrates. In the first stage,\nwe treat the latent representation of images in the diffusion space as\nguidance, employing a VAE-based compression approach to compress images and\ninitially decode the compressed information into content variables. The second\nstage leverages pre-trained stable diffusion to reconstruct images under the\nguidance of content variables. Specifically, we introduce a small control\nmodule to inject content information while keeping the stable diffusion model\nfixed to maintain its generative capability. 
Furthermore, we design a space\nalignment loss to force the content variables to align with the diffusion space\nand provide the necessary constraints for optimization. Extensive experiments\ndemonstrate that our method significantly outperforms state-of-the-art\napproaches in terms of visual performance at extremely low bitrates. The source\ncode and trained models are available at https://github.com/huai-chang/DiffEIC.\n","authors":["Zhiyuan Li","Yanhui Zhou","Hao Wei","Chenyang Ge","Jingwen Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.18820v4.pdf","comment":"Accepted by IEEE TCSVT"},{"id":"http://arxiv.org/abs/2408.07341v2","updated":"2024-09-04T03:22:05Z","published":"2024-08-14T07:34:12Z","title":"Robust Semi-supervised Multimodal Medical Image Segmentation via Cross\n Modality Collaboration","summary":" Multimodal learning leverages complementary information derived from\ndifferent modalities, thereby enhancing performance in medical image\nsegmentation. However, prevailing multimodal learning methods heavily rely on\nextensive well-annotated data from various modalities to achieve accurate\nsegmentation performance. This dependence often poses a challenge in clinical\nsettings due to limited availability of such data. Moreover, the inherent\nanatomical misalignment between different imaging modalities further\ncomplicates the endeavor to enhance segmentation performance. To address this\nproblem, we propose a novel semi-supervised multimodal segmentation framework\nthat is robust to scarce labeled data and misaligned modalities. Our framework\nemploys a novel cross modality collaboration strategy to distill\nmodality-independent knowledge, which is inherently associated with each\nmodality, and integrates this information into a unified fusion layer for\nfeature amalgamation. With a channel-wise semantic consistency loss, our\nframework ensures alignment of modality-independent information from a\nfeature-wise perspective across modalities, thereby fortifying it against\nmisalignments in multimodal scenarios. Furthermore, our framework effectively\nintegrates contrastive consistent learning to regulate anatomical structures,\nfacilitating anatomical-wise prediction alignment on unlabeled data in\nsemi-supervised segmentation tasks. Our method achieves competitive performance\ncompared to other multimodal methods across three tasks: cardiac, abdominal\nmulti-organ, and thyroid-associated orbitopathy segmentations. It also\ndemonstrates outstanding robustness in scenarios involving scarce labeled data\nand misaligned modalities.\n","authors":["Xiaogen Zhou","Yiyou Sun","Min Deng","Winnie Chiu Wing Chu","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2408.07341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20461v3","updated":"2024-09-04T03:17:22Z","published":"2024-07-29T23:40:13Z","title":"Weakly Supervised Intracranial Hemorrhage Segmentation with YOLO and an\n Uncertainty Rectified Segment Anything Model","summary":" Intracranial hemorrhage (ICH) is a life-threatening condition that requires\nrapid and accurate diagnosis to improve treatment outcomes and patient survival\nrates. Recent advancements in supervised deep learning have greatly improved\nthe analysis of medical images, but often rely on extensive datasets with\nhigh-quality annotations, which are costly, time-consuming, and require medical\nexpertise to prepare. 
To mitigate the need for large amounts of expert-prepared\nsegmentation data, we have developed a novel weakly supervised ICH segmentation\nmethod that utilizes the YOLO object detection model and an\nuncertainty-rectified Segment Anything Model (SAM). In addition, we have\nproposed a novel point prompt generator for this model to further improve\nsegmentation results with YOLO-predicted bounding box prompts. Our approach\nachieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along\nwith a mean Dice score of 0.629 for ICH segmentation, outperforming existing\nweakly supervised and popular supervised (UNet and Swin-UNETR) approaches.\nOverall, the proposed method provides a robust and accurate alternative to the\nmore commonly used supervised techniques for ICH quantification without\nrequiring refined segmentation ground truths during model training.\n","authors":["Pascal Spiegler","Amirhossein Rasoulian","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.20461v3.pdf","comment":"Manuscript was accepted at SWITCH2024. 10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.02406v1","updated":"2024-09-04T03:14:48Z","published":"2024-09-04T03:14:48Z","title":"Hadamard Row-Wise Generation Algorithm","summary":" In this paper, we introduce an efficient algorithm for generating specific\nHadamard rows, addressing the memory demands of pre-computing the entire\nmatrix. Leveraging Sylvester's recursive construction, our method generates the\nrequired $i$-th row on demand, significantly reducing computational resources.\nThe algorithm uses the Kronecker product to construct the desired row from the\nbinary representation of the index, without creating the full matrix. This\napproach is particularly useful for single-pixel imaging systems that need only\none row at a time.\n","authors":["Brayan Monroy","Jorge Bacca"],"pdf_url":"https://arxiv.org/pdf/2409.02406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16340v2","updated":"2024-09-04T03:11:10Z","published":"2024-08-29T08:23:57Z","title":"Learned Image Transmission with Hierarchical Variational Autoencoder","summary":" In this paper, we introduce an innovative hierarchical joint source-channel\ncoding (HJSCC) framework for image transmission, utilizing a hierarchical\nvariational autoencoder (VAE). Our approach leverages a combination of\nbottom-up and top-down paths at the transmitter to autoregressively generate\nmultiple hierarchical representations of the original image. These\nrepresentations are then directly mapped to channel symbols for transmission by\nthe JSCC encoder. We extend this framework to scenarios with a feedback link,\nmodeling transmission over a noisy channel as a probabilistic sampling process\nand deriving a novel generative formulation for JSCC with feedback. Compared\nwith existing approaches, our proposed HJSCC provides enhanced adaptability by\ndynamically adjusting transmission bandwidth, encoding these representations\ninto varying amounts of channel symbols. Additionally, we introduce a rate\nattention module to guide the JSCC encoder in optimizing its encoding strategy\nbased on prior information. 
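To make the row-wise Hadamard construction in the entry above (arXiv:2409.02406) concrete, the following minimal NumPy sketch assumes the standard Sylvester ordering; it is an illustration under that assumption, not the authors' released implementation. Each bit of the row index selects a row of the 2x2 base matrix, and the Kronecker product of those rows yields the requested row without building the full matrix.

```python
import numpy as np

def hadamard_row(i: int, n: int) -> np.ndarray:
    """Return row i of the 2**n x 2**n Sylvester-Hadamard matrix without
    materializing the full matrix: the bits of i select rows of the 2x2
    base matrix, combined via Kronecker products (illustrative sketch)."""
    H2 = np.array([[1, 1], [1, -1]])
    row = np.array([1])
    for k in reversed(range(n)):      # most-significant bit first
        row = np.kron(row, H2[(i >> k) & 1])
    return row

# Row 2 of the 4x4 Sylvester-Hadamard matrix: [ 1  1 -1 -1]
print(hadamard_row(2, 2))
```

Memory stays proportional to a single row, which matches the single-pixel-imaging use case the abstract describes.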
Extensive experiments on images of varying\nresolutions demonstrate that our proposed model outperforms existing baselines\nin rate-distortion performance and maintains robustness against channel noise.\n","authors":["Guangyi Zhang","Hanlei Li","Yunlong Cai","Qiyu Hu","Guanding Yu","Runmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.16340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00025v2","updated":"2024-09-04T03:11:02Z","published":"2024-08-16T19:22:02Z","title":"A Novel Approach to Classify Power Quality Signals Using Vision\n Transformers","summary":" With the rapid integration of electronically interfaced renewable energy\nresources and loads into smart grids, there is increasing interest in power\nquality disturbances (PQD) classification to enhance the security and\nefficiency of these grids. This paper introduces a new approach to PQD\nclassification based on the Vision Transformer (ViT) model. When a PQD occurs,\nthe proposed approach first converts the power quality signal into an image and\nthen utilizes a pre-trained ViT to accurately determine the class of the PQD.\nUnlike most previous works, which were limited to a few disturbance classes or\nsmall datasets, the proposed method is trained and tested on a large dataset\nwith 17 disturbance classes. Our experimental results show that the proposed\nViT-based approach achieves PQD classification precision and recall of 98.28%\nand 97.98%, respectively, outperforming recently proposed techniques applied to\nthe same dataset.\n","authors":["Ahmad Mohammad Saber","Alaa Selim","Mohamed M. Hammad","Amr Youssef","Deepa Kundur","Ehab El-Saadany"],"pdf_url":"https://arxiv.org/pdf/2409.00025v2.pdf","comment":"IECON 2024-50th Annual Conference of the IEEE Industrial Electronics\n Society, Chicago, U.S.A, 2024, pp. 1-6"},{"id":"http://arxiv.org/abs/2408.15461v2","updated":"2024-09-04T02:45:56Z","published":"2024-08-28T00:54:51Z","title":"Hand1000: Generating Realistic Hands from Text with Only 1,000 Images","summary":" Text-to-image generation models have achieved remarkable advancements in\nrecent years, aiming to produce realistic images from textual descriptions.\nHowever, these models often struggle with generating anatomically accurate\nrepresentations of human hands. The resulting images frequently exhibit issues\nsuch as incorrect numbers of fingers, unnatural twisting or interlacing of\nfingers, or blurred and indistinct hands. These issues stem from the inherent\ncomplexity of hand structures and the difficulty in aligning textual\ndescriptions with precise visual depictions of hands. To address these\nchallenges, we propose a novel approach named Hand1000 that enables the\ngeneration of realistic hand images with target gesture using only 1,000\ntraining samples. The training of Hand1000 is divided into three stages with\nthe first stage aiming to enhance the model's understanding of hand anatomy by\nusing a pre-trained hand gesture recognition model to extract gesture\nrepresentation. The second stage further optimizes text embedding by\nincorporating the extracted hand gesture representation, to improve alignment\nbetween the textual descriptions and the generated hand images. The third stage\nutilizes the optimized embedding to fine-tune the Stable Diffusion model to\ngenerate realistic hand images. 
In addition, we construct the first publicly\navailable dataset specifically designed for text-to-hand image generation.\nBased on the existing hand gesture recognition dataset, we adopt advanced image\ncaptioning models and LLaMA3 to generate high-quality textual descriptions\nenriched with detailed gesture information. Extensive experiments demonstrate\nthat Hand1000 significantly outperforms existing models in producing\nanatomically correct hand images while faithfully representing other details in\nthe text, such as faces, clothing, and colors.\n","authors":["Haozhuo Zhang","Bin Zhu","Yu Cao","Yanbin Hao"],"pdf_url":"https://arxiv.org/pdf/2408.15461v2.pdf","comment":"Project page https://haozhuo-zhang.github.io/Hand1000-project-page/"},{"id":"http://arxiv.org/abs/2404.17364v3","updated":"2024-09-04T02:45:32Z","published":"2024-04-26T12:27:57Z","title":"MV-VTON: Multi-View Virtual Try-On with Diffusion Models","summary":" The goal of image-based virtual try-on is to generate an image of the target\nperson naturally wearing the given clothing. However, existing methods solely\nfocus on the frontal try-on using the frontal clothing. When the views of the\nclothing and person are significantly inconsistent, particularly when the\nperson's view is non-frontal, the results are unsatisfactory. To address this\nchallenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to\nreconstruct the dressing results from multiple views using the given clothes.\nGiven that single-view clothes provide insufficient information for MV-VTON, we\ninstead employ two images, i.e., the frontal and back views of the clothing, to\nencompass the complete view as much as possible. Moreover, we adopt diffusion\nmodels that have demonstrated superior abilities to perform our MV-VTON. In\nparticular, we propose a view-adaptive selection method where hard-selection\nand soft-selection are applied to the global and local clothing feature\nextraction, respectively. This ensures that the clothing features are roughly\nfit to the person's view. Subsequently, we suggest joint attention blocks to\nalign and fuse clothing features with person features. Additionally, we collect\na MV-VTON dataset MVG, in which each person has multiple photos with diverse\nviews and poses. Experiments show that the proposed method not only achieves\nstate-of-the-art results on MV-VTON task using our MVG dataset, but also has\nsuperiority on frontal-view virtual try-on task using VITON-HD and DressCode\ndatasets. Codes and datasets are publicly released at\nhttps://github.com/hywang2002/MV-VTON .\n","authors":["Haoyu Wang","Zhilu Zhang","Donglin Di","Shiliang Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.17364v3.pdf","comment":"Project url: https://hywang2002.github.io/MV-VTON/"},{"id":"http://arxiv.org/abs/2409.01128v2","updated":"2024-09-04T02:40:52Z","published":"2024-09-02T10:07:24Z","title":"Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in\n Federated Class Continual Learning","summary":" Federated Class Continual Learning (FCCL) merges the challenges of\ndistributed client learning with the need for seamless adaptation to new\nclasses without forgetting old ones. The key challenge in FCCL is catastrophic\nforgetting, an issue that has been explored to some extent in Continual\nLearning (CL). However, due to privacy preservation requirements, some\nconventional methods, such as experience replay, are not directly applicable to\nFCCL. 
Existing FCCL methods mitigate forgetting by generating historical data\nthrough federated training of GANs or data-free knowledge distillation.\nHowever, these approaches often suffer from unstable training of generators or\nlow-quality generated data, limiting their guidance for the model. To address\nthis challenge, we propose a novel method of data replay based on diffusion\nmodels. Instead of training a diffusion model, we employ a pre-trained\nconditional diffusion model to reverse-engineer each class, searching the\ncorresponding input conditions for each class within the model's input space,\nsignificantly reducing computational resources and time consumption while\nensuring effective generation. Furthermore, we enhance the classifier's domain\ngeneralization ability on generated and real data through contrastive learning,\nindirectly improving the representational capability of generated data for real\ndata. Comprehensive experiments demonstrate that our method significantly\noutperforms existing baselines. Code is available at\nhttps://github.com/jinglin-liang/DDDR.\n","authors":["Jinglin Liang","Jin Zhong","Hanlin Gu","Zhongqi Lu","Xingxing Tang","Gang Dai","Shuangping Huang","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01128v2.pdf","comment":"Accepted by ECCV 2024 Oral"},{"id":"http://arxiv.org/abs/2409.02390v1","updated":"2024-09-04T02:38:52Z","published":"2024-09-04T02:38:52Z","title":"Neural Dynamics Model of Visual Decision-Making: Learning from Human\n Experts","summary":" Uncovering the fundamental neural correlates of biological intelligence,\ndeveloping mathematical models, and conducting computational simulations are\ncritical for advancing new paradigms in artificial intelligence (AI). In this\nstudy, we implemented a comprehensive visual decision-making model that spans\nfrom visual input to behavioral output, using a neural dynamics modeling\napproach. Drawing inspiration from the key components of the dorsal visual\npathway in primates, our model not only aligns closely with human behavior but\nalso reflects neural activities in primates, and achieving accuracy comparable\nto convolutional neural networks (CNNs). Moreover, magnetic resonance imaging\n(MRI) identified key neuroimaging features such as structural connections and\nfunctional connectivity that are associated with performance in perceptual\ndecision-making tasks. A neuroimaging-informed fine-tuning approach was\nintroduced and applied to the model, leading to performance improvements that\nparalleled the behavioral variations observed among subjects. Compared to\nclassical deep learning models, our model more accurately replicates the\nbehavioral performance of biological intelligence, relying on the structural\ncharacteristics of biological neural networks rather than extensive training\ndata, and demonstrating enhanced resilience to perturbation.\n","authors":["Jie Su","Fang Cai","Shu-Kuo Zhao","Xin-Yi Wang","Tian-Yi Qian","Da-Hui Wang","Bo Hong"],"pdf_url":"https://arxiv.org/pdf/2409.02390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02389v1","updated":"2024-09-04T02:37:38Z","published":"2024-09-04T02:37:38Z","title":"Multi-modal Situated Reasoning in 3D Scenes","summary":" Situation awareness is essential for understanding and reasoning about 3D\nscenes in embodied AI agents. However, existing datasets and benchmarks for\nsituated understanding are limited in data modality, diversity, scale, and task\nscope. 
To address these limitations, we propose Multi-modal Situated Question\nAnswering (MSQA), a large-scale multi-modal situated reasoning dataset,\nscalably collected leveraging 3D scene graphs and vision-language models (VLMs)\nacross a diverse range of real-world 3D scenes. MSQA includes 251K situated\nquestion-answering pairs across 9 distinct question categories, covering\ncomplex scenarios within 3D scenes. We introduce a novel interleaved\nmulti-modal input setting in our benchmark to provide text, image, and point\ncloud for situation and question description, resolving ambiguity in previous\nsingle-modality convention (e.g., text). Additionally, we devise the\nMulti-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models'\nsituated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN\nhighlight the limitations of existing vision-language models and underscore the\nimportance of handling multi-modal interleaved inputs and situation modeling.\nExperiments on data scaling and cross-domain transfer further demonstrate the\nefficacy of leveraging MSQA as a pre-training dataset for developing more\npowerful situated reasoning models.\n","authors":["Xiongkun Linghu","Jiangyong Huang","Xuesong Niu","Xiaojian Ma","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2409.02389v1.pdf","comment":"Project page: https://msr3d.github.io/"},{"id":"http://arxiv.org/abs/2409.02385v1","updated":"2024-09-04T02:25:10Z","published":"2024-09-04T02:25:10Z","title":"Unified Framework with Consistency across Modalities for Human Activity\n Recognition","summary":" Recognizing human activities in videos is challenging due to the\nspatio-temporal complexity and context-dependence of human interactions. Prior\nstudies often rely on single input modalities, such as RGB or skeletal data,\nlimiting their ability to exploit the complementary advantages across\nmodalities. Recent studies focus on combining these two modalities using simple\nfeature fusion techniques. However, due to the inherent disparities in\nrepresentation between these input modalities, designing a unified neural\nnetwork architecture to effectively leverage their complementary information\nremains a significant challenge. To address this, we propose a comprehensive\nmultimodal framework for robust video-based human activity recognition. Our key\ncontribution is the introduction of a novel compositional query machine, called\nCOMPUTER ($\\textbf{COMP}ositional h\\textbf{U}man-cen\\textbf{T}ric\nqu\\textbf{ER}y$ machine), a generic neural architecture that models the\ninteractions between a human of interest and its surroundings in both space and\ntime. Thanks to its versatile design, COMPUTER can be leveraged to distill\ndistinctive representations for various input modalities. Additionally, we\nintroduce a consistency loss that enforces agreement in prediction between\nmodalities, exploiting the complementary information from multimodal inputs for\nrobust human movement recognition. Through extensive experiments on action\nlocalization and group activity recognition tasks, our approach demonstrates\nsuperior performance when compared with state-of-the-art methods. 
Our code is\navailable at: https://github.com/tranxuantuyen/COMPUTER.\n","authors":["Tuyen Tran","Thao Minh Le","Hung Tran","Truyen Tran"],"pdf_url":"https://arxiv.org/pdf/2409.02385v1.pdf","comment":"Accepted to BMVC 2024"},{"id":"http://arxiv.org/abs/2407.05576v2","updated":"2024-09-04T02:20:29Z","published":"2024-07-08T03:17:10Z","title":"ORMNet: Object-centric Relationship Modeling for Egocentric Hand-object\n Segmentation","summary":" Egocentric hand-object segmentation (EgoHOS) is a promising new task aiming\nat segmenting hands and interacting objects in egocentric images. Although\nEgoHOS has the potential to enable various applications, current methods\nstruggle to achieve both high performance and end-to-end optimization\nsimultaneously. Moreover, existing approaches fail to fully leverage hand cues\nto assist the interacting-object segmentation and overlook the coupled\nrelationships between diverse interacting-object categories, resulting in\nperformance deficiencies. To address these limitations, this paper proposes a\nnovel Object-centric Relationship Modeling Network (ORMNet) to fulfill\nend-to-end and effective EgoHOS by modeling relationships between hands and\nobjects as well as objects and objects. Specifically, a Hand-Object Relation\n(HOR) module is introduced to capture the correlation between hands and\nobjects, which uses hand features to guide the network to extract more\ndistinguishing interacting-object features. Besides, we find the coupling\nrelations between diverse interacting-object categories and design the Object\nRelation Decoupling (ORD) strategy to disentangle them, emphasizing learning of\nthe interaction between hands and objects and reducing the confusion of\ninteracting-object classification. In-domain experiments show that ORMNet achieves\nnotably superior segmentation performance compared with state-of-the-art\nmethods, while out-of-domain experiments further exhibit its robust\ngeneralization capability. The project is available at\nhttps://github.com/yuggiehk/ORMNet/\n","authors":["Yuejiao Su","Yi Wang","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2407.05576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02382v1","updated":"2024-09-04T02:18:35Z","published":"2024-09-04T02:18:35Z","title":"GGS: Generalizable Gaussian Splatting for Lane Switching in Autonomous\n Driving","summary":" We propose GGS, a Generalizable Gaussian Splatting method for Autonomous\nDriving, which can achieve realistic rendering under large viewpoint changes.\nPrevious generalizable 3D Gaussian splatting methods are limited to rendering\nnovel views that are very close to the original pair of images, which cannot\nhandle large differences in viewpoint. Especially in autonomous driving\nscenarios, images are typically collected from a single lane. The limited\ntraining perspective makes rendering images of a different lane very\nchallenging. To further improve the rendering capability of GGS under large\nviewpoint changes, we introduce a novel virtual lane generation module into\nthe GGS method to enable high-quality lane switching even without a multi-lane\ndataset. Besides, we design a diffusion loss to supervise the generation of\nvirtual lane images to further address the lack of data for the\nvirtual lanes. Finally, we also propose a depth refinement module to optimize\ndepth estimation in the GGS model. 
Extensive validation of our method, compared\nto existing approaches, demonstrates state-of-the-art performance.\n","authors":["Huasong Han","Kaixuan Zhou","Xiaoxiao Long","Yusen Wang","Chunxia Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.02382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07076v2","updated":"2024-09-04T01:59:59Z","published":"2024-07-09T17:49:23Z","title":"MADE-for-ASD: A Multi-Atlas Deep Ensemble Network for Diagnosing Autism\n Spectrum Disorder","summary":" In response to the global need for efficient early diagnosis of Autism\nSpectrum Disorder (ASD), this paper bridges the gap between traditional,\ntime-consuming diagnostic methods and potential automated solutions. We propose\na multi-atlas deep ensemble network, MADE-for-ASD, that integrates multiple\natlases of the brain's functional magnetic resonance imaging (fMRI) data\nthrough a weighted deep ensemble network. Our approach integrates demographic\ninformation into the prediction workflow, which enhances ASD diagnosis\nperformance and offers a more holistic perspective on patient profiling. We\nexperiment with the well-known publicly available ABIDE (Autism Brain Imaging\nData Exchange) I dataset, consisting of resting state fMRI data from 17\ndifferent laboratories around the globe. Our proposed system achieves 75.20%\naccuracy on the entire dataset and 96.40% on a specific subset $-$ both\nsurpassing reported ASD diagnosis accuracy in ABIDE I fMRI studies.\nSpecifically, our model improves by 4.4 percentage points over prior works on\nthe same amount of data. The model exhibits a sensitivity of 82.90% and a\nspecificity of 69.70% on the entire dataset, and 91.00% and 99.50%,\nrespectively, on the specific subset. We leverage the F-score to pinpoint the\ntop 10 ROIs in ASD diagnosis, such as the precuneus and anterior\ncingulate/ventromedial. The proposed system can potentially pave the way for\nmore cost-effective, efficient and scalable strategies in ASD diagnosis. Codes\nand evaluations are publicly available at\nhttps://github.com/hasan-rakibul/MADE-for-ASD.\n","authors":["Xuehan Liu","Md Rakibul Hasan","Tom Gedeon","Md Zakir Hossain"],"pdf_url":"https://arxiv.org/pdf/2407.07076v2.pdf","comment":"Xuehan Liu and Md Rakibul Hasan contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02376v1","updated":"2024-09-04T01:54:20Z","published":"2024-09-04T01:54:20Z","title":"Coral Model Generation from Single Images for Virtual Reality\n Applications","summary":" With the rapid development of VR technology, the demand for high-quality 3D\nmodels is increasing. Traditional methods struggle with efficiency and quality\nin large-scale customization. This paper introduces a deep-learning framework\nthat generates high-precision 3D coral models from a single image. Using the\nCoral dataset, the framework extracts geometric and texture features, performs\n3D reconstruction, and optimizes design and material blending. Advanced\noptimization and polygon count control ensure shape accuracy, detail retention,\nand flexible output for various complexities, catering to high-quality\nrendering and real-time interaction needs. The project incorporates Explainable\nAI (XAI) to transform AI-generated models into interactive \"artworks,\" best\nviewed in VR and XR. This enhances model interpretability and human-machine\ncollaboration. Real-time feedback in VR interactions displays information like\ncoral species and habitat, enriching the user experience. 
The generated models\nsurpass traditional methods in detail, visual quality, and efficiency. This\nresearch offers an intelligent approach to 3D content creation for VR, lowering\nproduction barriers, and promoting widespread VR applications. Additionally,\nintegrating XAI provides new insights into AI-generated visual content and\nadvances research in 3D vision interpretability.\n","authors":["Jie Fu","Shun Fu","Mick Grierson"],"pdf_url":"https://arxiv.org/pdf/2409.02376v1.pdf","comment":"In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts\n 2024) arXiv:2406.14485"},{"id":"http://arxiv.org/abs/2409.02374v1","updated":"2024-09-04T01:47:01Z","published":"2024-09-04T01:47:01Z","title":"Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable\n Image Editing","summary":" Recently, diffusion models have emerged as a powerful class of generative\nmodels. Despite their success, there is still limited understanding of their\nsemantic spaces. This makes it challenging to achieve precise and disentangled\nimage generation without additional training, especially in an unsupervised\nway. In this work, we improve the understanding of their semantic spaces from\nintriguing observations: among a certain range of noise levels, (1) the learned\nposterior mean predictor (PMP) in the diffusion model is locally linear, and\n(2) the singular vectors of its Jacobian lie in low-dimensional semantic\nsubspaces. We provide a solid theoretical basis to justify the linearity and\nlow-rankness in the PMP. These insights allow us to propose an unsupervised,\nsingle-step, training-free LOw-rank COntrollable image editing (LOCO Edit)\nmethod for precise local editing in diffusion models. LOCO Edit identified\nediting directions with nice properties: homogeneity, transferability,\ncomposability, and linearity. These properties of LOCO Edit benefit greatly\nfrom the low-dimensional semantic subspace. Our method can further be extended\nto unsupervised or text-supervised editing in various text-to-image diffusion\nmodels (T-LOCO Edit). Finally, extensive empirical experiments demonstrate the\neffectiveness and efficiency of LOCO Edit. The codes will be released at\nhttps://github.com/ChicyChen/LOCO-Edit.\n","authors":["Siyi Chen","Huijie Zhang","Minzhe Guo","Yifu Lu","Peng Wang","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02371v1","updated":"2024-09-04T01:41:09Z","published":"2024-09-04T01:41:09Z","title":"Unfolding Videos Dynamics via Taylor Expansion","summary":" Taking inspiration from physical motion, we present a new self-supervised\ndynamics learning strategy for videos: Video Time-Differentiation for Instance\nDiscrimination (ViDiDi). ViDiDi is a simple and data-efficient strategy,\nreadily applicable to existing self-supervised video representation learning\nframeworks based on instance discrimination. At its core, ViDiDi observes\ndifferent aspects of a video through various orders of temporal derivatives of\nits frame sequence. These derivatives, along with the original frames, support\nthe Taylor series expansion of the underlying continuous dynamics at discrete\ntimes, where higher-order derivatives emphasize higher-order motion features.\nViDiDi learns a single neural network that encodes a video and its temporal\nderivatives into consistent embeddings following a balanced alternating\nlearning algorithm. 
By learning consistent representations for original frames\nand derivatives, the encoder is steered to emphasize motion features over\nstatic backgrounds and uncover the hidden dynamics in original frames. Hence,\nvideo representations are better separated by dynamic features. We integrate\nViDiDi into existing instance discrimination frameworks (VICReg, BYOL, and\nSimCLR) for pretraining on UCF101 or Kinetics and test on standard benchmarks\nincluding video retrieval, action recognition, and action detection. The\nperformance is enhanced by a significant margin without the need for large\nmodels or extensive datasets.\n","authors":["Siyi Chen","Minkyu Choi","Zesen Zhao","Kuan Han","Qing Qu","Zhongming Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02368v1","updated":"2024-09-04T01:38:37Z","published":"2024-09-04T01:38:37Z","title":"Pluralistic Salient Object Detection","summary":" We introduce pluralistic salient object detection (PSOD), a novel task aimed\nat generating multiple plausible salient segmentation results for a given input\nimage. Unlike conventional SOD methods that produce a single segmentation mask\nfor salient objects, this new setting recognizes the inherent complexity of\nreal-world images, comprising multiple objects, and the ambiguity in defining\nsalient objects due to different user intentions. To study this task, we\npresent two new SOD datasets \"DUTS-MM\" and \"DUTS-MQ\", along with newly designed\nevaluation metrics. DUTS-MM builds upon the DUTS dataset but enriches the\nground-truth mask annotations in three aspects: 1) it improves the mask\nquality, especially for boundary and fine-grained structures; 2) it alleviates the\nannotation inconsistency issue; and 3) it provides multiple ground-truth masks for\nimages with saliency ambiguity. DUTS-MQ consists of approximately 100K\nimage-mask pairs with human-annotated preference scores, enabling the learning\nof real human preferences in measuring mask quality. Building upon these two\ndatasets, we propose a simple yet effective pluralistic SOD baseline based on a\nMixture-of-Experts (MOE) design. Equipped with two prediction heads, it\nsimultaneously predicts multiple masks using different query prompts and\npredicts human preference scores for each mask candidate. Extensive experiments\nand analyses underscore the significance of our proposed datasets and affirm\nthe effectiveness of our PSOD framework.\n","authors":["Xuelu Feng","Yunsheng Li","Dongdong Chen","Chunming Qiao","Junsong Yuan","Lu Yuan","Gang Hua"],"pdf_url":"https://arxiv.org/pdf/2409.02368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11593v3","updated":"2024-09-04T01:25:55Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. 
To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v3.pdf","comment":"Accepted by NCMMSC2024"},{"id":"http://arxiv.org/abs/2307.10593v3","updated":"2024-09-04T01:13:40Z","published":"2023-07-20T05:15:03Z","title":"Asynchronous Blob Tracker for Event Cameras","summary":" Event-based cameras are popular for tracking fast-moving objects due to their\nhigh temporal resolution, low latency, and high dynamic range. In this paper,\nwe propose a novel algorithm for tracking event blobs using raw events\nasynchronously in real time. We introduce the concept of an event blob as a\nspatio-temporal likelihood of event occurrence where the conditional spatial\nlikelihood is blob-like. Many real-world objects such as car headlights or any\nquickly moving foreground objects generate event blob data. The proposed\nalgorithm uses a nearest neighbour classifier with a dynamic threshold criteria\nfor data association coupled with an extended Kalman filter to track the event\nblob state. Our algorithm achieves highly accurate blob tracking, velocity\nestimation, and shape estimation even under challenging lighting conditions and\nhigh-speed motions (> 11000 pixels/s). The microsecond time resolution achieved\nmeans that the filter output can be used to derive secondary information such\nas time-to-contact or range estimation, that will enable applications to\nreal-world problems such as collision avoidance in autonomous driving.\n","authors":["Ziwei Wang","Timothy Molloy","Pieter van Goor","Robert Mahony"],"pdf_url":"https://arxiv.org/pdf/2307.10593v3.pdf","comment":"18 pages, 16 figures. The manuscript was accepted on August 7, 2024,\n by IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2312.02078v2","updated":"2024-09-04T00:06:20Z","published":"2023-12-04T17:41:52Z","title":"From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video\n Solution to Enhance Community Safety","summary":" This article adopts and evaluates an AI-enabled Smart Video Solution (SVS)\ndesigned to enhance safety in the real world. The system integrates with\nexisting infrastructure camera networks, leveraging recent advancements in AI\nfor easy adoption. 
To prioritize privacy and ethical standards, pose-based data\nis used for downstream AI tasks such as anomaly detection. A cloud-based\ninfrastructure and a mobile app are deployed, enabling real-time alerts within\ncommunities. The SVS employs innovative data representation and visualization\ntechniques, such as the Occupancy Indicator, Statistical Anomaly Detection,\nBird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance\npublic safety. Evaluation of the SVS demonstrates its capacity to convert\ncomplex computer vision outputs into actionable insights for stakeholders,\ncommunity partners, law enforcement, urban planners, and social scientists.\nThis article presents a comprehensive real-world deployment and evaluation of\nthe SVS, implemented in a community college environment across 16 cameras. The\nsystem integrates AI-driven visual processing, supported by statistical\nanalysis, database management, cloud communication, and user notifications.\nAdditionally, the article evaluates the end-to-end latency from the moment an\nAI algorithm detects anomalous behavior in real-time at the camera level to the\ntime stakeholders receive a notification. The results demonstrate the system's\nrobustness, effectively managing 16 CCTV cameras with a consistent throughput\nof 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end\nlatency of 26.76 seconds between anomaly detection and alert issuance.\n","authors":["Shanle Yao","Babak Rahimi Ardabili","Armin Danesh Pazho","Ghazal Alinezhad Noghre","Christopher Neff","Lauren Bourque","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2312.02078v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.02864v1","updated":"2024-09-04T16:43:14Z","published":"2024-09-04T16:43:14Z","title":"Bioinformatics Retrieval Augmentation Data (BRAD) Digital Assistant","summary":" We present a prototype for a Bioinformatics Retrieval Augmentation Data\n(BRAD) digital assistant. BRAD integrates a suite of tools to handle a wide\nrange of bioinformatics tasks, from code execution to online search. We\ndemonstrate BRAD's capabilities through (1) improved question answering\nwith retrieval-augmented generation (RAG), (2) BRAD's ability to run and write\ncomplex software pipelines, and (3) BRAD's ability to organize and distribute\ntasks across individual agents and teams of agents. We use BRAD for automation of\nbioinformatics workflows, performing tasks ranging from gene enrichment and\nsearching the archive to automatic code generation and running biomarker\nidentification pipelines. BRAD is a step toward the ultimate goal of developing a\ndigital twin of laboratories driven by self-contained loops for hypothesis\ngeneration and testing of digital biology experiments.\n","authors":["Joshua Pickard","Marc Andrew Choi","Natalie Oliven","Cooper Stansbury","Jillian Cwycyshyn","Nicholas Galioto","Alex Gorodetsky","Alvaro Velasquez","Indika Rajapakse"],"pdf_url":"https://arxiv.org/pdf/2409.02864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00847v2","updated":"2024-09-04T16:39:22Z","published":"2024-09-01T21:30:14Z","title":"The Design of an LLM-powered Unstructured Analytics System","summary":" LLMs demonstrate an uncanny ability to process unstructured data, and as\nsuch, have the potential to go beyond search and run complex, semantic analyses\nat scale. We describe the design of an unstructured analytics system, Aryn, and\nthe tenets and use cases that motivate its design. 
With Aryn, users can specify\nqueries in natural language and the system automatically determines a semantic\nplan and executes it to compute an answer from a large collection of\nunstructured documents using LLMs. At the core of Aryn is Sycamore, a\ndeclarative document processing engine, built using Ray, that provides a\nreliable distributed abstraction called DocSets. Sycamore allows users to\nanalyze, enrich, and transform complex documents at scale. Aryn also comprises\nLuna, a query planner that translates natural language queries to Sycamore\nscripts, and the Aryn Partitioner, which takes raw PDFs and document images,\nand converts them to DocSets for downstream processing. Using Aryn, we\ndemonstrate a real world use case for analyzing accident reports from the\nNational Transportation Safety Board (NTSB), and discuss some of the major\nchallenges we encountered in deploying Aryn in the wild.\n","authors":["Eric Anderson","Jonathan Fritz","Austin Lee","Bohou Li","Mark Lindblad","Henry Lindeman","Alex Meyer","Parth Parmar","Tanvi Ranade","Mehul A. Shah","Benjamin Sowell","Dan Tecuci","Vinayak Thapliyal","Matt Welsh"],"pdf_url":"https://arxiv.org/pdf/2409.00847v2.pdf","comment":"6 pages, 3 figures, fixed typos"},{"id":"http://arxiv.org/abs/2409.02856v1","updated":"2024-09-04T16:29:25Z","published":"2024-09-04T16:29:25Z","title":"Building a Scalable, Effective, and Steerable Search and Ranking\n Platform","summary":" Modern e-commerce platforms offer vast product selections, making it\ndifficult for customers to find items that they like and that are relevant to\ntheir current session intent. This is why it is key for e-commerce platforms to\nhave near real-time scalable and adaptable personalized ranking and search\nsystems. While numerous methods exist in the scientific literature for building\nsuch systems, many are unsuitable for large-scale industrial use due to\ncomplexity and performance limitations. Consequently, industrial ranking\nsystems often resort to computationally efficient yet simplistic retrieval or\ncandidate generation approaches, which overlook near real-time and\nheterogeneous customer signals, which results in a less personalized and\nrelevant experience. Moreover, related customer experiences are served by\ncompletely different systems, which increases complexity, maintenance, and\ninconsistent experiences.\n In this paper, we present a personalized, adaptable near real-time ranking\nplatform that is reusable across various use cases, such as browsing and\nsearch, and that is able to cater to millions of items and customers under\nheavy load (thousands of requests per second). We employ transformer-based\nmodels through different ranking layers which can learn complex behavior\npatterns directly from customer action sequences while being able to\nincorporate temporal (e.g. in-session) and contextual information. We validate\nour system through a series of comprehensive offline and online real-world\nexperiments at a large online e-commerce platform, and we demonstrate its\nsuperiority when compared to existing systems, both in terms of customer\nexperience as well as in net revenue. 
Finally, we share the lessons learned\nfrom building a comprehensive, modern ranking platform for use in a large-scale\ne-commerce environment.\n","authors":["Marjan Celikik","Jacek Wasilewski","Ana Peleteiro Ramallo","Alexey Kurennoy","Evgeny Labzin","Danilo Ascione","Tural Gurbanov","Géraud Le Falher","Andrii Dzhoha","Ian Harris"],"pdf_url":"https://arxiv.org/pdf/2409.02856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02727v1","updated":"2024-09-04T14:01:48Z","published":"2024-09-04T14:01:48Z","title":"Pooling And Attention: What Are Effective Designs For LLm-Based\n Embedding Models?","summary":" The significant advancements of Large Language Models (LLMs) in generative\ntasks have led to a growing body of work exploring LLM-based embedding models.\nWhile these models, employing different pooling and attention strategies, have\nachieved state-of-the-art performance on public embedding benchmarks, questions\nstill arise about what constitutes an effective design for LLM-based embedding\nmodels. However, these models are often trained on different datasets, using\ndifferent LLM base models or training settings. Moreover, evaluations on public\nembedding benchmarks often fail to report statistical significance, making it\ndifficult to determine which designs truly contribute to final performance.\nThis complicates the process for practitioners seeking optimal training recipes\nfor LLM-based embedding models. In this study, we conduct a large-scale\nexperiment by training a series of LLM-based embedding models using the same\ntraining data and base model but differing in their pooling and attention\nstrategies. The results show that there is no one-size-fits-all solution: while\nbidirectional attention and an additional trainable pooling layer outperform in\ntext similarity and information retrieval tasks, they do not significantly\nsurpass simpler designs like EOS-last token pooling and default causal\nattention in clustering and classification tasks. Furthermore, we propose a new\npooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs\nof all hidden layers, rather than just the last layer, using a cross-attention\nnetwork. This method proves to be statistically superior in text similarity and\nretrieval tasks compared to existing pooling methods. Overall, this paper sheds\nlight on effective training strategies for LLM-based embedding models.\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02727v1.pdf","comment":"https://github.com/yixuantt/PoolingAndAttn"},{"id":"http://arxiv.org/abs/2302.03883v2","updated":"2024-09-04T14:00:42Z","published":"2023-02-08T05:12:54Z","title":"Multimodal Recommender Systems: A Survey","summary":" The recommender system (RS) has been an integral toolkit of online services.\nThey are equipped with various deep learning techniques to model user\npreference based on identifier and attribute information. With the emergence of\nmultimedia services, such as short videos, news and etc., understanding these\ncontents while recommending becomes critical. Besides, multimodal features are\nalso helpful in alleviating the problem of data sparsity in RS. Thus,\nMultimodal Recommender System (MRS) has attracted much attention from both\nacademia and industry recently. In this paper, we will give a comprehensive\nsurvey of the MRS models, mainly from technical views. First, we conclude the\ngeneral procedures and major challenges for MRS. 
Then, we introduce the\nexisting MRS models according to four categories, i.e., Modality Encoder,\nFeature Interaction, Feature Enhancement and Model Optimization. Besides, to\nmake it convenient for those who want to research this field, we also summarize\nthe dataset and code resources. Finally, we discuss some promising future\ndirections of MRS and conclude this paper. To access more details of the\nsurveyed papers, such as implementation code, we open source a repository.\n","authors":["Qidong Liu","Jiaxi Hu","Yutian Xiao","Xiangyu Zhao","Jingtong Gao","Wanyu Wang","Qing Li","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2302.03883v2.pdf","comment":"accepted by CSUR"},{"id":"http://arxiv.org/abs/2409.00702v2","updated":"2024-09-04T13:19:42Z","published":"2024-09-01T12:11:48Z","title":"MARS: Matching Attribute-aware Representations for Text-based Sequential\n Recommendation","summary":" Sequential recommendation aims to predict the next item a user is likely to\nprefer based on their sequential interaction history. Recently, text-based\nsequential recommendation has emerged as a promising paradigm that uses\npre-trained language models to exploit textual item features to enhance\nperformance and facilitate knowledge transfer to unseen datasets. However,\nexisting text-based recommender models still struggle with two key challenges:\n(i) representing users and items with multiple attributes, and (ii) matching\nitems with complex user interests. To address these challenges, we propose a\nnovel model, Matching Attribute-aware Representations for Text-based Sequential\nRecommendation (MARS). MARS extracts detailed user and item representations\nthrough attribute-aware text encoding, capturing diverse user intents with\nmultiple attribute-aware representations. It then computes user-item scores via\nattribute-wise interaction matching, effectively capturing attribute-level user\npreferences. Our extensive experiments demonstrate that MARS significantly\noutperforms existing sequential models, achieving improvements of up to 24.43%\nand 29.26% in Recall@10 and NDCG@10 across five benchmark datasets. Code is\navailable at https://github.com/junieberry/MARS\n","authors":["Hyunsoo Kim","Junyoung Kim","Minjin Choi","Sunkyung Lee","Jongwuk Lee"],"pdf_url":"https://arxiv.org/pdf/2409.00702v2.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2409.02685v1","updated":"2024-09-04T13:16:55Z","published":"2024-09-04T13:16:55Z","title":"RouterRetriever: Exploring the Benefits of Routing over Multiple Expert\n Embedding Models","summary":" Information retrieval methods often rely on a single embedding model trained\non large, general-domain datasets like MSMARCO. While this approach can produce\na retriever with reasonable overall performance, models trained on\ndomain-specific data often yield better results within their respective\ndomains. While prior work in information retrieval has tackled this through\nmulti-task training, the topic of combining multiple domain-specific expert\nretrievers remains unexplored, despite its popularity in language model\ngeneration. In this work, we introduce RouterRetriever, a retrieval model that\nleverages multiple domain-specific experts along with a routing mechanism to\nselect the most appropriate expert for each query. It is lightweight and allows\neasy addition or removal of experts without additional training. 
Evaluation on\nthe BEIR benchmark demonstrates that RouterRetriever outperforms both\nMSMARCO-trained (+2.1 absolute nDCG@10) and multi-task trained (+3.2) models.\nThis is achieved by employing our routing mechanism, which surpasses other\nrouting techniques (+1.8 on average) commonly used in language modeling.\nFurthermore, the benefit generalizes well to other datasets, even in the\nabsence of a specific expert on the dataset. To our knowledge, RouterRetriever\nis the first work to demonstrate the advantages of using multiple\ndomain-specific expert embedding models with effective routing over a single,\ngeneral-purpose embedding model in retrieval tasks.\n","authors":["Hyunji Lee","Luca Soldaini","Arman Cohan","Minjoon Seo","Kyle Lo"],"pdf_url":"https://arxiv.org/pdf/2409.02685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09979v2","updated":"2024-09-04T12:33:24Z","published":"2024-06-14T12:41:07Z","title":"HIRO: Hierarchical Information Retrieval Optimization","summary":" Retrieval-Augmented Generation (RAG) has revolutionized natural language\nprocessing by dynamically integrating external knowledge into Large Language\nModels (LLMs), addressing their limitation of static training datasets. Recent\nimplementations of RAG leverage hierarchical data structures, which organize\ndocuments at various levels of summarization and information density. This\ncomplexity, however, can cause LLMs to \"choke\" on information overload,\nnecessitating more sophisticated querying mechanisms. In this context, we\nintroduce Hierarchical Information Retrieval Optimization (HIRO), a novel\nquerying approach that employs a Depth-First Search (DFS)-based recursive\nsimilarity score calculation and branch pruning. This method uniquely minimizes\nthe context delivered to the LLM without informational loss, effectively\nmanaging the challenge of excessive data. HIRO's refined approach is validated\nby a 10.85% improvement in performance on the NarrativeQA dataset.\n","authors":["Krish Goel","Mahek Chandak"],"pdf_url":"https://arxiv.org/pdf/2406.09979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07107v4","updated":"2024-09-04T11:39:56Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. 
Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zheng Liu","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v4.pdf","comment":"updated to version 3"},{"id":"http://arxiv.org/abs/2409.01137v2","updated":"2024-09-04T10:58:57Z","published":"2024-09-02T10:19:31Z","title":"Smart E-commerce Recommendations with Semantic AI","summary":" In e-commerce, web mining for page recommendations is widely used but often\nfails to meet user needs. To address this, we propose a novel solution\ncombining semantic web mining with BP neural networks. We process user search\nlogs to extract five key features: content priority, time spent, user feedback,\nrecommendation semantics, and input deviation. These features are then fed into\na BP neural network to classify and prioritize web pages. The prioritized pages\nare recommended to users. Using book sales pages for testing, we\ndemonstrate that this solution can quickly and accurately identify the pages\nusers need. Our approach ensures that recommendations are more relevant and\ntailored to individual preferences, enhancing the online shopping experience.\nBy leveraging advanced semantic analysis and neural network techniques, we\nbridge the gap between user expectations and actual recommendations. This\ninnovative method not only improves accuracy but also speeds up the\nrecommendation process, making it a valuable tool for e-commerce platforms\naiming to boost user satisfaction and engagement. Additionally, our system's\nability to handle large datasets and provide real-time recommendations makes it\na scalable and efficient solution for modern e-commerce challenges.\n","authors":["M. Badouch","M. Boutaounte"],"pdf_url":"https://arxiv.org/pdf/2409.01137v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2409.02599v1","updated":"2024-09-04T10:30:11Z","published":"2024-09-04T10:30:11Z","title":"A Fashion Item Recommendation Model in Hyperbolic Space","summary":" In this work, we propose a fashion item recommendation model that\nincorporates hyperbolic geometry into user and item representations. Using\nhyperbolic space, our model aims to capture implicit hierarchies among items\nbased on their visual data and users' purchase history. During training, we\napply a multi-task learning framework that considers both hyperbolic and\nEuclidean distances in the loss function. Our experiments on three data sets\nshow that our model performs better than previous models trained in Euclidean\nspace only, confirming the effectiveness of our model. 
Our ablation studies\nshow that multi-task learning plays a key role, and removing the Euclidean loss\nsubstantially degrades the model performance.\n","authors":["Ryotaro Shimizu","Yu Wang","Masanari Kimura","Yuki Hirakawa","Takashi Wada","Yuki Saito","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.02599v1.pdf","comment":"This work was presented at the CVFAD Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2409.02580v1","updated":"2024-09-04T10:03:09Z","published":"2024-09-04T10:03:09Z","title":"AlignGroup: Learning and Aligning Group Consensus with Member\n Preferences for Group Recommendation","summary":" Group activities are important behaviors in human society; providing\npersonalized recommendations for groups is referred to as the group\nrecommendation task. Existing methods can usually be categorized into two\nstrategies to infer group preferences: 1) determining group preferences by\naggregating members' personalized preferences, and 2) inferring group consensus\nby capturing group members' coherent decisions after common compromises.\nHowever, the former suffers from the lack of group-level considerations,\nand the latter overlooks the fine-grained preferences of individual users. To\nthis end, we propose a novel group recommendation method AlignGroup, which\nfocuses on both group consensus and individual preferences of group members to\ninfer the group decision-making. Specifically, AlignGroup explores group\nconsensus through a well-designed hypergraph neural network that efficiently\nlearns intra- and inter-group relationships. Moreover, AlignGroup innovatively\nutilizes a self-supervised alignment task to capture fine-grained group\ndecision-making by aligning the group consensus with members' common\npreferences. Extensive experiments on two real-world datasets validate that our\nAlignGroup outperforms the state-of-the-art on both the group recommendation\ntask and the user recommendation task, while also being more efficient than\nmost baselines.\n","authors":["Jinfeng Xu","Zheyu Chen","Jinze Li","Shuo Yang","Hewei Wang","Edith C. -H. Ngai"],"pdf_url":"https://arxiv.org/pdf/2409.02580v1.pdf","comment":"10 pages, accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2409.02571v1","updated":"2024-09-04T09:41:52Z","published":"2024-09-04T09:41:52Z","title":"iRangeGraph: Improvising Range-dedicated Graphs for Range-filtering\n Nearest Neighbor Search","summary":" Range-filtering approximate nearest neighbor (RFANN) search is attracting\nincreasing attention in academia and industry. Given a set of data objects,\neach being a pair of a high-dimensional vector and a numeric value, an RFANN\nquery with a vector and a numeric range as parameters returns the data object\nwhose numeric value is in the query range and whose vector is nearest to the\nquery vector. To process this query, a recent study proposes to build $O(n^2)$\ndedicated graph-based indexes for all possible query ranges to enable efficient\nprocessing on a database of $n$ objects. As storing all these indexes is\nprohibitively expensive, the study constructs compressed indexes instead, which\nreduces the memory consumption considerably. However, this incurs suboptimal\nperformance because the compression is lossy. In this study, instead of\nmaterializing a compressed index for every possible query range in preparation\nfor querying, we materialize graph-based indexes, called elemental graphs, for\na moderate number of ranges. 
We then provide an effective and efficient\nalgorithm that during querying can construct an index for any query range using\nthe elemental graphs. We prove that the time needed to construct such an index\nis low. We also cover an experimental study on real-world datasets that\nprovides evidence that the materialized elemental graphs only consume moderate\nspace and that the proposed method is capable of superior and stable query\nperformance across different query workloads.\n","authors":["Yuexuan Xu","Jianyang Gao","Yutong Gou","Cheng Long","Christian S. Jensen"],"pdf_url":"https://arxiv.org/pdf/2409.02571v1.pdf","comment":"The paper has been accepted by SIGMOD 2025"},{"id":"http://arxiv.org/abs/2404.06900v3","updated":"2024-09-04T06:55:21Z","published":"2024-04-10T10:45:30Z","title":"NFARec: A Negative Feedback-Aware Recommender Model","summary":" Graph neural network (GNN)-based models have been extensively studied for\nrecommendations, as they can extract high-order collaborative signals\naccurately which is required for high-quality recommender systems. However,\nthey neglect the valuable information gained through negative feedback in two\naspects: (1) different users might hold opposite feedback on the same item,\nwhich hampers optimal information propagation in GNNs, and (2) even when an\nitem vastly deviates from users' preferences, they might still choose it and\nprovide a negative rating. In this paper, we propose a negative feedback-aware\nrecommender model (NFARec) that maximizes the leverage of negative feedback. To\ntransfer information to multi-hop neighbors along an optimal path effectively,\nNFARec adopts a feedback-aware correlation that guides hypergraph convolutions\n(HGCs) to learn users' structural representations. Moreover, NFARec\nincorporates an auxiliary task - predicting the feedback sentiment polarity\n(i.e., positive or negative) of the next interaction - based on the Transformer\nHawkes Process. The task is beneficial for understanding users by learning the\nsentiment expressed in their previous sequential feedback patterns and\npredicting future interactions. Extensive experiments demonstrate that NFARec\noutperforms competitive baselines. Our source code and data are released at\nhttps://github.com/WangXFng/NFARec.\n","authors":["Xinfeng Wang","Fumiyo Fukumoto","Jin Cui","Yoshimi Suzuki","Dongjin Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06900v3.pdf","comment":"Accepted to SIGIR 2024"},{"id":"http://arxiv.org/abs/2404.06895v3","updated":"2024-09-04T06:51:55Z","published":"2024-04-10T10:38:24Z","title":"CaDRec: Contextualized and Debiased Recommender Model","summary":" Recommender models aimed at mining users' behavioral patterns have raised\ngreat attention as one of the essential applications in daily life. Recent work\non graph neural networks (GNNs) or debiasing methods has attained remarkable\ngains. However, they still suffer from (1) over-smoothing node embeddings\ncaused by recursive convolutions with GNNs, and (2) the skewed distribution of\ninteractions due to popularity and user-individual biases. This paper proposes\na contextualized and debiased recommender model (CaDRec). To overcome the\nover-smoothing issue, we explore a novel hypergraph convolution operator that\ncan select effective neighbors during convolution by introducing both\nstructural context and sequential context. 
To tackle the skewed distribution,\nwe propose two strategies for disentangling interactions: (1) modeling\nindividual biases to learn unbiased item embeddings, and (2) incorporating item\npopularity with positional encoding. Moreover, we mathematically show that the\nimbalance of the gradients to update item embeddings exacerbates the popularity\nbias, thus adopting regularization and weighting schemes as solutions.\nExtensive experiments on four datasets demonstrate the superiority of the\nCaDRec against state-of-the-art (SOTA) methods. Our source code and data are\nreleased at https://github.com/WangXFng/CaDRec.\n","authors":["Xinfeng Wang","Fumiyo Fukumoto","Jin Cui","Yoshimi Suzuki","Jiyi Li","Dongjin Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06895v3.pdf","comment":"Accepted to SIGIR 2024"},{"id":"http://arxiv.org/abs/2408.15796v2","updated":"2024-09-04T06:36:22Z","published":"2024-08-28T13:42:28Z","title":"Evaluating Named Entity Recognition Using Few-Shot Prompting with Large\n Language Models","summary":" This paper evaluates Few-Shot Prompting with Large Language Models for Named\nEntity Recognition (NER). Traditional NER systems rely on extensive labeled\ndatasets, which are costly and time-consuming to obtain. Few-Shot Prompting or\nin-context learning enables models to recognize entities with minimal examples.\nWe assess state-of-the-art models like GPT-4 in NER tasks, comparing their\nfew-shot performance to fully supervised benchmarks. Results show that while\nthere is a performance gap, large models excel in adapting to new entity types\nand domains with very limited data. We also explore the effects of prompt\nengineering, guided output format and context length on performance. This study\nunderscores Few-Shot Learning's potential to reduce the need for large labeled\ndatasets, enhancing NER scalability and accessibility.\n","authors":["Hédi Zeghidi","Ludovic Moncla"],"pdf_url":"https://arxiv.org/pdf/2408.15796v2.pdf","comment":"Github repo: https://github.com/GEODE-project/ner-llm"},{"id":"http://arxiv.org/abs/2409.02455v1","updated":"2024-09-04T05:36:00Z","published":"2024-09-04T05:36:00Z","title":"An Effective Tag Assignment Approach for Billboard Advertisement","summary":" Billboard Advertisement has gained popularity due to its significant outrage\nin return on investment. To make this advertisement approach more effective,\nthe relevant information about the product needs to be reached to the relevant\nset of people. This can be achieved if the relevant set of tags can be mapped\nto the correct slots. Formally, we call this problem the Tag Assignment Problem\nin Billboard Advertisement. Given trajectory, billboard database, and a set of\nselected billboard slots and tags, this problem asks to output a mapping of\nselected tags to the selected slots so that the influence is maximized. We\nmodel this as a variant of traditional bipartite matching called One-To-Many\nBipartite Matching (OMBM). Unlike traditional bipartite matching, a tag can be\nassigned to only one slot; in the OMBM, a tag can be assigned to multiple slots\nwhile the vice versa can not happen. We propose an iterative solution approach\nthat incrementally allocates the tags to the slots. The proposed methodology\nhas been explained with an illustrated example. A complexity analysis of the\nproposed solution approach has also been conducted. 
The experimental results on\nreal-world trajectory and billboard datasets prove our claim on the\neffectiveness and efficiency of the proposed solution.\n","authors":["Dildar Ali","Harishchandra Kumar","Suman Banerjee","Yamuna Prasad"],"pdf_url":"https://arxiv.org/pdf/2409.02455v1.pdf","comment":"This Paper has been accepted at The 25th International Web\n Information Systems Engineering Conference (WISE-2024)"},{"id":"http://arxiv.org/abs/2408.16672v3","updated":"2024-09-04T05:09:00Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. In\nthis paper, we introduce a novel architecture and a training framework to\nsupport long context window and multilingual retrieval. Our new model,\nJina-ColBERT-v2, demonstrates strong performance across a range of English and\nmultilingual retrieval tasks,\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Georgios Mastrapas","Saba Sturua","Isabelle Mohr","Andreas Koukounas","Mohammad Kalim Akram","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v3.pdf","comment":"8 pages, references at pp7,8; EMNLP workshop submission"},{"id":"http://arxiv.org/abs/2409.02425v1","updated":"2024-09-04T04:12:22Z","published":"2024-09-04T04:12:22Z","title":"Deep Adaptive Interest Network: Personalized Recommendation with\n Context-Aware Learning","summary":" In personalized recommendation systems, accurately capturing users' evolving\ninterests and combining them with contextual information is a critical research\narea. This paper proposes a novel model called the Deep Adaptive Interest\nNetwork (DAIN), which dynamically models users' interests while incorporating\ncontext-aware learning mechanisms to achieve precise and adaptive personalized\nrecommendations. DAIN leverages deep learning techniques to build an adaptive\ninterest network structure that can capture users' interest changes in\nreal-time while further optimizing recommendation results by integrating\ncontextual information. Experiments conducted on several public datasets\ndemonstrate that DAIN excels in both recommendation performance and\ncomputational efficiency. This research not only provides a new solution for\npersonalized recommendation systems but also offers fresh insights into the\napplication of context-aware learning in recommendation systems.\n","authors":["Shuaishuai Huang","Haowei Yang","You Yao","Xueting Lin","Yuming Tu"],"pdf_url":"https://arxiv.org/pdf/2409.02425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02343v1","updated":"2024-09-04T00:10:36Z","published":"2024-09-04T00:10:36Z","title":"NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for\n Retrieval","summary":" $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval)\nfrom pre-trained embedding models is the predominant retrieval method for text\nand images, as well as Retrieval-Augmented Generation (RAG) pipelines. In\npractice, application developers often fine-tune the embeddings to improve\ntheir accuracy on the dataset and query workload in hand. 
Existing approaches\neither fine-tune the pre-trained model itself or, more efficiently, but at the\ncost of accuracy, train adaptor models to transform the output of the\npre-trained model. We present NUDGE, a family of novel non-parametric embedding\nfine-tuning approaches that are significantly more accurate and efficient than\nboth sets of existing approaches. NUDGE directly modifies the embeddings of\ndata records to maximize the accuracy of $k$-NN retrieval. We present a\nthorough theoretical and experimental study of NUDGE's non-parametric approach.\nWe show that even though the underlying problem is NP-Hard, constrained\nvariations can be solved efficiently. These constraints additionally ensure\nthat the changes to the embeddings are modest, avoiding large distortions to\nthe semantics learned during pre-training. In experiments across five\npre-trained models and nine standard text and image retrieval datasets, NUDGE\nruns in minutes and often improves NDCG@10 by more than 10% over existing\nfine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase\nin accuracy and runs 200x and 3x faster, respectively, over fine-tuning the\npre-trained model and training adaptors.\n","authors":["Sepanta Zeighami","Zac Wellmer","Aditya Parameswaran"],"pdf_url":"https://arxiv.org/pdf/2409.02343v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.13989v3","updated":"2024-09-04T17:52:37Z","published":"2024-07-19T02:34:10Z","title":"Enhancing Graph Neural Networks with Limited Labeled Data by Actively\n Distilling Knowledge from Large Language Models","summary":" Graphs are pervasive in the real-world, such as social network analysis,\nbioinformatics, and knowledge graphs. Graph neural networks (GNNs) have great\nability in node classification, a fundamental task on graphs. Unfortunately,\nconventional GNNs still face challenges in scenarios with few labeled nodes,\ndespite the prevalence of few-shot node classification tasks in real-world\napplications. To address this challenge, various approaches have been proposed,\nincluding graph meta-learning, transfer learning, and methods based on Large\nLanguage Models (LLMs). However, traditional meta-learning and transfer\nlearning methods often require prior knowledge from base classes or fail to\nexploit the potential advantages of unlabeled nodes. Meanwhile, LLM-based\nmethods may overlook the zero-shot capabilities of LLMs and rely heavily on the\nquality of generated contexts. In this paper, we propose a novel approach that\nintegrates LLMs and GNNs, leveraging the zero-shot inference and reasoning\ncapabilities of LLMs and employing a Graph-LLM-based active learning paradigm\nto enhance GNNs' performance. Extensive experiments demonstrate the\neffectiveness of our model in improving node classification accuracy with\nconsiderably limited labeled data, surpassing state-of-the-art baselines by\nsignificant margins.\n","authors":["Quan Li","Tianxiang Zhao","Lingwei Chen","Junjie Xu","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.13989v3.pdf","comment":"10 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2407.02461v5","updated":"2024-09-04T17:48:46Z","published":"2024-07-02T17:40:06Z","title":"Decentralized Intelligence Network (DIN)","summary":" Decentralized Intelligence Network (DIN) is a theoretical framework designed\nto address challenges in AI development, particularly focusing on data\nfragmentation and siloing issues. 
It facilitates effective AI training within\nsovereign data networks by overcoming barriers to accessing diverse data\nsources, leveraging: 1) personal data stores to ensure data sovereignty, where\ndata remains securely within Participants' control; 2) a scalable federated\nlearning protocol implemented on a public blockchain for decentralized AI\ntraining, where only model parameter updates are shared, keeping data within\nthe personal data stores; and 3) a scalable, trustless cryptographic rewards\nmechanism on a public blockchain to incentivize participation and ensure fair\nreward distribution through a decentralized auditing protocol. This approach\nguarantees that no entity can prevent or control access to training data or\ninfluence financial benefits, as coordination and reward distribution are\nmanaged on the public blockchain with an immutable record. The framework\nsupports effective AI training by allowing Participants to maintain control\nover their data, benefit financially, and contribute to a decentralized,\nscalable ecosystem that leverages collective AI to develop beneficial\nalgorithms.\n","authors":["Abraham Nash"],"pdf_url":"https://arxiv.org/pdf/2407.02461v5.pdf","comment":"16 pages, 1 figure. DIN was presented by the author as a speaker at\n the Summit on Responsible Decentralized Intelligence - Future of\n Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the\n Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC"},{"id":"http://arxiv.org/abs/2409.02908v1","updated":"2024-09-04T17:48:19Z","published":"2024-09-04T17:48:19Z","title":"Masked Diffusion Models are Secretly Time-Agnostic Masked Models and\n Exploit Inaccurate Categorical Sampling","summary":" Masked diffusion models (MDMs) have emerged as a popular research topic for\ngenerative modeling of discrete data, thanks to their superior performance over\nother discrete diffusion models, and are rivaling the auto-regressive models\n(ARMs) for language modeling tasks. The recent effort in simplifying the masked\ndiffusion framework further leads to alignment with continuous-space diffusion\nmodels and more principled training and sampling recipes. In this paper,\nhowever, we reveal that both training and sampling of MDMs are theoretically\nfree from the time variable, arguably the key signature of diffusion models,\nand are instead equivalent to masked models. The connection on the sampling\naspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we\nshow that the FHS is theoretically equivalent to MDMs' original generation\nprocess while significantly alleviating the time-consuming categorical sampling\nand achieving a 20$\\times$ speedup. In addition, our investigation challenges\nprevious claims that MDMs can surpass ARMs in generative perplexity. We\nidentify, for the first time, an underlying numerical issue, even with the\n32-bit floating-point precision, which results in inaccurate categorical\nsampling. 
We show that the numerical issue lowers the effective temperature\nboth theoretically and empirically, leading to unfair assessments of MDMs'\ngeneration results in the previous literature.\n","authors":["Kaiwen Zheng","Yongxin Chen","Hanzi Mao","Ming-Yu Liu","Jun Zhu","Qinsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02908v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2402.11126v2","updated":"2024-09-04T17:46:07Z","published":"2024-02-16T23:21:40Z","title":"Kolmogorov n-Widths for Multitask Physics-Informed Machine Learning\n (PIML) Methods: Towards Robust Metrics","summary":" Physics-informed machine learning (PIML) as a means of solving partial\ndifferential equations (PDE) has garnered much attention in the Computational\nScience and Engineering (CS&E) world. This topic encompasses a broad array of\nmethods and models aimed at solving a single or a collection of PDE problems,\ncalled multitask learning. PIML is characterized by the incorporation of\nphysical laws into the training process of machine learning models in lieu of\nlarge data when solving PDE problems. Despite the overall success of this\ncollection of methods, it remains incredibly difficult to analyze, benchmark,\nand generally compare one approach to another. Using Kolmogorov n-widths as a\nmeasure of effectiveness of approximating functions, we judiciously apply this\nmetric in the comparison of various multitask PIML architectures. We compute\nlower accuracy bounds and analyze the model's learned basis functions on\nvarious PDE problems. This is the first objective metric for comparing\nmultitask PIML architectures and helps remove uncertainty in model validation\nfrom selective sampling and overfitting. We also identify avenues of\nimprovement for model architectures, such as the choice of activation function,\nwhich can drastically affect model generalization to \"worst-case\" scenarios,\nwhich is not observed when reporting task-specific errors. We also incorporate\nthis metric into the optimization process through regularization, which\nimproves the models' generalizability over the multitask PDE problem.\n","authors":["Michael Penwarden","Houman Owhadi","Robert M. Kirby"],"pdf_url":"https://arxiv.org/pdf/2402.11126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07703v2","updated":"2024-09-04T17:45:51Z","published":"2022-10-14T10:54:11Z","title":"Hybrid Decentralized Optimization: Leveraging Both First- and\n Zeroth-Order Optimizers for Faster Convergence","summary":" Distributed optimization is the standard way of speeding up machine learning\ntraining, and most of the research in the area focuses on distributed\nfirst-order, gradient-based methods. Yet, there are settings where some\ncomputationally-bounded nodes may not be able to implement first-order,\ngradient-based optimization, while they could still contribute to joint\noptimization tasks. In this paper, we initiate the study of hybrid\ndecentralized optimization, studying settings where nodes with zeroth-order and\nfirst-order optimization capabilities co-exist in a distributed system, and\nattempt to jointly solve an optimization task over some data distribution. We\nessentially show that, under reasonable parameter settings, such a system can\nnot only withstand noisier zeroth-order agents but can even benefit from\nintegrating such agents into the optimization process, rather than ignoring\ntheir information. 
At the core of our approach is a new analysis of distributed\noptimization with noisy and possibly-biased gradient estimators, which may be\nof independent interest. Our results hold for both convex and non-convex\nobjectives. Experimental results on standard optimization tasks confirm our\nanalysis, showing that hybrid first-zeroth order optimization can be practical,\neven when training deep neural networks.\n","authors":["Matin Ansaripour","Shayan Talaei","Giorgi Nadiradze","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2210.07703v2.pdf","comment":"Shayan Talaei and Matin Ansaripour contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02901v1","updated":"2024-09-04T17:44:52Z","published":"2024-09-04T17:44:52Z","title":"Topological Methods in Machine Learning: A Tutorial for Practitioners","summary":" Topological Machine Learning (TML) is an emerging field that leverages\ntechniques from algebraic topology to analyze complex data structures in ways\nthat traditional machine learning methods may not capture. This tutorial\nprovides a comprehensive introduction to two key TML techniques, persistent\nhomology and the Mapper algorithm, with an emphasis on practical applications.\nPersistent homology captures multi-scale topological features such as clusters,\nloops, and voids, while the Mapper algorithm creates an interpretable graph\nsummarizing high-dimensional data. To enhance accessibility, we adopt a\ndata-centric approach, enabling readers to gain hands-on experience applying\nthese techniques to relevant tasks. We provide step-by-step explanations,\nimplementations, hands-on examples, and case studies to demonstrate how these\ntools can be applied to real-world problems. The goal is to equip researchers\nand practitioners with the knowledge and resources to incorporate TML into\ntheir work, revealing insights often hidden from conventional machine learning\nmethods. The tutorial code is available at\nhttps://github.com/cakcora/TopologyForML\n","authors":["Baris Coskunuzer","Cüneyt Gürcan Akçora"],"pdf_url":"https://arxiv.org/pdf/2409.02901v1.pdf","comment":"54 pages, 35 figures"},{"id":"http://arxiv.org/abs/2409.02891v1","updated":"2024-09-04T17:31:20Z","published":"2024-09-04T17:31:20Z","title":"Regional data-driven weather modeling with a global stretched-grid","summary":" A data-driven model (DDM) suitable for regional weather forecasting\napplications is presented. The model extends the Artificial Intelligence\nForecasting System by introducing a stretched-grid architecture that dedicates\nhigher resolution over a regional area of interest and maintains a lower\nresolution elsewhere on the globe. The model is based on graph neural networks,\nwhich naturally affords arbitrary multi-resolution grid configurations.\n The model is applied to short-range weather prediction for the Nordics,\nproducing forecasts at 2.5 km spatial and 6 h temporal resolution. The model is\npre-trained on 43 years of global ERA5 data at 31 km resolution and is further\nrefined using 3.3 years of 2.5 km resolution operational analyses from the\nMetCoOp Ensemble Prediction System (MEPS). The performance of the model is\nevaluated using surface observations from measurement stations across Norway\nand is compared to short-range weather forecasts from MEPS. The DDM outperforms\nboth the control run and the ensemble mean of MEPS for 2 m temperature. 
The\nmodel also produces competitive precipitation and wind speed forecasts, but is\nshown to underestimate extreme events.\n","authors":["Thomas Nils Nipen","Håvard Homleid Haugen","Magnus Sikora Ingstad","Even Marius Nordhagen","Aram Farhad Shafiq Salihi","Paulina Tedesco","Ivar Ambjørn Seierstad","Jørn Kristiansen","Simon Lang","Mihai Alexe","Jesper Dramsch","Baudouin Raoult","Gert Mertes","Matthew Chantry"],"pdf_url":"https://arxiv.org/pdf/2409.02891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18322v2","updated":"2024-09-04T17:16:05Z","published":"2024-07-01T19:52:41Z","title":"The Need for Guardrails with Large Language Models in Medical\n Safety-Critical Settings: An Artificial Intelligence Application in the\n Pharmacovigilance Ecosystem","summary":" Large language models (LLMs) are useful tools with the capacity for\nperforming specific types of knowledge work at an effective scale. However, LLM\ndeployments in high-risk and safety-critical domains pose unique challenges,\nnotably the issue of ``hallucination,'' where LLMs can generate fabricated\ninformation. This is particularly concerning in settings such as drug safety,\nwhere inaccuracies could lead to patient harm. To mitigate these risks, we have\ndeveloped and demonstrated a proof of concept suite of guardrails specifically\ndesigned to mitigate certain types of hallucinations and errors for drug\nsafety, and potentially applicable to other medical safety-critical contexts.\nThese guardrails include mechanisms to detect anomalous documents to prevent\nthe ingestion of inappropriate data, identify incorrect drug names or adverse\nevent terms, and convey uncertainty in generated content. We integrated these\nguardrails with an LLM fine-tuned for a text-to-text task, which involves\nconverting both structured and unstructured data within adverse event reports\ninto natural language. This method was applied to translate individual case\nsafety reports, demonstrating effective application in a pharmacovigilance\nprocessing task. Our guardrail framework offers a set of tools with broad\napplicability across various domains, ensuring LLMs can be safely used in\nhigh-risk situations by eliminating the occurrence of key errors, including the\ngeneration of incorrect pharmacovigilance-related terms, thus adhering to\nstringent regulatory and quality standards in medical safety-critical\nenvironments.\n","authors":["Joe B Hakim","Jeffery L Painter","Darmendra Ramcharran","Vijay Kara","Greg Powell","Paulina Sobczak","Chiho Sato","Andrew Bate","Andrew Beam"],"pdf_url":"https://arxiv.org/pdf/2407.18322v2.pdf","comment":"27 pages, 6 figures, 4 tables and supplementary material provided"},{"id":"http://arxiv.org/abs/2409.02882v1","updated":"2024-09-04T17:07:46Z","published":"2024-09-04T17:07:46Z","title":"Benchmarking Spurious Bias in Few-Shot Image Classifiers","summary":" Few-shot image classifiers are designed to recognize and classify new data\nwith minimal supervision and limited data but often show reliance on spurious\ncorrelations between classes and spurious attributes, known as spurious bias.\nSpurious correlations commonly hold in certain samples and few-shot classifiers\ncan suffer from spurious bias induced from them. There is an absence of an\nautomatic benchmarking system to assess the robustness of few-shot classifiers\nagainst spurious bias. 
In this paper, we propose a systematic and rigorous\nbenchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied\ndegrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates\nfew-shot evaluation tasks with biased attributes so that using them for\npredictions can demonstrate poor performance. To construct these tasks, we\npropose attribute-based sample selection strategies based on a pre-trained\nvision-language model, eliminating the need for manual dataset curation. This\nallows FewSTAB to automatically benchmark spurious bias using any existing test\ndata. FewSTAB offers evaluation results in a new dimension along with a new\ndesign guideline for building robust classifiers. Moreover, it can benchmark\nspurious bias in varied degrees and enable designs for varied degrees of\nrobustness. Its effectiveness is demonstrated through experiments on ten\nfew-shot learning methods across three datasets. We hope our framework can\ninspire new designs of robust few-shot classifiers. Our code is available at\nhttps://github.com/gtzheng/FewSTAB.\n","authors":["Guangtao Zheng","Wenqian Ye","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02882v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2312.14249v2","updated":"2024-09-04T17:04:40Z","published":"2023-12-21T19:06:34Z","title":"GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for\n High-Throughput Omics Data Analysis and Visualization","summary":" The surge in high-throughput omics data has reshaped the landscape of\nbiological research, underlining the need for powerful, user-friendly data\nanalysis and interpretation tools. This paper presents GenoCraft, a web-based\ncomprehensive software solution designed to handle the entire pipeline of omics\ndata processing. GenoCraft offers a unified platform featuring advanced\nbioinformatics tools, covering all aspects of omics data analysis. It\nencompasses a range of functionalities, such as normalization, quality control,\ndifferential analysis, network analysis, pathway analysis, and diverse\nvisualization techniques. This software makes state-of-the-art omics data\nanalysis more accessible to a wider range of users. With GenoCraft, researchers\nand data scientists have access to an array of cutting-edge bioinformatics\ntools under a user-friendly interface, making it a valuable resource for\nmanaging and analyzing large-scale omics data. The API with an interactive web\ninterface is publicly available at https://genocraft.stanford.edu/. We also\nrelease all the codes in https://github.com/futianfan/GenoCraft.\n","authors":["Yingzhou Lu","Minjie Shen","Ling Yue","Chenhao Li","Fan Meng","Xiao Wang","David Herrington","Yue Wang","Yue Zhao","Tianfan Fu","Capucine Van Rechem"],"pdf_url":"https://arxiv.org/pdf/2312.14249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02877v1","updated":"2024-09-04T17:01:02Z","published":"2024-09-04T17:01:02Z","title":"Configurable Foundation Models: Building LLMs from a Modular Perspective","summary":" Advancements in LLMs have recently unveiled challenges tied to computational\nefficiency and continual scalability due to their requirements of huge\nparameters, making the applications and evolution of these models on devices\nwith limited computation resources and scenarios requiring various abilities\nincreasingly cumbersome. 
Inspired by modularity within the human brain, there\nis a growing tendency to decompose LLMs into numerous functional modules,\nallowing for inference with part of modules and dynamic assembly of modules to\ntackle complex tasks, such as mixture-of-experts. To highlight the inherent\nefficiency and composability of the modular approach, we coin the term brick to\nrepresent each functional module, designating the modularized structure as\nconfigurable foundation models. In this paper, we offer a comprehensive\noverview and investigation of the construction, utilization, and limitation of\nconfigurable foundation models. We first formalize modules into emergent bricks\n- functional neuron partitions that emerge during the pre-training phase, and\ncustomized bricks - bricks constructed via additional post-training to improve\nthe capabilities and knowledge of LLMs. Based on diverse functional bricks, we\nfurther present four brick-oriented operations: retrieval and routing, merging,\nupdating, and growing. These operations allow for dynamic configuration of LLMs\nbased on instructions to handle complex tasks. To verify our perspective, we\nconduct an empirical analysis on widely-used LLMs. We find that the FFN layers\nfollow modular patterns with functional specialization of neurons and\nfunctional neuron partitions. Finally, we highlight several open issues and\ndirections for future research. Overall, this paper aims to offer a fresh\nmodular perspective on existing LLM research and inspire the future creation of\nmore efficient and scalable foundational models.\n","authors":["Chaojun Xiao","Zhengyan Zhang","Chenyang Song","Dazhi Jiang","Feng Yao","Xu Han","Xiaozhi Wang","Shuo Wang","Yufei Huang","Guanyu Lin","Yingfa Chen","Weilin Zhao","Yuge Tu","Zexuan Zhong","Ao Zhang","Chenglei Si","Khai Hao Moo","Chenyang Zhao","Huimin Chen","Yankai Lin","Zhiyuan Liu","Jingbo Shang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2409.02877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17293v4","updated":"2024-09-04T16:59:27Z","published":"2023-12-28T13:59:43Z","title":"$μ$GUIDE: a framework for quantitative imaging via generalized\n uncertainty-driven inference using deep learning","summary":" This work proposes $\\mu$GUIDE: a general Bayesian framework to estimate\nposterior distributions of tissue microstructure parameters from any given\nbiophysical model or MRI signal representation, with exemplar demonstration in\ndiffusion-weighted MRI. 
Harnessing a new deep learning architecture for\nautomatic signal feature selection combined with simulation-based inference and\nefficient sampling of the posterior distributions, $\\mu$GUIDE bypasses the high\ncomputational and time cost of conventional Bayesian approaches and does not\nrely on acquisition constraints to define model-specific summary statistics.\nThe obtained posterior distributions allow to highlight degeneracies present in\nthe model definition and quantify the uncertainty and ambiguity of the\nestimated parameters.\n","authors":["Maëliss Jallais","Marco Palombo"],"pdf_url":"https://arxiv.org/pdf/2312.17293v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02871v1","updated":"2024-09-04T16:54:31Z","published":"2024-09-04T16:54:31Z","title":"Hybrid Imitation-Learning Motion Planner for Urban Driving","summary":" With the release of open source datasets such as nuPlan and Argoverse, the\nresearch around learning-based planners has spread a lot in the last years.\nExisting systems have shown excellent capabilities in imitating the human\ndriver behaviour, but they struggle to guarantee safe closed-loop driving.\nConversely, optimization-based planners offer greater security in short-term\nplanning scenarios. To confront this challenge, in this paper we propose a\nnovel hybrid motion planner that integrates both learning-based and\noptimization-based techniques. Initially, a multilayer perceptron (MLP)\ngenerates a human-like trajectory, which is then refined by an\noptimization-based component. This component not only minimizes tracking errors\nbut also computes a trajectory that is both kinematically feasible and\ncollision-free with obstacles and road boundaries. Our model effectively\nbalances safety and human-likeness, mitigating the trade-off inherent in these\nobjectives. We validate our approach through simulation experiments and further\ndemonstrate its efficacy by deploying it in real-world self-driving vehicles.\n","authors":["Cristian Gariboldi","Matteo Corno","Beng Jin"],"pdf_url":"https://arxiv.org/pdf/2409.02871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02869v1","updated":"2024-09-04T16:53:46Z","published":"2024-09-04T16:53:46Z","title":"Look Into the LITE in Deep Learning for Time Series Classification","summary":" Deep learning models have been shown to be a powerful solution for Time\nSeries Classification (TSC). State-of-the-art architectures, while producing\npromising results on the UCR and the UEA archives, present a high number of\ntrainable parameters. This can lead to long training with high CO2 emission,\npower consumption and possible increase in the number of FLoating-point\nOperation Per Second (FLOPS). In this paper, we present a new architecture for\nTSC, the Light Inception with boosTing tEchnique (LITE) with only 2.34% of the\nnumber of parameters of the state-of-the-art InceptionTime model, while\npreserving performance. This architecture, with only 9,814 trainable\nparameters due to the usage of DepthWise Separable Convolutions (DWSC), is\nboosted by three techniques: multiplexing, custom filters, and dilated\nconvolution. The LITE architecture, trained on the UCR, is 2.78 times faster\nthan InceptionTime and consumes 2.79 times less CO2 and power. To evaluate the\nperformance of the proposed architecture on multivariate time series data, we\nadapt LITE to handle multivariate time series, we call this version LITEMV. 
To\nbring theory into application, we also conducted experiments using LITEMV on\nmultivariate time series representing human rehabilitation movements, showing\nthat LITEMV not only is the most efficient model but also the best performing\nfor this application on the Kimore dataset, a skeleton based human\nrehabilitation exercises dataset. Moreover, to address the interpretability of\nLITEMV, we present a study using Class Activation Maps to understand the\nclassification decision taken by the model during evaluation.\n","authors":["Ali Ismail-Fawaz","Maxime Devanne","Stefano Berretti","Jonathan Weber","Germain Forestier"],"pdf_url":"https://arxiv.org/pdf/2409.02869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08705v3","updated":"2024-09-04T16:44:57Z","published":"2023-08-16T23:42:03Z","title":"Partially Observable Multi-Agent Reinforcement Learning with Information\n Sharing","summary":" We study provable multi-agent reinforcement learning (RL) in the general\nframework of partially observable stochastic games (POSGs). To circumvent the\nknown hardness results and the use of computationally intractable oracles, we\nadvocate leveraging the potential \\emph{information-sharing} among agents, a\ncommon practice in empirical multi-agent RL, and a standard model for\nmulti-agent control systems with communications. We first establish several\ncomputational complexity results to justify the necessity of\ninformation-sharing, as well as the observability assumption that has enabled\nquasi-efficient single-agent RL with partial observations, for efficiently\nsolving POSGs. {Inspired by the inefficiency of planning in the ground-truth\nmodel,} we then propose to further \\emph{approximate} the shared common\ninformation to construct an {approximate model} of the POSG, in which planning\nan approximate \\emph{equilibrium} (in terms of solving the original POSG) can\nbe quasi-efficient, i.e., of quasi-polynomial-time, under the aforementioned\nassumptions. Furthermore, we develop a partially observable multi-agent RL\nalgorithm that is \\emph{both} statistically and computationally\nquasi-efficient. {Finally, beyond equilibrium learning, we extend our\nalgorithmic framework to finding the \\emph{team-optimal solution} in\ncooperative POSGs, i.e., decentralized partially observable Markov decision\nprocesses, a much more challenging goal. We establish concrete computational\nand sample complexities under several common structural assumptions of the\nmodel.} We hope our study could open up the possibilities of leveraging and\neven designing different \\emph{information structures}, a well-studied notion\nin control theory, for developing both sample- and computation-efficient\npartially observable multi-agent RL.\n","authors":["Xiangyu Liu","Kaiqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08705v3.pdf","comment":"Journal extension of the conference version at ICML 2023. Changed to\n the more general reward function form, added new results for learning in\n Dec-POMDPs, and streamlined proof outlines"},{"id":"http://arxiv.org/abs/2409.02856v1","updated":"2024-09-04T16:29:25Z","published":"2024-09-04T16:29:25Z","title":"Building a Scalable, Effective, and Steerable Search and Ranking\n Platform","summary":" Modern e-commerce platforms offer vast product selections, making it\ndifficult for customers to find items that they like and that are relevant to\ntheir current session intent. 
This is why it is key for e-commerce platforms to\nhave near real-time scalable and adaptable personalized ranking and search\nsystems. While numerous methods exist in the scientific literature for building\nsuch systems, many are unsuitable for large-scale industrial use due to\ncomplexity and performance limitations. Consequently, industrial ranking\nsystems often resort to computationally efficient yet simplistic retrieval or\ncandidate generation approaches, which overlook near real-time and\nheterogeneous customer signals, which results in a less personalized and\nrelevant experience. Moreover, related customer experiences are served by\ncompletely different systems, which increases complexity, maintenance, and\ninconsistent experiences.\n In this paper, we present a personalized, adaptable near real-time ranking\nplatform that is reusable across various use cases, such as browsing and\nsearch, and that is able to cater to millions of items and customers under\nheavy load (thousands of requests per second). We employ transformer-based\nmodels through different ranking layers which can learn complex behavior\npatterns directly from customer action sequences while being able to\nincorporate temporal (e.g. in-session) and contextual information. We validate\nour system through a series of comprehensive offline and online real-world\nexperiments at a large online e-commerce platform, and we demonstrate its\nsuperiority when compared to existing systems, both in terms of customer\nexperience as well as in net revenue. Finally, we share the lessons learned\nfrom building a comprehensive, modern ranking platform for use in a large-scale\ne-commerce environment.\n","authors":["Marjan Celikik","Jacek Wasilewski","Ana Peleteiro Ramallo","Alexey Kurennoy","Evgeny Labzin","Danilo Ascione","Tural Gurbanov","Géraud Le Falher","Andrii Dzhoha","Ian Harris"],"pdf_url":"https://arxiv.org/pdf/2409.02856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01433v2","updated":"2024-09-04T16:28:36Z","published":"2024-09-02T19:20:26Z","title":"Domain Decomposition-based coupling of Operator Inference reduced order\n models via the Schwarz alternating method","summary":" This paper presents and evaluates an approach for coupling together\nsubdomain-local reduced order models (ROMs) constructed via non-intrusive\noperator inference (OpInf) with each other and with subdomain-local full order\nmodels (FOMs), following a domain decomposition of the spatial geometry on\nwhich a given partial differential equation (PDE) is posed. Joining\nsubdomain-local models is accomplished using the overlapping Schwarz\nalternating method, a minimally-intrusive multiscale coupling technique that\nworks by transforming a monolithic problem into a sequence of subdomain-local\nproblems, which communicate through transmission boundary conditions imposed on\nthe subdomain interfaces. After formulating the overlapping Schwarz alternating\nmethod for OpInf ROMs, termed OpInf-Schwarz, we evaluate the method's accuracy\nand efficiency on several test cases involving the heat equation in two spatial\ndimensions. 
We demonstrate that the method is capable of coupling together\narbitrary combinations of OpInf ROMs and FOMs, and that speed-ups over a\nmonolithic FOM are possible when performing OpInf ROM coupling.\n","authors":["Ian Moore","Christopher Wentland","Anthony Gruber","Irina Tezaur"],"pdf_url":"https://arxiv.org/pdf/2409.01433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02850v1","updated":"2024-09-04T16:20:57Z","published":"2024-09-04T16:20:57Z","title":"Oops, I Sampled it Again: Reinterpreting Confidence Intervals in\n Few-Shot Learning","summary":" The predominant method for computing confidence intervals (CI) in few-shot\nlearning (FSL) is based on sampling the tasks with replacement, i.e.\\ allowing\nthe same samples to appear in multiple tasks. This makes the CI misleading in\nthat it takes into account the randomness of the sampler but not the data\nitself. To quantify the extent of this problem, we conduct a comparative\nanalysis between CIs computed with and without replacement. These reveal a\nnotable underestimation by the predominant method. This observation calls for a\nreevaluation of how we interpret confidence intervals and the resulting\nconclusions in FSL comparative studies. Our research demonstrates that the use\nof paired tests can partially address this issue. Additionally, we explore\nmethods to further reduce the (size of the) CI by strategically sampling tasks\nof a specific size. We also introduce a new optimized benchmark, which can be\naccessed at https://github.com/RafLaf/FSL-benchmark-again\n","authors":["Raphael Lafargue","Luke Smith","Franck Vermet","Mathias Löwe","Ian Reid","Vincent Gripon","Jack Valmadre"],"pdf_url":"https://arxiv.org/pdf/2409.02850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02842v1","updated":"2024-09-04T16:14:14Z","published":"2024-09-04T16:14:14Z","title":"SNNAX -- Spiking Neural Networks in JAX","summary":" Spiking Neural Networks (SNNs) simulators are essential tools to prototype\nbiologically inspired models and neuromorphic hardware architectures and\npredict their performance. For such a tool, ease of use and flexibility are\ncritical, but so is simulation speed especially given the complexity inherent\nto simulating SNN. Here, we present SNNAX, a JAX-based framework for simulating\nand training such models with PyTorch-like intuitiveness and JAX-like execution\nspeed. SNNAX models are easily extended and customized to fit the desired model\nspecifications and target neuromorphic hardware. Additionally, SNNAX offers key\nfeatures for optimizing the training and deployment of SNNs such as flexible\nautomatic differentiation and just-in-time compilation. We evaluate and compare\nSNNAX to other commonly used machine learning (ML) frameworks used for\nprogramming SNNs. We provide key performance metrics, best practices,\ndocumented examples for simulating SNNs in SNNAX, and implement several\nbenchmarks used in the literature.\n","authors":["Jamie Lohoff","Jan Finkbeiner","Emre Neftci"],"pdf_url":"https://arxiv.org/pdf/2409.02842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08763v4","updated":"2024-09-04T16:13:18Z","published":"2024-03-13T17:58:57Z","title":"Simple and Scalable Strategies to Continually Pre-train Large Language\n Models","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to start the process over again once new data becomes available. 
A much\nmore efficient solution is to continually pre-train these models, saving\nsignificant compute compared to re-training. However, the distribution shift\ninduced by new data typically results in degraded performance on previous data\nor poor adaptation to the new data. In this work, we show that a simple and\nscalable combination of learning rate (LR) re-warming, LR re-decaying, and\nreplay of previous data is sufficient to match the performance of fully\nre-training from scratch on all available data, as measured by the final loss\nand the average score on several language model (LM) evaluation benchmarks.\nSpecifically, we show this for a weak but realistic distribution shift between\ntwo commonly used LLM pre-training datasets (English$\\rightarrow$English) and a\nstronger distribution shift (English$\\rightarrow$German) at the $405$M\nparameter model scale with large dataset sizes (hundreds of billions of\ntokens). Selecting the weak but realistic shift for larger-scale experiments,\nwe also find that our continual learning strategies match the re-training\nbaseline for a 10B parameter LLM. Our results demonstrate that LLMs can be\nsuccessfully updated via simple and scalable continual learning strategies,\nmatching the re-training baseline using only a fraction of the compute.\nFinally, inspired by previous work, we propose alternatives to the cosine\nlearning rate schedule that help circumvent forgetting induced by LR re-warming\nand that are not bound to a fixed token budget.\n","authors":["Adam Ibrahim","Benjamin Thérien","Kshitij Gupta","Mats L. Richter","Quentin Anthony","Timothée Lesort","Eugene Belilovsky","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2403.08763v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02836v1","updated":"2024-09-04T16:02:30Z","published":"2024-09-04T16:02:30Z","title":"Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency\n Discussions by Few-Shot Learning with Large Language Models","summary":" This study performs analysis of Predictive statements, Hope speech, and\nRegret Detection behaviors within cryptocurrency-related discussions,\nleveraging advanced natural language processing techniques. We introduce a\nnovel classification scheme named \"Prediction statements,\" categorizing\ncomments into Predictive Incremental, Predictive Decremental, Predictive\nNeutral, or Non-Predictive categories. Employing GPT-4o, a cutting-edge large\nlanguage model, we explore sentiment dynamics across five prominent\ncryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis\nreveals distinct patterns in predictive sentiments, with Matic demonstrating a\nnotably higher propensity for optimistic predictions. Additionally, we\ninvestigate hope and regret sentiments, uncovering nuanced interplay between\nthese emotions and predictive behaviors. 
Despite encountering limitations\nrelated to data volume and resource availability, our study reports valuable\ndiscoveries concerning investor behavior and sentiment trends within the\ncryptocurrency market, informing strategic decision-making and future research\nendeavors.\n","authors":["Moein Shahiki Tash","Zahra Ahani","Mohim Tash","Olga Kolesnikova","Grigori Sidorov"],"pdf_url":"https://arxiv.org/pdf/2409.02836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20407v3","updated":"2024-09-04T15:57:00Z","published":"2024-05-30T18:25:19Z","title":"Convolutional L2LFlows: Generating Accurate Showers in Highly Granular\n Calorimeters Using Convolutional Normalizing Flows","summary":" In the quest to build generative surrogate models as computationally\nefficient alternatives to rule-based simulations, the quality of the generated\nsamples remains a crucial frontier. So far, normalizing flows have been among\nthe models with the best fidelity. However, as the latent space in such models\nis required to have the same dimensionality as the data space, scaling up\nnormalizing flows to high dimensional datasets is not straightforward. The\nprior L2LFlows approach successfully used a series of separate normalizing\nflows and sequence of conditioning steps to circumvent this problem. In this\nwork, we extend L2LFlows to simulate showers with a 9-times larger profile in\nthe lateral direction. To achieve this, we introduce convolutional layers and\nU-Net-type connections, move from masked autoregressive flows to coupling\nlayers, and demonstrate the successful modelling of showers in the ILD\nElectromagnetic Calorimeter as well as Dataset 3 from the public CaloChallenge\ndataset.\n","authors":["Thorsten Buss","Frank Gaede","Gregor Kasieczka","Claudius Krause","David Shih"],"pdf_url":"https://arxiv.org/pdf/2405.20407v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00717v2","updated":"2024-09-04T15:50:40Z","published":"2024-09-01T13:14:41Z","title":"Multi-Agent Reinforcement Learning from Human Feedback: Data Coverage\n and Algorithmic Techniques","summary":" We initiate the study of Multi-Agent Reinforcement Learning from Human\nFeedback (MARLHF), exploring both theoretical foundations and empirical\nvalidations. We define the task as identifying Nash equilibrium from a\npreference-only offline dataset in general-sum games, a problem marked by the\nchallenge of sparse feedback signals. Our theory establishes the upper\ncomplexity bounds for Nash Equilibrium in effective MARLHF, demonstrating that\nsingle-policy coverage is inadequate and highlighting the importance of\nunilateral dataset coverage. These theoretical insights are verified through\ncomprehensive experiments. To enhance the practical performance, we further\nintroduce two algorithmic techniques. (1) We propose a Mean Squared Error (MSE)\nregularization along the time axis to achieve a more uniform reward\ndistribution and improve reward learning outcomes. (2) We utilize imitation\nlearning to approximate the reference policy, ensuring stability and\neffectiveness in training. Our findings underscore the multifaceted approach\nrequired for MARLHF, paving the way for effective preference-based multi-agent\nsystems.\n","authors":["Natalia Zhang","Xinqi Wang","Qiwen Cui","Runlong Zhou","Sham M. Kakade","Simon S. 
Du"],"pdf_url":"https://arxiv.org/pdf/2409.00717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04346v2","updated":"2024-09-04T15:48:40Z","published":"2024-05-07T14:23:22Z","title":"Revisiting Character-level Adversarial Attacks for Language Models","summary":" Adversarial attacks in Natural Language Processing apply perturbations in the\ncharacter or token levels. Token-level attacks, gaining prominence for their\nuse of gradient-based methods, are susceptible to altering sentence semantics,\nleading to invalid adversarial examples. While character-level attacks easily\nmaintain semantics, they have received less attention as they cannot easily\nadopt popular gradient-based methods, and are thought to be easy to defend.\nChallenging these beliefs, we introduce Charmer, an efficient query-based\nadversarial attack capable of achieving high attack success rate (ASR) while\ngenerating highly similar adversarial examples. Our method successfully targets\nboth small (BERT) and large (Llama 2) models. Specifically, on BERT with SST-2,\nCharmer improves the ASR in 4.84% points and the USE similarity in 8% points\nwith respect to the previous art. Our implementation is available in\nhttps://github.com/LIONS-EPFL/Charmer.\n","authors":["Elias Abad Rocamora","Yongtao Wu","Fanghui Liu","Grigorios G. Chrysos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2405.04346v2.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2409.02817v1","updated":"2024-09-04T15:35:18Z","published":"2024-09-04T15:35:18Z","title":"Obsidian: Cooperative State-Space Exploration for Performant Inference\n on Secure ML Accelerators","summary":" Trusted execution environments (TEEs) for machine learning accelerators are\nindispensable in secure and efficient ML inference. Optimizing workloads\nthrough state-space exploration for the accelerator architectures improves\nperformance and energy consumption. However, such explorations are expensive\nand slow due to the large search space. Current research has to use fast\nanalytical models that forego critical hardware details and cross-layer\nopportunities unique to the hardware security primitives. While cycle-accurate\nmodels can theoretically reach better designs, their high runtime cost\nrestricts them to a smaller state space.\n We present Obsidian, an optimization framework for finding the optimal\nmapping from ML kernels to a secure ML accelerator. Obsidian addresses the\nabove challenge by exploring the state space using analytical and\ncycle-accurate models cooperatively. The two main exploration components\ninclude: (1) A secure accelerator analytical model, that includes the effect of\nsecure hardware while traversing the large mapping state space and produce the\nbest m model mappings; (2) A compiler profiling step on a cycle-accurate model,\nthat captures runtime bottlenecks to further improve execution runtime, energy\nand resource utilization and find the optimal model mapping.\n We compare our results to a baseline secure accelerator, comprising of the\nstate-of-the-art security schemes obtained from guardnn [ 33 ] and sesame [11].\nThe analytical model reduces the inference latency by 20.5% for a cloud and\n8.4% for an edge deployment with an energy improvement of 24% and 19%\nrespectively. 
The cycle-accurate model, further reduces the latency by 9.1% for\na cloud and 12.2% for an edge with an energy improvement of 13.8% and 13.1%.\n","authors":["Sarbartha Banerjee","Shijia Wei","Prakash Ramrakhyani","Mohit Tiwari"],"pdf_url":"https://arxiv.org/pdf/2409.02817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02802v1","updated":"2024-09-04T15:22:08Z","published":"2024-09-04T15:22:08Z","title":"Boosting Certificate Robustness for Time Series Classification with\n Efficient Self-Ensemble","summary":" Recently, the issue of adversarial robustness in the time series domain has\ngarnered significant attention. However, the available defense mechanisms\nremain limited, with adversarial training being the predominant approach,\nthough it does not provide theoretical guarantees. Randomized Smoothing has\nemerged as a standout method due to its ability to certify a provable lower\nbound on robustness radius under $\\ell_p$-ball attacks. Recognizing its\nsuccess, research in the time series domain has started focusing on these\naspects. However, existing research predominantly focuses on time series\nforecasting, or under the non-$\\ell_p$ robustness in statistic feature\naugmentation for time series classification~(TSC). Our review found that\nRandomized Smoothing performs modestly in TSC, struggling to provide effective\nassurances on datasets with poor robustness. Therefore, we propose a\nself-ensemble method to enhance the lower bound of the probability confidence\nof predicted labels by reducing the variance of classification margins, thereby\ncertifying a larger radius. This approach also addresses the computational\noverhead issue of Deep Ensemble~(DE) while remaining competitive and, in some\ncases, outperforming it in terms of robustness. Both theoretical analysis and\nexperimental results validate the effectiveness of our method, demonstrating\nsuperior performance in robustness testing compared to baseline approaches.\n","authors":["Chang Dong","Zhengyang Li","Liangwei Zheng","Weitong Chen","Wei Emma Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02802v1.pdf","comment":"6 figures, 4 tables, 10 pages"},{"id":"http://arxiv.org/abs/2405.01704v2","updated":"2024-09-04T15:16:46Z","published":"2024-05-02T20:03:13Z","title":"Privacy-aware Berrut Approximated Coded Computing for Federated Learning","summary":" Federated Learning (FL) is an interesting strategy that enables the\ncollaborative training of an AI model among different data owners without\nrevealing their private datasets. Even so, FL has some privacy vulnerabilities\nthat have been tried to be overcome by applying some techniques like\nDifferential Privacy (DP), Homomorphic Encryption (HE), or Secure Multi-Party\nComputation (SMPC). However, these techniques have some important drawbacks\nthat might narrow their range of application: problems to work with non-linear\nfunctions and to operate large matrix multiplications and high communication\nand computational costs to manage semi-honest nodes. In this context, we\npropose a solution to guarantee privacy in FL schemes that simultaneously\nsolves the previously mentioned problems. Our proposal is based on the Berrut\nApproximated Coded Computing, a technique from the Coded Distributed Computing\nparadigm, adapted to a Secret Sharing configuration, to provide input privacy\nto FL in a scalable way. 
It can be applied for computing non-linear functions\nand treats the special case of distributed matrix multiplication, a key\nprimitive at the core of many automated learning tasks. Because of these\ncharacteristics, it could be applied in a wide range of FL scenarios, since it\nis independent of the machine learning models or aggregation algorithms used in\nthe FL scheme. We provide analysis of the achieved privacy and complexity of\nour solution and, due to the extensive numerical results performed, a good\ntrade-off between privacy and precision can be observed.\n","authors":["Xavier Martínez Luaña","Rebeca P. Díaz Redondo","Manuel Fernández Veiga"],"pdf_url":"https://arxiv.org/pdf/2405.01704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15950v2","updated":"2024-09-04T15:08:49Z","published":"2024-05-24T21:34:16Z","title":"A Systematic Bias of Machine Learning Regression Models and Its\n Correction: an Application to Imaging-based Brain Age Prediction","summary":" Machine learning models for continuous outcomes often yield systematically\nbiased predictions, particularly for values that largely deviate from the mean.\nSpecifically, predictions for large-valued outcomes tend to be negatively\nbiased (underestimating actual values), while those for small-valued outcomes\nare positively biased (overestimating actual values). We refer to this linear\ncentral tendency warped bias as the \"systematic bias of machine learning\nregression\". In this paper, we first demonstrate that this systematic\nprediction bias persists across various machine learning regression models, and\nthen delve into its theoretical underpinnings. To address this issue, we\npropose a general constrained optimization approach designed to correct this\nbias and develop computationally efficient implementation algorithms.\nSimulation results indicate that our correction method effectively eliminates\nthe bias from the predicted outcomes. We apply the proposed approach to the\nprediction of brain age using neuroimaging data. In comparison to competing\nmachine learning regression models, our method effectively addresses the\nlongstanding issue of \"systematic bias of machine learning regression\" in\nneuroimaging-based brain age calculation, yielding unbiased predictions of\nbrain age.\n","authors":["Hwiyoung Lee","Shuo Chen"],"pdf_url":"https://arxiv.org/pdf/2405.15950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02792v1","updated":"2024-09-04T15:06:44Z","published":"2024-09-04T15:06:44Z","title":"UnLearning from Experience to Avoid Spurious Correlations","summary":" While deep neural networks can achieve state-of-the-art performance in many\ntasks, these models are more fragile than they appear. They are prone to\nlearning spurious correlations in their training data, leading to surprising\nfailure cases. In this paper, we propose a new approach that addresses the\nissue of spurious correlations: UnLearning from Experience (ULE). Our method is\nbased on using two classification models trained in parallel: student and\nteacher models. Both models receive the same batches of training data. The\nstudent model is trained with no constraints and pursues the spurious\ncorrelations in the data. The teacher model is trained to solve the same\nclassification problem while avoiding the mistakes of the student model. As\ntraining is done in parallel, the better the student model learns the spurious\ncorrelations, the more robust the teacher model becomes. 
The teacher model uses\nthe gradient of the student's output with respect to its input to unlearn\nmistakes made by the student. We show that our method is effective on the\nWaterbirds, CelebA, Spawrious and UrbanCars datasets.\n","authors":["Jeff Mitchell","Jesús Martínez del Rincón","Niall McLaughlin"],"pdf_url":"https://arxiv.org/pdf/2409.02792v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2409.02778v1","updated":"2024-09-04T14:56:28Z","published":"2024-09-04T14:56:28Z","title":"Regularized Multi-output Gaussian Convolution Process with Domain\n Adaptation","summary":" Multi-output Gaussian process (MGP) has been attracting increasing attention\nas a transfer learning method to model multiple outputs. Despite its high\nflexibility and generality, MGP still faces two critical challenges when\napplied to transfer learning. The first one is negative transfer, which occurs\nwhen there exists no shared information among the outputs. The second challenge\nis the input domain inconsistency, which is commonly studied in transfer\nlearning yet not explored in MGP. In this paper, we propose a regularized MGP\nmodeling framework with domain adaptation to overcome these challenges. More\nspecifically, a sparse covariance matrix of MGP is proposed by using\nconvolution process, where penalization terms are added to adaptively select\nthe most informative outputs for knowledge transfer. To deal with the domain\ninconsistency, a domain adaptation method is proposed by marginalizing\ninconsistent features and expanding missing features to align the input domains\namong different outputs. Statistical properties of the proposed method are\nprovided to guarantee the performance practically and asymptotically. The\nproposed framework outperforms state-of-the-art benchmarks in comprehensive\nsimulation studies and one real case study of a ceramic manufacturing process.\nThe results demonstrate the effectiveness of our method in dealing with both\nthe negative transfer and the domain inconsistency.\n","authors":["Wang Xinming","Wang Chao","Song Xuan","Kirby Levi","Wu Jianguo"],"pdf_url":"https://arxiv.org/pdf/2409.02778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08381v2","updated":"2024-09-04T14:52:59Z","published":"2024-08-15T18:54:31Z","title":"Pre-processing and Compression: Understanding Hidden Representation\n Refinement Across Imaging Domains via Intrinsic Dimension","summary":" In recent years, there has been interest in how geometric properties such as\nintrinsic dimension (ID) of a neural network's hidden representations change\nthrough its layers, and how such properties are predictive of important model\nbehavior such as generalization ability. However, evidence has begun to emerge\nthat such behavior can change significantly depending on the domain of the\nnetwork's training data, such as natural versus medical images. Here, we\nfurther this inquiry by exploring how the ID of a network's learned\nrepresentations changes through its layers, in essence, characterizing how the\nnetwork successively refines the information content of input data to be used\nfor predictions. Analyzing eleven natural and medical image datasets across six\nnetwork architectures, we find that how ID changes through the network differs\nnoticeably between natural and medical image models. 
Specifically, medical\nimage models peak in representation ID earlier in the network, implying a\ndifference in the image features and their abstractness that are typically used\nfor downstream tasks in these domains. Additionally, we discover a strong\ncorrelation of this peak representation ID with the ID of the data in its input\nspace, implying that the intrinsic information content of a model's learned\nrepresentations is guided by that of the data it was trained on. Overall, our\nfindings emphasize notable discrepancies in network behavior between natural\nand non-natural imaging domains regarding hidden representation information\ncontent, and provide further insights into how a network's learned features are\nshaped by its training data.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.08381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02772v1","updated":"2024-09-04T14:51:36Z","published":"2024-09-04T14:51:36Z","title":"Unifying Causal Representation Learning with the Invariance Principle","summary":" Causal representation learning aims at recovering latent causal variables\nfrom high-dimensional observations to solve causal downstream tasks, such as\npredicting the effect of new interventions or more robust classification. A\nplethora of methods have been developed, each tackling carefully crafted\nproblem settings that lead to different types of identifiability. The folklore\nis that these different settings are important, as they are often linked to\ndifferent rungs of Pearl's causal hierarchy, although not all neatly fit. Our\nmain contribution is to show that many existing causal representation learning\napproaches methodologically align the representation to known data symmetries.\nIdentification of the variables is guided by equivalence classes across\ndifferent data pockets that are not necessarily causal. This result suggests\nimportant implications, allowing us to unify many existing approaches in a\nsingle method that can mix and match different assumptions, including\nnon-causal ones, based on the invariances relevant to our application. It also\nsignificantly benefits applicability, which we demonstrate by improving\ntreatment effect estimation on real-world high-dimensional ecological data.\nOverall, this paper clarifies the role of causality assumptions in the\ndiscovery of causal variables and shifts the focus to preserving data\nsymmetries.\n","authors":["Dingling Yao","Dario Rancati","Riccardo Cadei","Marco Fumero","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2409.02772v1.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2407.13703v3","updated":"2024-09-04T14:41:26Z","published":"2024-06-26T08:59:49Z","title":"Energy-Efficient Channel Decoding for Wireless Federated Learning:\n Convergence Analysis and Adaptive Design","summary":" One of the most critical challenges for deploying distributed learning\nsolutions, such as federated learning (FL), in wireless networks is the limited\nbattery capacity of mobile clients. While it is a common belief that the major\nenergy consumption of mobile clients comes from the uplink data transmission,\nthis paper presents a novel finding, namely channel decoding also contributes\nsignificantly to the overall energy consumption of mobile clients in FL.\nMotivated by this new observation, we propose an energy-efficient adaptive\nchannel decoding scheme that leverages the intrinsic robustness of FL to model\nerrors. 
In particular, the robustness is exploited to reduce the energy\nconsumption of channel decoders at mobile clients by adaptively adjusting the\nnumber of decoding iterations. We theoretically prove that wireless FL with\ncommunication errors can converge at the same rate as the case with error-free\ncommunication provided the bit error rate (BER) is properly constrained. An\nadaptive channel decoding scheme is then proposed to improve the energy\nefficiency of wireless FL systems. Experimental results demonstrate that the\nproposed method maintains the same learning accuracy while reducing the channel\ndecoding energy consumption by ~20% when compared to an existing approach.\n","authors":["Linping Qu","Yuyi Mao","Shenghui Song","Chi-Ying Tsui"],"pdf_url":"https://arxiv.org/pdf/2407.13703v3.pdf","comment":"This paper has been accepted by the IEEE TWC. Copyright may be\n transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2409.00105v2","updated":"2024-09-04T14:40:14Z","published":"2024-08-27T14:40:16Z","title":"Negation Blindness in Large Language Models: Unveiling the NO Syndrome\n in Image Generation","summary":" Foundational Large Language Models (LLMs) have changed the way we perceive\ntechnology. They have been shown to excel in tasks ranging from poem writing\nand coding to essay generation and puzzle solving. With the incorporation of\nimage generation capability, they have become more comprehensive and versatile\nAI tools. At the same time, researchers are striving to identify the\nlimitations of these tools to improve them further. Currently identified flaws\ninclude hallucination, biases, and bypassing restricted commands to generate\nharmful content. In the present work, we have identified a fundamental\nlimitation related to the image generation ability of LLMs, and termed it The\nNO Syndrome. This negation blindness refers to LLMs inability to correctly\ncomprehend NO related natural language prompts to generate the desired images.\nInterestingly, all tested LLMs including GPT-4, Gemini, and Copilot were found\nto be suffering from this syndrome. To demonstrate the generalization of this\nlimitation, we carried out simulation experiments and conducted entropy-based\nand benchmark statistical analysis tests on various LLMs in multiple languages,\nincluding English, Hindi, and French. We conclude that the NO syndrome is a\nsignificant flaw in current LLMs that needs to be addressed. A related finding\nof this study showed a consistent discrepancy between image and textual\nresponses as a result of this NO syndrome. We posit that the introduction of a\nnegation context-aware reinforcement learning based feedback loop between the\nLLMs textual response and generated image could help ensure the generated text\nis based on both the LLMs correct contextual understanding of the negation\nquery and the generated visual output.\n","authors":["Mohammad Nadeem","Shahab Saquib Sohail","Erik Cambria","Björn W. Schuller","Amir Hussain"],"pdf_url":"https://arxiv.org/pdf/2409.00105v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.02747v1","updated":"2024-09-04T14:26:58Z","published":"2024-09-04T14:26:58Z","title":"Tractable Offline Learning of Regular Decision Processes","summary":" This work studies offline Reinforcement Learning (RL) in a class of\nnon-Markovian environments called Regular Decision Processes (RDPs). 
In RDPs,\nthe unknown dependency of future observations and rewards from the past\ninteractions can be captured by some hidden finite-state automaton. For this\nreason, many RDP algorithms first reconstruct this unknown dependency using\nautomata learning techniques. In this paper, we show that it is possible to\novercome two strong limitations of previous offline RL algorithms for RDPs,\nnotably RegORL. This can be accomplished via the introduction of two original\ntechniques: the development of a new pseudometric based on formal languages,\nwhich removes a problematic dependency on\n$L_\\infty^\\mathsf{p}$-distinguishability parameters, and the adoption of\nCount-Min-Sketch (CMS), instead of naive counting. The former reduces the\nnumber of samples required in environments that are characterized by a low\ncomplexity in language-theoretic terms. The latter alleviates the memory\nrequirements for long planning horizons. We derive the PAC sample complexity\nbounds associated to each of these techniques, and we validate the approach\nexperimentally.\n","authors":["Ahana Deb","Roberto Cipollone","Anders Jonsson","Alessandro Ronca","Mohammad Sadegh Talebi"],"pdf_url":"https://arxiv.org/pdf/2409.02747v1.pdf","comment":"To appear in EWRL 2024"},{"id":"http://arxiv.org/abs/2408.16945v3","updated":"2024-09-04T14:25:47Z","published":"2024-08-29T23:51:51Z","title":"Different Victims, Same Layout: Email Visual Similarity Detection for\n Enhanced Email Protection","summary":" In the pursuit of an effective spam detection system, the focus has often\nbeen on identifying known spam patterns either through rule-based detection\nsystems or machine learning (ML) solutions that rely on keywords. However, both\nsystems are susceptible to evasion techniques and zero-day attacks that can be\nachieved at low cost. Therefore, an email that bypassed the defense system once\ncan do it again in the following days, even though rules are updated or the ML\nmodels are retrained. The recurrence of failures to detect emails that exhibit\nlayout similarities to previously undetected spam is concerning for customers\nand can erode their trust in a company. Our observations show that threat\nactors reuse email kits extensively and can bypass detection with little\neffort, for example, by making changes to the content of emails. In this work,\nwe propose an email visual similarity detection approach, named Pisco, to\nimprove the detection capabilities of an email threat defense system. We apply\nour proof of concept to some real-world samples received from different\nsources. Our results show that email kits are being reused extensively and\nvisually similar emails are sent to our customers at various time intervals.\nTherefore, this method could be very helpful in situations where detection\nengines that rely on textual features and keywords are bypassed, an occurrence\nour observations show happens frequently.\n","authors":["Sachin Shukla","Omid Mirzaei"],"pdf_url":"https://arxiv.org/pdf/2408.16945v3.pdf","comment":"To be published in the proceedings of the ACM Conference on Computer\n and Communications Security (ACM CCS 2024)"},{"id":"http://arxiv.org/abs/2409.02740v1","updated":"2024-09-04T14:21:00Z","published":"2024-09-04T14:21:00Z","title":"Convolutional Neural Networks for Automated Cellular Automaton\n Classification","summary":" The emergent dynamics in spacetime diagrams of cellular automata (CAs) is\noften organised by means of a number of behavioural classes. 
Whilst\nclassification of elementary CAs is feasible and well-studied, non-elementary\nCAs are generally too diverse and numerous to exhaustively classify manually.\nIn this chapter we treat the spacetime diagram as a digital image, and\nimplement simple computer vision techniques to perform an automated\nclassification of elementary cellular automata into the five Li-Packard\nclasses. In particular, we present a supervised learning task to a\nconvolutional neural network, in such a way that it may be generalised to\nnon-elementary CAs. If we want to do so, we must divert the algorithm's focus\naway from the underlying 'microscopic' local updates. We first show that\npreviously developed deep learning approaches have in fact been trained to\nidentify the local update rule, rather than directly focus on the mesoscopic\npatterns that are associated with the particular behavioural classes. By means\nof a well-argued neural network design, as well as a number of data\naugmentation techniques, we then present a convolutional neural network that\nperforms nearly perfectly at identifying the behavioural class, without\nnecessarily first identifying the underlying microscopic dynamics.\n","authors":["Michiel Rollier","Aisling J. Daly","Jan M. Baetens"],"pdf_url":"https://arxiv.org/pdf/2409.02740v1.pdf","comment":"19 pages, 12 figures, book chapter"},{"id":"http://arxiv.org/abs/2402.02438v2","updated":"2024-09-04T14:14:17Z","published":"2024-02-04T10:27:42Z","title":"Fast and interpretable Support Vector Classification based on the\n truncated ANOVA decomposition","summary":" Support Vector Machines (SVMs) are an important tool for performing\nclassification on scattered data, where one usually has to deal with many data\npoints in high-dimensional spaces. We propose solving SVMs in primal form using\nfeature maps based on trigonometric functions or wavelets. In small dimensional\nsettings the Fast Fourier Transform (FFT) and related methods are a powerful\ntool in order to deal with the considered basis functions. For growing\ndimensions the classical FFT-based methods become inefficient due to the curse\nof dimensionality. Therefore, we restrict ourselves to multivariate basis\nfunctions, each of which only depends on a small number of dimensions. This is\nmotivated by the well-known sparsity of effects and recent results regarding\nthe reconstruction of functions from scattered data in terms of truncated\nanalysis of variance (ANOVA) decompositions, which makes the resulting model\neven interpretable in terms of importance of the features as well as their\ncouplings. The usage of small superposition dimensions has the consequence that\nthe computational effort no longer grows exponentially but only polynomially\nwith respect to the dimension. In order to enforce sparsity regarding the basis\ncoefficients, we use the frequently applied $\\ell_2$-norm and, in addition,\n$\\ell_1$-norm regularization. The found classifying function, which is the\nlinear combination of basis functions, and its variance can then be analyzed in\nterms of the classical ANOVA decomposition of functions. Based on numerical\nexamples we show that we are able to recover the signum of a function that\nperfectly fits our model assumptions. Furthermore, we perform classification on\ndifferent artificial and real-world data sets. 
We obtain better results with\n$\\ell_1$-norm regularization, both in terms of accuracy and clarity of\ninterpretability.\n","authors":["Kseniya Akhalaya","Franziska Nestler","Daniel Potts"],"pdf_url":"https://arxiv.org/pdf/2402.02438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12965v2","updated":"2024-09-04T14:10:43Z","published":"2024-05-21T17:45:36Z","title":"The future of cosmological likelihood-based inference: accelerated\n high-dimensional parameter estimation and model comparison","summary":" We advocate for a new paradigm of cosmological likelihood-based inference,\nleveraging recent developments in machine learning and its underlying\ntechnology, to accelerate Bayesian inference in high-dimensional settings.\nSpecifically, we combine (i) emulation, where a machine learning model is\ntrained to mimic cosmological observables, e.g. CosmoPower-JAX; (ii)\ndifferentiable and probabilistic programming, e.g. JAX and NumPyro,\nrespectively; (iii) scalable Markov chain Monte Carlo (MCMC) sampling\ntechniques that exploit gradients, e.g. Hamiltonian Monte Carlo; and (iv)\ndecoupled and scalable Bayesian model selection techniques that compute the\nBayesian evidence purely from posterior samples, e.g. the learned harmonic mean\nimplemented in harmonic. This paradigm allows us to carry out a complete\nBayesian analysis, including both parameter estimation and model selection, in\na fraction of the time of traditional approaches. First, we demonstrate the\napplication of this paradigm on a simulated cosmic shear analysis for a Stage\nIV survey in 37- and 39-dimensional parameter spaces, comparing $\\Lambda$CDM\nand a dynamical dark energy model ($w_0w_a$CDM). We recover posterior contours\nand evidence estimates that are in excellent agreement with those computed by\nthe traditional nested sampling approach while reducing the computational cost\nfrom 8 months on 48 CPU cores to 2 days on 12 GPUs. Second, we consider a joint\nanalysis between three simulated next-generation surveys, each performing a\n3x2pt analysis, resulting in 157- and 159-dimensional parameter spaces.\nStandard nested sampling techniques are simply unlikely to be feasible in this\nhigh-dimensional setting, requiring a projected 12 years of compute time on 48\nCPU cores; on the other hand, the proposed approach only requires 8 days of\ncompute time on 24 GPUs. All packages used in our analyses are publicly\navailable.\n","authors":["Davide Piras","Alicja Polanska","Alessio Spurio Mancini","Matthew A. Price","Jason D. McEwen"],"pdf_url":"https://arxiv.org/pdf/2405.12965v2.pdf","comment":"14 pages, 6 figures. Accepted for publication in the Open Journal of\n Astrophysics. Codes available at\n https://github.com/alessiospuriomancini/cosmopower,\n https://github.com/dpiras/cosmopower-jax,\n https://github.com/astro-informatics/harmonic/"},{"id":"http://arxiv.org/abs/2409.02730v1","updated":"2024-09-04T14:03:08Z","published":"2024-09-04T14:03:08Z","title":"Complete and Efficient Covariants for 3D Point Configurations with\n Application to Learning Molecular Quantum Properties","summary":" When modeling physical properties of molecules with machine learning, it is\ndesirable to incorporate $SO(3)$-covariance. While such models based on low\nbody order features are not complete, we formulate and prove general\ncompleteness properties for higher order methods, and show that $6k-5$ of these\nfeatures are enough for up to $k$ atoms. 
We also find that the Clebsch--Gordan\noperations commonly used in these methods can be replaced by matrix\nmultiplications without sacrificing completeness, lowering the scaling from\n$O(l^6)$ to $O(l^3)$ in the degree of the features. We apply this to quantum\nchemistry, but the proposed methods are generally applicable for problems\ninvolving 3D point configurations.\n","authors":["Hartmut Maennel","Oliver T. Unke","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2409.02730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02728v1","updated":"2024-09-04T14:01:56Z","published":"2024-09-04T14:01:56Z","title":"Task-Oriented Communication for Graph Data: A Graph Information\n Bottleneck Approach","summary":" Graph data, essential in fields like knowledge representation and social\nnetworks, often involves large networks with many nodes and edges. Transmitting\nthese graphs can be highly inefficient due to their size and redundancy for\nspecific tasks. This paper introduces a method to extract a smaller,\ntask-focused subgraph that maintains key information while reducing\ncommunication overhead. Our approach utilizes graph neural networks (GNNs) and\nthe graph information bottleneck (GIB) principle to create a compact,\ninformative, and robust graph representation suitable for transmission. The\nchallenge lies in the irregular structure of graph data, making GIB\noptimization complex. We address this by deriving a tractable variational upper\nbound for the objective function. Additionally, we propose the VQ-GIB\nmechanism, integrating vector quantization (VQ) to convert subgraph\nrepresentations into a discrete codebook sequence, compatible with existing\ndigital communication systems. Our experiments show that this GIB-based method\nsignificantly lowers communication costs while preserving essential\ntask-related information. The approach demonstrates robust performance across\nvarious communication channels, suitable for both continuous and discrete\nsystems.\n","authors":["Shujing Li","Yanhu Wang","Shuaishuai Guo","Chenyuan Feng"],"pdf_url":"https://arxiv.org/pdf/2409.02728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02712v1","updated":"2024-09-04T13:49:45Z","published":"2024-09-04T13:49:45Z","title":"A Data Selection Approach for Enhancing Low Resource Machine Translation\n Using Cross-Lingual Sentence Representations","summary":" Machine translation in low-resource language pairs faces significant\nchallenges due to the scarcity of parallel corpora and linguistic resources.\nThis study focuses on the case of English-Marathi language pairs, where\nexisting datasets are notably noisy, impeding the performance of machine\ntranslation models. To mitigate the impact of data quality issues, we propose a\ndata filtering approach based on cross-lingual sentence representations. Our\nmethodology leverages a multilingual SBERT model to filter out problematic\ntranslations in the training data. Specifically, we employ an IndicSBERT\nsimilarity model to assess the semantic equivalence between original and\ntranslated sentences, allowing us to retain linguistically correct translations\nwhile discarding instances with substantial deviations. The results demonstrate\na significant improvement in translation quality over the baseline\npost-filtering with IndicSBERT. This illustrates how cross-lingual sentence\nrepresentations can reduce errors in machine translation scenarios with limited\nresources. 
By integrating multilingual sentence BERT models into the\ntranslation pipeline, this research contributes to advancing machine\ntranslation techniques in low-resource environments. The proposed method not\nonly addresses the challenges in English-Marathi language pairs but also\nprovides a valuable framework for enhancing translation quality in other\nlow-resource language translation tasks.\n","authors":["Nidhi Kowtal","Tejas Deshpande","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.02712v1.pdf","comment":"Accepted at I2CT 2024"},{"id":"http://arxiv.org/abs/2409.02708v1","updated":"2024-09-04T13:44:22Z","published":"2024-09-04T13:44:22Z","title":"Few-shot Multi-Task Learning of Linear Invariant Features with Meta\n Subspace Pursuit","summary":" Data scarcity poses a serious threat to modern machine learning and\nartificial intelligence, as their practical success typically relies on the\navailability of big datasets. One effective strategy to mitigate the issue of\ninsufficient data is to first harness information from other data sources\npossessing certain similarities in the study design stage, and then employ the\nmulti-task or meta learning framework in the analysis stage. In this paper, we\nfocus on multi-task (or multi-source) linear models whose coefficients across\ntasks share an invariant low-rank component, a popular structural assumption\nconsidered in the recent multi-task or meta learning literature. Under this\nassumption, we propose a new algorithm, called Meta Subspace Pursuit\n(abbreviated as Meta-SP), that provably learns this invariant subspace shared\nby different tasks. Under this stylized setup for multi-task or meta learning,\nwe establish both the algorithmic and statistical guarantees of the proposed\nmethod. Extensive numerical experiments are conducted, comparing Meta-SP\nagainst several competing methods, including popular, off-the-shelf\nmodel-agnostic meta learning algorithms such as ANIL. These experiments\ndemonstrate that Meta-SP achieves superior performance over the competing\nmethods in various aspects.\n","authors":["Chaozhi Zhang","Lin Liu","Xiaoqun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02697v1","updated":"2024-09-04T13:33:38Z","published":"2024-09-04T13:33:38Z","title":"Decision Transformer for Enhancing Neural Local Search on the Job Shop\n Scheduling Problem","summary":" The job shop scheduling problem (JSSP) and its solution algorithms have been\nof enduring interest in both academia and industry for decades. In recent\nyears, machine learning (ML) is playing an increasingly important role in\nadvancing existing and building new heuristic solutions for the JSSP, aiming to\nfind better solutions in shorter computation times. In this paper we build on\ntop of a state-of-the-art deep reinforcement learning (DRL) agent, called\nNeural Local Search (NLS), which can efficiently and effectively control a\nlarge local neighborhood search on the JSSP. In particular, we develop a method\nfor training the decision transformer (DT) algorithm on search trajectories\ntaken by a trained NLS agent to further improve upon the learned\ndecision-making sequences. Our experiments show that the DT successfully learns\nlocal search strategies that are different and, in many cases, more effective\nthan those of the NLS agent itself. 
In terms of the tradeoff between solution\nquality and acceptable computational time needed for the search, the DT is\nparticularly superior in application scenarios where longer computational times\nare acceptable. In this case, it makes up for the longer inference times\nrequired per search step, which are caused by the larger neural network\narchitecture, through better quality decisions per step. Thereby, the DT\nachieves state-of-the-art results for solving the JSSP with ML-enhanced search.\n","authors":["Constantin Waubert de Puiseau","Fabian Wolz","Merlin Montag","Jannik Peters","Hasan Tercan","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.02697v1.pdf","comment":"currently under review for IEEE Transactions on Cybernetics"},{"id":"http://arxiv.org/abs/2402.10502v2","updated":"2024-09-04T13:31:31Z","published":"2024-02-16T08:21:43Z","title":"A possible late-time transition of $M_B$ inferred via neural networks","summary":" The strengthening of tensions in the cosmological parameters has led to a\nreconsideration of fundamental aspects of standard cosmology. The tension in\nthe Hubble constant can also be viewed as a tension between local and early\nUniverse constraints on the absolute magnitude $M_B$ of Type Ia supernova. In\nthis work, we reconsider the possibility of a variation of this parameter in a\nmodel-independent way. We employ neural networks to agnostically constrain the\nvalue of the absolute magnitude as well as assess the impact and statistical\nsignificance of a variation in $M_B$ with redshift from the Pantheon+\ncompilation, together with a thorough analysis of the neural network\narchitecture. We find an indication for a possible transition redshift at the\n$z\\approx 1$ region.\n","authors":["Purba Mukherjee","Konstantinos F. Dialektopoulos","Jackson Levi Said","Jurgen Mifsud"],"pdf_url":"https://arxiv.org/pdf/2402.10502v2.pdf","comment":"13 pages, 9 sets of figures, 2 tables. To appear in JCAP"},{"id":"http://arxiv.org/abs/2408.16122v2","updated":"2024-09-04T13:28:34Z","published":"2024-08-28T20:22:09Z","title":"Variational Mode Decomposition and Linear Embeddings are What You Need\n For Time-Series Forecasting","summary":" Time-series forecasting often faces challenges due to data volatility, which\ncan lead to inaccurate predictions. Variational Mode Decomposition (VMD) has\nemerged as a promising technique to mitigate volatility by decomposing data\ninto distinct modes, thereby enhancing forecast accuracy. In this study, we\nintegrate VMD with linear models to develop a robust forecasting framework. Our\napproach is evaluated on 13 diverse datasets, including ETTm2, WindTurbine, M4,\nand 10 air quality datasets from various Southeast Asian cities. The\neffectiveness of the VMD strategy is assessed by comparing Root Mean Squared\nError (RMSE) values from models utilizing VMD against those without it.\nAdditionally, we benchmark linear-based models against well-known neural\nnetwork architectures such as LSTM, Bidirectional LSTM, and RNN. The results\ndemonstrate a significant reduction in RMSE across nearly all models following\nVMD application. Notably, the Linear + VMD model achieved the lowest average\nRMSE in univariate forecasting at 0.619. In multivariate forecasting, the\nDLinear + VMD model consistently outperformed others, attaining the lowest RMSE\nacross all datasets with an average of 0.019. 
These findings underscore the\neffectiveness of combining VMD with linear models for superior time-series\nforecasting.\n","authors":["Hafizh Raihan Kurnia Putra","Novanto Yudistira","Tirana Noor Fatyanosa"],"pdf_url":"https://arxiv.org/pdf/2408.16122v2.pdf","comment":"For associated repository, see\n https://github.com/Espalemit/VMD-With-LTSF-Linear.git"},{"id":"http://arxiv.org/abs/2409.02686v1","updated":"2024-09-04T13:17:09Z","published":"2024-09-04T13:17:09Z","title":"Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for\n Problem-Solving Improvement of LLMs","summary":" Large Language Models (LLMs) have demonstrated remarkable efficiency in\ntackling various tasks based on human instructions, but recent studies reveal\nthat these models often fail to achieve satisfactory results on questions\ninvolving reasoning, such as mathematics or physics questions. This phenomenon\nis usually attributed to the uncertainty regarding whether these models could\ngenuinely comprehend the knowledge embedded in the text or merely learn to\nreplicate the token distribution without a true understanding of the content.\nIn this paper, we delve into this problem and aim to enhance the reasoning\ncapabilities of LLMs. First, we investigate if the model has genuine reasoning\ncapabilities by visualizing the text generation process at the attention and\nrepresentation level. Then, we formulate the reasoning process of LLMs into a\ncausal framework, which provides a formal explanation of the problems we\nobserve in the visualization. Finally, building upon this causal framework, we\npropose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient\nfine-tuning (PEFT) method to enhance the model's reasoning capabilities by\nencouraging the model to extract the general problem-solving skills and apply\nthese skills to different questions. Experiments show that our method\noutperforms the baseline consistently across multiple benchmarks, and with only\n1.2M tunable parameters, we achieve better or comparable results to other\nfine-tuning methods. This demonstrates the effectiveness and efficiency of our\nmethod in improving the overall accuracy and reliability of LLMs.\n","authors":["Ruoyu Wang","Xiaoxuan Li","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02684v1","updated":"2024-09-04T13:16:20Z","published":"2024-09-04T13:16:20Z","title":"Neural timescales from a computational perspective","summary":" Timescales of neural activity are diverse across and within brain areas, and\nexperimental observations suggest that neural timescales reflect information in\ndynamic environments. However, these observations do not specify how neural\ntimescales are shaped, nor whether particular timescales are necessary for\nneural computations and brain function. Here, we take a complementary\nperspective and synthesize three directions where computational methods can\ndistill the broad set of empirical observations into quantitative and testable\ntheories: We review (i) how data analysis methods allow us to capture different\ntimescales of neural dynamics across different recording modalities, (ii) how\ncomputational models provide a mechanistic explanation for the emergence of\ndiverse timescales, and (iii) how task-optimized models in machine learning\nuncover the functional relevance of neural timescales. 
This integrative\ncomputational approach, combined with empirical findings, would provide a more\nholistic understanding of how neural timescales capture the relationship\nbetween brain structure, dynamics, and behavior.\n","authors":["Roxana Zeraati","Anna Levina","Jakob H. Macke","Richard Gao"],"pdf_url":"https://arxiv.org/pdf/2409.02684v1.pdf","comment":"18 pages, 4 figures, 2 boxes"},{"id":"http://arxiv.org/abs/2409.02681v1","updated":"2024-09-04T13:11:59Z","published":"2024-09-04T13:11:59Z","title":"Neural Networks with LSTM and GRU in Modeling Active Fires in the Amazon","summary":" This study presents a comprehensive methodology for modeling and forecasting\nthe historical time series of fire spots detected by the AQUA_M-T satellite in\nthe Amazon, Brazil. The approach utilizes a mixed Recurrent Neural Network\n(RNN) model, combining Long Short-Term Memory (LSTM) and Gated Recurrent Unit\n(GRU) architectures to predict monthly accumulations of daily detected fire\nspots. A summary of the data revealed a consistent seasonality over time, with\nannual maximum and minimum fire spot values tending to repeat at the same\nperiods each year. The primary objective is to verify whether the forecasts\ncapture this inherent seasonality through rigorous statistical analysis. The\nmethodology involved careful data preparation, model configuration, and\ntraining using cross-validation with two seeds, ensuring that the data\ngeneralizes well to the test and validation sets, and confirming the\nconvergence of the model parameters. The results indicate that the mixed LSTM\nand GRU model offers improved accuracy in forecasting 12 months ahead,\ndemonstrating its effectiveness in capturing complex temporal patterns and\nmodeling the observed time series. This research significantly contributes to\nthe application of deep learning techniques in environmental monitoring,\nspecifically in fire spot forecasting. In addition to improving forecast\naccuracy, the proposed approach highlights the potential for adaptation to\nother time series forecasting challenges, opening new avenues for research and\ndevelopment in machine learning and natural phenomenon prediction. Keywords:\nTime Series Forecasting, Recurrent Neural Networks, Deep Learning.\n","authors":["Ramon Tavares"],"pdf_url":"https://arxiv.org/pdf/2409.02681v1.pdf","comment":"16 pages, in Portuguese language, 24 figures"},{"id":"http://arxiv.org/abs/2212.05782v2","updated":"2024-09-04T13:06:58Z","published":"2022-12-12T09:09:39Z","title":"GT-CausIn: a novel causal-based insight for traffic prediction","summary":" Traffic forecasting is an important application of spatiotemporal series\nprediction. Among different methods, graph neural networks have achieved so far\nthe most promising results, learning relations between graph nodes then becomes\na crucial task. However, improvement space is very limited when these relations\nare learned in a node-to-node manner. The challenge stems from (1) obscure\ntemporal dependencies between different stations, (2) difficulties in defining\nvariables beyond the node level, and (3) no ready-made method to validate the\nlearned relations. To confront these challenges, we define legitimate traffic\ncausal variables to discover the causal relation inside the traffic network,\nwhich is carefully checked with statistic tools and case analysis. 
We then\npresent a novel model named Graph Spatial-Temporal Network Based on Causal\nInsight (GT-CausIn), where prior learned causal information is integrated with\ngraph diffusion layers and temporal convolutional network (TCN) layers.\nExperiments are carried out on two real-world traffic datasets: PEMS-BAY and\nMETR-LA, which show that GT-CausIn significantly outperforms the\nstate-of-the-art models on mid-term and long-term prediction.\n","authors":["Ting Gao","Rodrigo Kappes Marques","Lei Yu"],"pdf_url":"https://arxiv.org/pdf/2212.05782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02672v1","updated":"2024-09-04T13:00:59Z","published":"2024-09-04T13:00:59Z","title":"Independence Constrained Disentangled Representation Learning from\n Epistemological Perspective","summary":" Disentangled Representation Learning aims to improve the explainability of\ndeep learning methods by training a data encoder that identifies semantically\nmeaningful latent variables in the data generation process. Nevertheless, there\nis no consensus regarding a universally accepted definition for the objective\nof disentangled representation learning. In particular, there is a considerable\namount of discourse regarding whether should the latent variables be mutually\nindependent or not. In this paper, we first investigate these arguments on the\ninterrelationships between latent variables by establishing a conceptual bridge\nbetween Epistemology and Disentangled Representation Learning. Then, inspired\nby these interdisciplinary concepts, we introduce a two-level latent space\nframework to provide a general solution to the prior arguments on this issue.\nFinally, we propose a novel method for disentangled representation learning by\nemploying an integration of mutual information constraint and independence\nconstraint within the Generative Adversarial Network (GAN) framework.\nExperimental results demonstrate that our proposed method consistently\noutperforms baseline approaches in both quantitative and qualitative\nevaluations. The method exhibits strong performance across multiple commonly\nused metrics and demonstrates a great capability in disentangling various\nsemantic factors, leading to an improved quality of controllable generation,\nwhich consequently benefits the explainability of the algorithm.\n","authors":["Ruoyu Wang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01821v2","updated":"2024-09-04T12:58:11Z","published":"2024-09-03T12:03:45Z","title":"When Does Visual Prompting Outperform Linear Probing for Vision-Language\n Models? A Likelihood Perspective","summary":" Adapting pre-trained models to new tasks can exhibit varying effectiveness\nacross datasets. Visual prompting, a state-of-the-art parameter-efficient\ntransfer learning method, can significantly improve the performance of\nout-of-distribution tasks. On the other hand, linear probing, a standard\ntransfer learning method, can sometimes become the best approach. We propose a\nlog-likelihood ratio (LLR) approach to analyze the comparative benefits of\nvisual prompting and linear probing. By employing the LLR score alongside\nresource-efficient visual prompts approximations, our cost-effective measure\nattains up to a 100-fold reduction in run time compared to full training, while\nachieving prediction accuracies up to 91%. 
The source code is available at\nhttps://github.com/IBM/VP-LLR.\n","authors":["Hsi-Ai Tsao","Lei Hsiung","Pin-Yu Chen","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2409.01821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02669v1","updated":"2024-09-04T12:53:26Z","published":"2024-09-04T12:53:26Z","title":"Causality-Aware Transformer Networks for Robotic Navigation","summary":" Recent advances in machine learning algorithms have garnered growing interest\nin developing versatile Embodied AI systems. However, current research in this\ndomain reveals opportunities for improvement. First, the direct adoption of\nRNNs and Transformers often overlooks the specific differences between Embodied\nAI and traditional sequential data modelling, potentially limiting its\nperformance in Embodied AI tasks. Second, the reliance on task-specific\nconfigurations, such as pre-trained modules and dataset-specific logic,\ncompromises the generalizability of these methods. We address these constraints\nby initially exploring the unique differences between Embodied AI tasks and\nother sequential data tasks through the lens of Causality, presenting a causal\nframework to elucidate the inadequacies of conventional sequential methods for\nEmbodied AI. By leveraging this causal perspective, we propose Causality-Aware\nTransformer (CAT) Networks for Navigation, featuring a Causal Understanding\nModule to enhance the models's Environmental Understanding capability.\nMeanwhile, our method is devoid of task-specific inductive biases and can be\ntrained in an End-to-End manner, which enhances the method's generalizability\nacross various contexts. Empirical evaluations demonstrate that our methodology\nconsistently surpasses benchmark performances across a spectrum of settings,\ntasks and simulation environments. Extensive ablation studies reveal that the\nperformance gains can be attributed to the Causal Understanding Module, which\ndemonstrates effectiveness and efficiency in both Reinforcement Learning and\nSupervised Learning settings.\n","authors":["Ruoyu Wang","Yao Liu","Yuanjiang Cao","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02668v1","updated":"2024-09-04T12:51:41Z","published":"2024-09-04T12:51:41Z","title":"Introduction to Machine Learning","summary":" This book introduces the mathematical foundations and techniques that lead to\nthe development and analysis of many of the algorithms that are used in machine\nlearning. It starts with an introductory chapter that describes notation used\nthroughout the book and serve at a reminder of basic concepts in calculus,\nlinear algebra and probability and also introduces some measure theoretic\nterminology, which can be used as a reading guide for the sections that use\nthese tools. The introductory chapters also provide background material on\nmatrix analysis and optimization. The latter chapter provides theoretical\nsupport to many algorithms that are used in the book, including stochastic\ngradient descent, proximal methods, etc. After discussing basic concepts for\nstatistical prediction, the book includes an introduction to reproducing kernel\ntheory and Hilbert space techniques, which are used in many places, before\naddressing the description of various algorithms for supervised statistical\nlearning, including linear methods, support vector machines, decision trees,\nboosting, or neural networks. 
The subject then switches to generative methods,\nstarting with a chapter that presents sampling methods and an introduction to\nthe theory of Markov chains. The following chapter describe the theory of\ngraphical models, an introduction to variational methods for models with latent\nvariables, and to deep-learning based generative models. The next chapters\nfocus on unsupervised learning methods, for clustering, factor analysis and\nmanifold learning. The final chapter of the book is theory-oriented and\ndiscusses concentration inequalities and generalization bounds.\n","authors":["Laurent Younes"],"pdf_url":"https://arxiv.org/pdf/2409.02668v1.pdf","comment":"textbook"},{"id":"http://arxiv.org/abs/2312.02491v2","updated":"2024-09-04T12:43:48Z","published":"2023-12-05T04:43:23Z","title":"Pseudo Replay-based Class Continual Learning for Online New Category\n Anomaly Detection in Additive Manufacturing","summary":" The incorporation of advanced sensors and machine learning techniques has\nenabled modern manufacturing enterprises to perform data-driven\nclassification-based anomaly detection based on the sensor data collected in\nmanufacturing processes. However, one critical challenge is that newly\npresented defect category may manifest as the manufacturing process continues,\nresulting in monitoring performance deterioration of previously trained machine\nlearning models. Hence, there is an increasing need for empowering machine\nlearning models to learn continually. Among all continual learning methods,\nmemory-based continual learning has the best performance but faces the\nconstraints of data storage capacity. To address this issue, this paper\ndevelops a novel pseudo replay-based continual learning framework by\nintegrating class incremental learning and oversampling-based data generation.\nWithout storing all the data, the developed framework could generate\nhigh-quality data representing previous classes to train machine learning model\nincrementally when new category anomaly occurs. In addition, it could even\nenhance the monitoring performance since it also effectively improves the data\nquality. The effectiveness of the proposed framework is validated in three\ncases studies, which leverages supervised classification problem for anomaly\ndetection. The experimental results show that the developed method is very\npromising in detecting novel anomaly while maintaining a good performance on\nthe previous task and brings up more flexibility in model architecture.\n","authors":["Yuxuan Li","Tianxin Xie","Chenang Liu","Zhangyue Shi"],"pdf_url":"https://arxiv.org/pdf/2312.02491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10064v2","updated":"2024-09-04T12:28:52Z","published":"2024-01-22T12:17:27Z","title":"Navigating the Maize: Cyclic and conditional computational graphs for\n molecular simulation","summary":" Many computational chemistry and molecular simulation workflows can be\nexpressed as graphs. This abstraction is useful to modularize and potentially\nreuse existing components, as well as provide parallelization and ease\nreproducibility. Existing tools represent the computation as a directed acyclic\ngraph (DAG), thus allowing efficient execution by parallelization of concurrent\nbranches. These systems can, however, generally not express cyclic and\nconditional workflows. We therefore developed Maize, a workflow manager for\ncyclic and conditional graphs based on the principles of flow-based\nprogramming. 
By running each node of the graph concurrently in separate\nprocesses and allowing communication at any time through dedicated inter-node\nchannels, arbitrary graph structures can be executed. We demonstrate the\neffectiveness of the tool on a dynamic active learning task in computational\ndrug design, involving the use of a small molecule generative model and an\nassociated scoring system, and on a reactivity prediction pipeline using\nquantum-chemistry and semiempirical approaches.\n","authors":["Thomas Löhr","Michele Assante","Michael Dodds","Lili Cao","Mikhail Kabeshov","Jon-Paul Janet","Marco Klähn","Ola Engkvist"],"pdf_url":"https://arxiv.org/pdf/2402.10064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00980v2","updated":"2024-09-04T12:25:28Z","published":"2024-09-02T06:52:01Z","title":"DNN-GDITD: Out-of-distribution detection via Deep Neural Network based\n Gaussian Descriptor for Imbalanced Tabular Data","summary":" Classification tasks present challenges due to class imbalances and evolving\ndata distributions. Addressing these issues requires a robust method to handle\nimbalances while effectively detecting out-of-distribution (OOD) samples not\nencountered during training. This study introduces a novel OOD detection\nalgorithm designed for tabular datasets, titled Deep Neural Network-based\nGaussian Descriptor for Imbalanced Tabular Data (DNN-GDITD). The DNN-GDITD\nalgorithm can be placed on top of any DNN to facilitate better classification\nof imbalanced data and OOD detection using spherical decision boundaries. Using\na combination of Push, Score-based, and focal losses, DNN-GDITD assigns\nconfidence scores to test data points, categorizing them as known classes or as\nan OOD sample. Extensive experimentation on tabular datasets demonstrates the\neffectiveness of DNN-GDITD compared to three OOD algorithms. Evaluation\nencompasses imbalanced and balanced scenarios on diverse tabular datasets,\nincluding a synthetic financial dispute dataset and publicly available tabular\ndatasets like Gas Sensor, Drive Diagnosis, and MNIST, showcasing DNN-GDITD's\nversatility.\n","authors":["Priyanka Chudasama","Anil Surisetty","Aakarsh Malhotra","Alok Singh"],"pdf_url":"https://arxiv.org/pdf/2409.00980v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2409.02647v1","updated":"2024-09-04T12:23:47Z","published":"2024-09-04T12:23:47Z","title":"Learning-Based Error Detection System for Advanced Vehicle Instrument\n Cluster Rendering","summary":" The automotive industry is currently expanding digital display options with\nevery new model that comes onto the market. This entails not just an expansion\nin dimensions, resolution, and customization choices, but also the capability\nto employ novel display effects like overlays while assembling the content of\nthe display cluster. Unfortunately, this raises the need for appropriate\nmonitoring systems that can detect rendering errors and apply appropriate\ncountermeasures when required. Classical solutions such as Cyclic Redundancy\nChecks (CRC) will soon be no longer viable as any sort of alpha blending,\nwarping of scaling of content can cause unwanted CRC violations. Therefore, we\npropose a novel monitoring approach to verify correctness of displayed content\nusing telltales (e.g. warning signs) as example. It uses a learning-based\napproach to separate \"good\" telltales, i.e. those that a human driver will\nunderstand correctly, and \"corrupted\" telltales, i.e. those that will not be\nvisible or perceived correctly. 
As a result, it possesses inherent resilience\nagainst individual pixel errors and implicitly supports changing backgrounds,\noverlay or scaling effects. This is underlined by our experimental study where\nall \"corrupted\" test patterns were correctly classified, while no false alarms\nwere triggered.\n","authors":["Cornelius Bürkle","Fabian Oboril","Kay-Ulrich Scholl"],"pdf_url":"https://arxiv.org/pdf/2409.02647v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.02644v1","updated":"2024-09-04T12:20:27Z","published":"2024-09-04T12:20:27Z","title":"Conformal Prediction in Dynamic Biological Systems","summary":" Uncertainty quantification (UQ) is the process of systematically determining\nand characterizing the degree of confidence in computational model predictions.\nIn the context of systems biology, especially with dynamic models, UQ is\ncrucial because it addresses the challenges posed by nonlinearity and parameter\nsensitivity, allowing us to properly understand and extrapolate the behavior of\ncomplex biological systems. Here, we focus on dynamic models represented by\ndeterministic nonlinear ordinary differential equations. Many current UQ\napproaches in this field rely on Bayesian statistical methods. While powerful,\nthese methods often require strong prior specifications and make parametric\nassumptions that may not always hold in biological systems. Additionally, these\nmethods face challenges in domains where sample sizes are limited, and\nstatistical inference becomes constrained, with computational speed being a\nbottleneck in large models of biological systems. As an alternative, we propose\nthe use of conformal inference methods, introducing two novel algorithms that,\nin some instances, offer non-asymptotic guarantees, enhancing robustness and\nscalability across various applications. We demonstrate the efficacy of our\nproposed algorithms through several scenarios, highlighting their advantages\nover traditional Bayesian approaches. The proposed methods show promising\nresults for diverse biological data structures and scenarios, offering a\ngeneral framework to quantify uncertainty for dynamic models of biological\nsystems.The software for the methodology and the reproduction of the results is\navailable at https://zenodo.org/doi/10.5281/zenodo.13644870.\n","authors":["Alberto Portela","Julio R. Banga","Marcos Matabuena"],"pdf_url":"https://arxiv.org/pdf/2409.02644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00180v4","updated":"2024-09-04T11:56:13Z","published":"2023-03-01T02:14:20Z","title":"MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN\n for Precise Facial Expression Intensity Estimation","summary":" This paper presents MMA-MRNNet, a novel deep learning architecture for\ndynamic multi-output Facial Expression Intensity Estimation (FEIE) from video\ndata. Traditional approaches to this task often rely on complex 3-D CNNs, which\nrequire extensive pre-training and assume that facial expressions are uniformly\ndistributed across all frames of a video. These methods struggle to handle\nvideos of varying lengths, often resorting to ad-hoc strategies that either\ndiscard valuable information or introduce bias. MMA-MRNNet addresses these\nchallenges through a two-stage process. First, the Multiple Models of Affect\n(MMA) extractor component is a Multi-Task Learning CNN that concurrently\nestimates valence-arousal, recognizes basic facial expressions, and detects\naction units in each frame. 
These representations are then processed by a\nMasked RNN component, which captures temporal dependencies and dynamically\nupdates weights according to the true length of the input video, ensuring that\nonly the most relevant features are used for the final prediction. The proposed\nunimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction\ndataset and demonstrated significantly superior performance, surpassing\nstate-of-the-art methods by a wide margin, regardless of whether they were\nunimodal, multimodal, or ensemble approaches. Finally, we demonstrated the\neffectiveness of the MMA component of our proposed method across multiple\nin-the-wild datasets, where it consistently outperformed all state-of-the-art\nmethods across various metrics.\n","authors":["Dimitrios Kollias","Andreas Psaroudakis","Anastasios Arsenos","Paraskevi Theofilou","Chunchang Shao","Guanyu Hu","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2303.00180v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00208v3","updated":"2024-09-04T11:48:04Z","published":"2023-11-01T00:38:26Z","title":"What Formal Languages Can Transformers Express? A Survey","summary":" As transformers have gained prominence in natural language processing, some\nresearchers have investigated theoretically what problems they can and cannot\nsolve, by treating problems as formal languages. Exploring such questions can\nhelp clarify the power of transformers relative to other models of computation,\ntheir fundamental capabilities and limits, and the impact of architectural\nchoices. Work in this subarea has made considerable progress in recent years.\nHere, we undertake a comprehensive survey of this work, documenting the diverse\nassumptions that underlie different results and providing a unified framework\nfor harmonizing seemingly contradictory findings.\n","authors":["Lena Strobl","William Merrill","Gail Weiss","David Chiang","Dana Angluin"],"pdf_url":"https://arxiv.org/pdf/2311.00208v3.pdf","comment":"One minor correction in {\\S}5.1"},{"id":"http://arxiv.org/abs/2307.13565v4","updated":"2024-09-04T11:47:12Z","published":"2023-07-25T15:17:31Z","title":"Decision-Focused Learning: Foundations, State of the Art, Benchmark and\n Future Opportunities","summary":" Decision-focused learning (DFL) is an emerging paradigm that integrates\nmachine learning (ML) and constrained optimization to enhance decision quality\nby training ML models in an end-to-end system. This approach shows significant\npotential to revolutionize combinatorial decision-making in real-world\napplications that operate under uncertainty, where estimating unknown\nparameters within decision models is a major challenge. This paper presents a\ncomprehensive review of DFL, providing an in-depth analysis of both\ngradient-based and gradient-free techniques used to combine ML and constrained\noptimization. It evaluates the strengths and limitations of these techniques\nand includes an extensive empirical evaluation of eleven methods across seven\nproblems. 
The survey also offers insights into recent advancements and future\nresearch directions in DFL.\n Code and benchmark: https://github.com/PredOpt/predopt-benchmarks\n","authors":["Jayanta Mandi","James Kotary","Senne Berden","Maxime Mulamba","Victor Bucarey","Tias Guns","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2307.13565v4.pdf","comment":"Experimental Survey and Benchmarking"},{"id":"http://arxiv.org/abs/2409.02629v1","updated":"2024-09-04T11:47:00Z","published":"2024-09-04T11:47:00Z","title":"AdvSecureNet: A Python Toolkit for Adversarial Machine Learning","summary":" Machine learning models are vulnerable to adversarial attacks. Several tools\nhave been developed to research these vulnerabilities, but they often lack\ncomprehensive features and flexibility. We introduce AdvSecureNet, a PyTorch\nbased toolkit for adversarial machine learning that is the first to natively\nsupport multi-GPU setups for attacks, defenses, and evaluation. It is the first\ntoolkit that supports both CLI and API interfaces and external YAML\nconfiguration files to enhance versatility and reproducibility. The toolkit\nincludes multiple attacks, defenses and evaluation metrics. Rigiorous software\nengineering practices are followed to ensure high code quality and\nmaintainability. The project is available as an open-source project on GitHub\nat https://github.com/melihcatal/advsecurenet and installable via PyPI.\n","authors":["Melih Catal","Manuel Günther"],"pdf_url":"https://arxiv.org/pdf/2409.02629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02628v1","updated":"2024-09-04T11:45:55Z","published":"2024-09-04T11:45:55Z","title":"(Implicit) Ensembles of Ensembles: Epistemic Uncertainty Collapse in\n Large Models","summary":" Epistemic uncertainty is crucial for safety-critical applications and\nout-of-distribution detection tasks. Yet, we uncover a paradoxical phenomenon\nin deep learning models: an epistemic uncertainty collapse as model complexity\nincreases, challenging the assumption that larger models invariably offer\nbetter uncertainty quantification. We propose that this stems from implicit\nensembling within large models. To support this hypothesis, we demonstrate\nepistemic uncertainty collapse empirically across various architectures, from\nexplicit ensembles of ensembles and simple MLPs to state-of-the-art vision\nmodels, including ResNets and Vision Transformers -- for the latter, we examine\nimplicit ensemble extraction and decompose larger models into diverse\nsub-models, recovering epistemic uncertainty. We provide theoretical\njustification for these phenomena and explore their implications for\nuncertainty estimation.\n","authors":["Andreas Kirsch"],"pdf_url":"https://arxiv.org/pdf/2409.02628v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.07569v2","updated":"2024-09-04T11:34:33Z","published":"2024-04-11T08:57:48Z","title":"Can Vehicle Motion Planning Generalize to Realistic Long-tail Scenarios?","summary":" Real-world autonomous driving systems must make safe decisions in the face of\nrare and diverse traffic scenarios. Current state-of-the-art planners are\nmostly evaluated on real-world datasets like nuScenes (open-loop) or nuPlan\n(closed-loop). In particular, nuPlan seems to be an expressive evaluation\nmethod since it is based on real-world data and closed-loop, yet it mostly\ncovers basic driving scenarios. This makes it difficult to judge a planner's\ncapabilities to generalize to rarely-seen situations. 
Therefore, we propose a\nnovel closed-loop benchmark interPlan containing several edge cases and\nchallenging driving scenarios. We assess existing state-of-the-art planners on\nour benchmark and show that neither rule-based nor learning-based planners can\nsafely navigate the interPlan scenarios. A recently evolving direction is the\nusage of foundation models like large language models (LLM) to handle\ngeneralization. We evaluate an LLM-only planner and introduce a novel hybrid\nplanner that combines an LLM-based behavior planner with a rule-based motion\nplanner that achieves state-of-the-art performance on our benchmark.\n","authors":["Marcel Hallgarten","Julian Zapata","Martin Stoll","Katrin Renz","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2404.07569v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16582v2","updated":"2024-09-04T11:14:18Z","published":"2024-03-25T09:49:42Z","title":"In the Search for Optimal Multi-view Learning Models for Crop\n Classification with Global Remote Sensing Data","summary":" Studying and analyzing cropland is a difficult task due to its dynamic and\nheterogeneous growth behavior. Usually, diverse data sources can be collected\nfor its estimation. Although deep learning models have proven to excel in the\ncrop classification task, they face substantial challenges when dealing with\nmultiple inputs, named Multi-View Learning (MVL). The methods used in the MVL\nscenario can be structured based on the encoder architecture, the fusion\nstrategy, and the optimization technique. The literature has primarily focused\non using specific encoder architectures for local regions, lacking a deeper\nexploration of other components in the MVL methodology. In contrast, we\ninvestigate the simultaneous selection of the fusion strategy and encoder\narchitecture, assessing global-scale cropland and crop-type classifications. We\nuse a range of five fusion strategies (Input, Feature, Decision, Ensemble,\nHybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible\nconfigurations in the MVL method. We use the CropHarvest dataset for\nvalidation, which provides optical, radar, weather time series, and topographic\ninformation as input data. We found that in scenarios with a limited number of\nlabeled samples, a unique configuration is insufficient for all the cases.\nInstead, a specialized combination should be meticulously sought, including an\nencoder and fusion strategy. To streamline this search process, we suggest\nidentifying the optimal encoder architecture tailored for a particular fusion\nstrategy, and then determining the most suitable fusion strategy for the\nclassification task. We provide a methodological framework for researchers\nexploring crop classification through an MVL methodology.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.16582v2.pdf","comment":"submitted to journal"},{"id":"http://arxiv.org/abs/2407.15512v2","updated":"2024-09-04T11:01:47Z","published":"2024-07-22T09:58:29Z","title":"Increasing the Robustness of Model Predictions to Missing Sensors in\n Earth Observation","summary":" Multi-sensor ML models for EO aim to enhance prediction accuracy by\nintegrating data from various sources. However, the presence of missing data\nposes a significant challenge, particularly in non-persistent sensors that can\nbe affected by external factors. 
Existing literature has explored strategies\nlike temporal dropout and sensor-invariant models to address the generalization\nto missing data issues. Inspired by these works, we study two novel methods\ntailored for multi-sensor scenarios, namely Input Sensor Dropout (ISensD) and\nEnsemble Sensor Invariant (ESensI). Through experimentation on three\nmulti-sensor temporal EO datasets, we demonstrate that these methods\neffectively increase the robustness of model predictions to missing sensors.\nParticularly, we focus on how the predictive performance of models drops when\nsensors are missing at different levels. We observe that ensemble multi-sensor\nmodels are the most robust to the lack of sensors. In addition, the sensor\ndropout component in ISensD shows promising robustness results.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2407.15512v2.pdf","comment":"Accepted at the MACLEAN workshop in the ECML/PKDD 2024"},{"id":"http://arxiv.org/abs/2401.15113v3","updated":"2024-09-04T10:59:10Z","published":"2024-01-25T20:41:17Z","title":"Scalable Glacier Mapping using Deep Learning and Open Earth Observation\n Data Matches the Accuracy of Manual Delineation","summary":" Accurate global glacier mapping is critical for understanding climate change\nimpacts. Despite its importance, automated glacier mapping at a global scale\nremains largely unexplored. Here we address this gap and propose\nGlacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep\nlearning model, and five strategies for multitemporal global-scale glacier\nmapping using open satellite imagery. Assessing the spatial, temporal and\ncross-sensor generalisation shows that our best strategy achieves intersection\nover union >0.85 on previously unobserved images in most cases, which drops to\n>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90\nfor regions dominated by clean ice. A comparative validation against human\nexpert uncertainties in terms of area and distance deviations underscores\nGlaViTU performance, approaching or matching expert-level delineation. Adding\nsynthetic aperture radar data, namely, backscatter and interferometric\ncoherence, increases the accuracy in all regions where available. The\ncalibrated confidence for glacier extents is reported making the predictions\nmore reliable and interpretable. We also release a benchmark dataset that\ncovers 9% of glaciers worldwide. Our results support efforts towards automated\nmultitemporal and global glacier mapping.\n","authors":["Konstantin A. Maslov","Claudio Persello","Thomas Schellenberger","Alfred Stein"],"pdf_url":"https://arxiv.org/pdf/2401.15113v3.pdf","comment":"after major revision, expanded validation"},{"id":"http://arxiv.org/abs/2409.01137v2","updated":"2024-09-04T10:58:57Z","published":"2024-09-02T10:19:31Z","title":"Smart E-commerce Recommendations with Semantic AI","summary":" In e-commerce, web mining for page recommendations is widely used but often\nfails to meet user needs. To address this, we propose a novel solution\ncombining semantic web mining with BP neural networks. We process user search\nlogs to extract five key features: content priority, time spent, user feedback,\nrecommendation semantics, and input deviation. These features are then fed into\na BP neural network to classify and prioritize web pages. The prioritized pages\nare recommended to users. 
Using book sales pages for testing, our results\ndemonstrate that this solution can quickly and accurately identify the pages\nusers need. Our approach ensures that recommendations are more relevant and\ntailored to individual preferences, enhancing the online shopping experience.\nBy leveraging advanced semantic analysis and neural network techniques, we\nbridge the gap between user expectations and actual recommendations. This\ninnovative method not only improves accuracy but also speeds up the\nrecommendation process, making it a valuable tool for e-commerce platforms\naiming to boost user satisfaction and engagement. Additionally, our system\nability to handle large datasets and provide real-time recommendations makes it\na scalable and efficient solution for modern e-commerce challenges.\n","authors":["M. Badouch","M. Boutaounte"],"pdf_url":"https://arxiv.org/pdf/2409.01137v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2409.00125v2","updated":"2024-09-04T10:48:52Z","published":"2024-08-28T22:02:42Z","title":"A Hybrid Framework for Spatial Interpolation: Merging Data-driven with\n Domain Knowledge","summary":" Estimating spatially distributed information through the interpolation of\nscattered observation datasets often overlooks the critical role of domain\nknowledge in understanding spatial dependencies. Additionally, the features of\nthese data sets are typically limited to the spatial coordinates of the\nscattered observation locations. In this paper, we propose a hybrid framework\nthat integrates data-driven spatial dependency feature extraction with\nrule-assisted spatial dependency function mapping to augment domain knowledge.\nWe demonstrate the superior performance of our framework in two comparative\napplication scenarios, highlighting its ability to capture more localized\nspatial features in the reconstructed distribution fields. Furthermore, we\nunderscore its potential to enhance nonlinear estimation capabilities through\nthe application of transformed fuzzy rules and to quantify the inherent\nuncertainties associated with the observation data sets. Our framework\nintroduces an innovative approach to spatial information estimation by\nsynergistically combining observational data with rule-assisted domain\nknowledge.\n","authors":["Cong Zhang","Shuyi Du","Hongqing Song","Yuhe Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00125v2.pdf","comment":"21 pages, 13 figures; typos corrected, references updated"},{"id":"http://arxiv.org/abs/2409.02604v1","updated":"2024-09-04T10:37:44Z","published":"2024-09-04T10:37:44Z","title":"Hypothesizing Missing Causal Variables with LLMs","summary":" Scientific discovery is a catalyst for human intellectual advances, driven by\nthe cycle of hypothesis generation, experimental design, data evaluation, and\niterative assumption refinement. This process, while crucial, is expensive and\nheavily dependent on the domain knowledge of scientists to generate hypotheses\nand navigate the scientific cycle. Central to this is causality, the ability to\nestablish the relationship between the cause and the effect. Motivated by the\nscientific discovery process, in this work, we formulate a novel task where the\ninput is a partial causal graph with missing variables, and the output is a\nhypothesis about the missing variables to complete the partial graph. We design\na benchmark with varying difficulty levels and knowledge assumptions about the\ncausal graph. 
With the growing interest in using Large Language Models (LLMs)\nto assist in scientific discovery, we benchmark open-source and closed models\non our testbed. We show the strong ability of LLMs to hypothesize the mediation\nvariables between a cause and its effect. In contrast, they underperform in\nhypothesizing the cause and effect variables themselves. We also observe\nsurprising results where some of the open-source models outperform the closed\nGPT-4 model.\n","authors":["Ivaxi Sheth","Sahar Abdelnabi","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2409.02604v1.pdf","comment":"Code - https://github.com/ivaxi0s/hypothesizing-causal-variable-llm"},{"id":"http://arxiv.org/abs/2409.02599v1","updated":"2024-09-04T10:30:11Z","published":"2024-09-04T10:30:11Z","title":"A Fashion Item Recommendation Model in Hyperbolic Space","summary":" In this work, we propose a fashion item recommendation model that\nincorporates hyperbolic geometry into user and item representations. Using\nhyperbolic space, our model aims to capture implicit hierarchies among items\nbased on their visual data and users' purchase history. During training, we\napply a multi-task learning framework that considers both hyperbolic and\nEuclidean distances in the loss function. Our experiments on three data sets\nshow that our model performs better than previous models trained in Euclidean\nspace only, confirming the effectiveness of our model. Our ablation studies\nshow that multi-task learning plays a key role, and removing the Euclidean loss\nsubstantially deteriorates the model performance.\n","authors":["Ryotaro Shimizu","Yu Wang","Masanari Kimura","Yuki Hirakawa","Takashi Wada","Yuki Saito","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.02599v1.pdf","comment":"This work was presented at the CVFAD Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2409.02596v1","updated":"2024-09-04T10:27:07Z","published":"2024-09-04T10:27:07Z","title":"An Analysis of Linear Complexity Attention Substitutes with BEST-RQ","summary":" Self-Supervised Learning (SSL) has proven to be effective in various domains,\nincluding speech processing. However, SSL is computationally and memory\nexpensive. This is in part due the quadratic complexity of multi-head\nself-attention (MHSA). Alternatives for MHSA have been proposed and used in the\nspeech domain, but have yet to be investigated properly in an SSL setting. In\nthis work, we study the effects of replacing MHSA with recent state-of-the-art\nalternatives that have linear complexity, namely, HyperMixing, Fastformer,\nSummaryMixing, and Mamba. We evaluate these methods by looking at the speed,\nthe amount of VRAM consumed, and the performance on the SSL MP3S benchmark.\nResults show that these linear alternatives maintain competitive performance\ncompared to MHSA while, on average, decreasing VRAM consumption by around 20%\nto 60% and increasing speed from 7% to 65% for input sequences ranging from 20\nto 80 seconds.\n","authors":["Ryan Whetten","Titouan Parcollet","Adel Moumen","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2409.02596v1.pdf","comment":"Accepted in the IEEE Soken Language Technology Workshop 2024"},{"id":"http://arxiv.org/abs/2405.04296v2","updated":"2024-09-04T10:23:04Z","published":"2024-05-07T13:11:37Z","title":"Open Implementation and Study of BEST-RQ for Speech Processing","summary":" Self-Supervised Learning (SSL) has proven to be useful in various speech\ntasks. 
However, these methods are generally very demanding in terms of data,\nmemory, and computational resources. BERT-based Speech pre-Training with\nRandom-projection Quantizer (BEST-RQ), is an SSL method that has shown great\nperformance on Automatic Speech Recognition (ASR) while being simpler than\nother SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,\ndetails are lacking in the original paper, such as the amount of GPU/TPU hours\nused in pre-training, and there is no official easy-to-use open-source\nimplementation. Furthermore, BEST-RQ has not been evaluated on other downstream\ntasks aside from ASR and speech translation. In this work, we describe a\nre-implementation of a Random-projection quantizer and perform a preliminary\nstudy with a comparison to wav2vec 2.0 on four downstream tasks. We discuss the\ndetails and differences of our implementation. We show that a random projection\nquantizer can achieve similar downstream performance as wav2vec 2.0 while\ndecreasing training time by over a factor of two.\n","authors":["Ryan Whetten","Titouan Parcollet","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2405.04296v2.pdf","comment":"Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio,\n Speech and Beyond (SASB 2024)"},{"id":"http://arxiv.org/abs/2302.13696v5","updated":"2024-09-04T10:21:32Z","published":"2023-02-27T11:55:24Z","title":"Moderate Adaptive Linear Units (MoLU)","summary":" We propose a new high-performance activation function, Moderate Adaptive\nLinear Units (MoLU), for the deep neural network. The MoLU is a simple,\nbeautiful and powerful activation function that can be a good main activation\nfunction among hundreds of activation functions. Because the MoLU is made up of\nthe elementary functions, not only it is a diffeomorphism (i.e. analytic over\nwhole domains), but also it reduces the training time.\n","authors":["Hankyul Koh","Joon-hyuk Ko","Wonho Jhe"],"pdf_url":"https://arxiv.org/pdf/2302.13696v5.pdf","comment":"4 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.01227v2","updated":"2024-09-04T10:20:59Z","published":"2024-09-02T13:02:51Z","title":"Prompt Compression with Context-Aware Sentence Encoding for Fast and\n Improved LLM Inference","summary":" Large language models (LLMs) have triggered a new stream of research focusing\non compressing the context length to reduce the computational cost while\nensuring the retention of helpful information for LLMs to answer the given\nquestion. Token-based removal methods are one of the most prominent approaches\nin this direction, but risk losing the semantics of the context caused by\nintermediate token removal, especially under high compression ratios, while\nalso facing challenges in computational efficiency. In this work, we propose\ncontext-aware prompt compression (CPC), a sentence-level prompt compression\ntechnique where its key innovation is a novel context-aware sentence encoder\nthat provides a relevance score for each sentence for a given question. To\ntrain this encoder, we generate a new dataset consisting of questions,\npositives, and negative pairs where positives are sentences relevant to the\nquestion, while negatives are irrelevant context sentences. We train the\nencoder in a contrastive setup to learn context-aware sentence representations.\nOur method considerably outperforms prior works on prompt compression on\nbenchmark datasets and is up to 10.93x faster at inference compared to the best\ntoken-level compression method. 
We also find better improvement for shorter\nlength constraints in most benchmarks, showing the effectiveness of our\nproposed solution in the compression of relevant information in a shorter\ncontext. Finally, we release the code and the dataset for quick reproducibility\nand further development: https://github.com/Workday/cpc.\n","authors":["Barys Liskavets","Maxim Ushakov","Shuvendu Roy","Mark Klibanov","Ali Etemad","Shane Luke"],"pdf_url":"https://arxiv.org/pdf/2409.01227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15421v2","updated":"2024-09-04T10:17:22Z","published":"2024-08-27T21:54:26Z","title":"Simultaneous Training of First- and Second-Order Optimizers in\n Population-Based Reinforcement Learning","summary":" The tuning of hyperparameters in reinforcement learning (RL) is critical, as\nthese parameters significantly impact an agent's performance and learning\nefficiency. Dynamic adjustment of hyperparameters during the training process\ncan significantly enhance both the performance and stability of learning.\nPopulation-based training (PBT) provides a method to achieve this by\ncontinuously tuning hyperparameters throughout the training. This ongoing\nadjustment enables models to adapt to different learning stages, resulting in\nfaster convergence and overall improved performance. In this paper, we propose\nan enhancement to PBT by simultaneously utilizing both first- and second-order\noptimizers within a single population. We conducted a series of experiments\nusing the TD3 algorithm across various MuJoCo environments. Our results, for\nthe first time, empirically demonstrate the potential of incorporating\nsecond-order optimizers within PBT-based RL. Specifically, the combination of\nthe K-FAC optimizer with Adam led to up to a 10% improvement in overall\nperformance compared to PBT using only Adam. Additionally, in environments\nwhere Adam occasionally fails, such as the Swimmer environment, the mixed\npopulation with K-FAC exhibited more reliable learning outcomes, offering a\nsignificant advantage in training stability without a substantial increase in\ncomputational time.\n","authors":["Felix Pfeiffer","Shahram Eivazi"],"pdf_url":"https://arxiv.org/pdf/2408.15421v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02588v1","updated":"2024-09-04T10:14:17Z","published":"2024-09-04T10:14:17Z","title":"Multiview Random Vector Functional Link Network for Predicting\n DNA-Binding Proteins","summary":" The identification of DNA-binding proteins (DBPs) is a critical task due to\ntheir significant impact on various biological activities. Understanding the\nmechanisms underlying protein-DNA interactions is essential for elucidating\nvarious life activities. In recent years, machine learning-based models have\nbeen prominently utilized for DBP prediction. In this paper, to predict DBPs,\nwe propose a novel framework termed a multiview random vector functional link\n(MvRVFL) network, which fuses neural network architecture with multiview\nlearning. The proposed MvRVFL model combines the benefits of late and early\nfusion, allowing for distinct regularization parameters across different views\nwhile leveraging a closed-form solution to determine unknown parameters\nefficiently. The primal objective function incorporates a coupling term aimed\nat minimizing a composite of errors stemming from all views. From each of the\nthree protein views of the DBP datasets, we extract five features. 
These\nfeatures are then fused together by incorporating a hidden feature during the\nmodel training process. The performance of the proposed MvRVFL model on the DBP\ndataset surpasses that of baseline models, demonstrating its superior\neffectiveness. Furthermore, we extend our assessment to the UCI, KEEL, AwA, and\nCorel5k datasets, to establish the practicality of the proposed models. The\nconsistency error bound, the generalization error bound, and empirical\nfindings, coupled with rigorous statistical analyses, confirm the superior\ngeneralization capabilities of the MvRVFL model compared to the baseline\nmodels.\n","authors":["A. Quadir","M. Sajid","M. Tanveer"],"pdf_url":"https://arxiv.org/pdf/2409.02588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02584v1","updated":"2024-09-04T10:06:42Z","published":"2024-09-04T10:06:42Z","title":"BMI Prediction from Handwritten English Characters Using a Convolutional\n Neural Network","summary":" A person's Body Mass Index, or BMI, is the most widely used parameter for\nassessing their health. BMI is a crucial predictor of potential diseases that\nmay arise at higher body fat levels because it is correlated with body fat.\nConversely, a community's or an individual's nutritional status can be\ndetermined using the BMI. Although deep learning models are used in several\nstudies to estimate BMI from face photos and other data, no previous research\nestablished a clear connection between deep learning techniques for handwriting\nanalysis and BMI prediction. This article addresses this research gap with a\ndeep learning approach to estimating BMI from handwritten characters by\ndeveloping a convolutional neural network (CNN). A dataset containing samples\nfrom 48 people in lowercase English scripts is successfully captured for the\nBMI prediction task. The proposed CNN-based approach reports a commendable\naccuracy of 99.92%. Performance comparison with other popular CNN architectures\nreveals that AlexNet and InceptionV3 achieve the second and third-best\nperformance, with the accuracy of 99.69% and 99.53%, respectively.\n","authors":["N. T. Diba","N. Akter","S. A. H. Chowdhury","J. E. Giti"],"pdf_url":"https://arxiv.org/pdf/2409.02584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04985v6","updated":"2024-09-04T10:04:52Z","published":"2023-12-08T11:47:35Z","title":"SparQ Attention: Bandwidth-Efficient LLM Inference","summary":" The computational difficulties of large language model (LLM) inference remain\na significant obstacle to their widespread deployment. The need for many\napplications to support long input sequences and process them in large batches\ntypically causes token-generation to be bottlenecked by data transfer. For this\nreason, we introduce SparQ Attention, a technique for increasing the inference\nthroughput of LLMs by utilising memory bandwidth more efficiently within the\nattention layers, through selective fetching of the cached history. Our\nproposed technique can be applied directly to off-the-shelf LLMs during\ninference, without requiring any modification to the pre-training setup or\nadditional fine-tuning. 
We show that SparQ Attention brings up to 8x savings in\nattention data transfers without substantial drops in accuracy, by evaluating\nLlama 2 and 3, Mistral, Gemma and Pythia models on a wide range of downstream\ntasks.\n","authors":["Luka Ribar","Ivan Chelombiev","Luke Hudlass-Galley","Charlie Blake","Carlo Luschi","Douglas Orr"],"pdf_url":"https://arxiv.org/pdf/2312.04985v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02572v1","updated":"2024-09-04T09:46:33Z","published":"2024-09-04T09:46:33Z","title":"Advancing Cyber Incident Timeline Analysis Through Rule Based AI and\n Large Language Models","summary":" Timeline Analysis (TA) is a key part of Timeline Forensics (TF) in Digital\nForensics (DF), focusing primarily on examining and analysing temporal digital\nartefacts such as timestamps, derived from event logs, file metadata, and other\nrelated data to correlate events resulting from cyber incidents and reconstruct\ntheir chronological timeline. Traditional tools often struggle to efficiently\nprocess the vast volume and variety of data acquired during DF investigations\nand Incident Response (IR) processes. This paper presents a novel framework,\nGenDFIR, that combines Rule-Based Artificial Intelligence (R-BAI) algorithms\nwith Large Language Models (LLMs) to advance and automate the TA process. Our\napproach consists of two main stages (1) We use R-BAI to identify and select\nanomalous digital artefacts based on predefined rules. (2) The selected\nartefacts are then converted into embeddings for processing by an LLM with the\nhelp of a Retrieval-Augmented Generation (RAG) agent. The LLM consequently\nleverages its capabilities to perform automated TA on the artefacts and predict\npotential incident scenarios. To validate our framework, we evaluate GenDFIR\nperformance, efficiency, and reliability using various metrics across synthetic\ncyber incident simulation scenarios. This paper presents a proof of concept,\nwhere the findings demonstrate the significant potential of integrating R-BAI\nand LLMs for TA. This novel approach highlights the power of Generative AI\n(GenAI), specifically LLMs, and opens new avenues for advanced threat detection\nand incident reconstruction, representing a significant step forward in the\nfield.\n","authors":["Fatma Yasmine Loumachi","Mohamed Chahine Ghanem"],"pdf_url":"https://arxiv.org/pdf/2409.02572v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2012.15079v2","updated":"2024-09-04T09:44:38Z","published":"2020-12-30T08:31:31Z","title":"Enhancing Sindhi Word Segmentation using Subword Representation Learning\n and Position-aware Self-attention","summary":" Sindhi word segmentation is a challenging task due to space omission and\ninsertion issues. The Sindhi language itself adds to this complexity. It's\ncursive and consists of characters with inherent joining and non-joining\nproperties, independent of word boundaries. Existing Sindhi word segmentation\nmethods rely on designing and combining hand-crafted features. However, these\nmethods have limitations, such as difficulty handling out-of-vocabulary words,\nlimited robustness for other languages, and inefficiency with large amounts of\nnoisy or raw text. Neural network-based models, in contrast, can automatically\ncapture word boundary information without requiring prior knowledge. In this\npaper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses\nword segmentation as a sequence labeling task. 
The SGNWS model incorporates\nsubword representation learning through a bidirectional long short-term memory\nencoder, position-aware self-attention, and a conditional random field. Our\nempirical results demonstrate that the SGNWS model achieves state-of-the-art\nperformance in Sindhi word segmentation on six datasets.\n","authors":["Wazir Ali","Jay Kumar","Saifullah Tumrani","Redhwan Nour","Adeeb Noor","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2012.15079v2.pdf","comment":"Journal Paper, 14 pages"},{"id":"http://arxiv.org/abs/2409.02555v1","updated":"2024-09-04T09:21:13Z","published":"2024-09-04T09:21:13Z","title":"Low-Resolution Object Recognition with Cross-Resolution Relational\n Contrastive Distillation","summary":" Recognizing objects in low-resolution images is a challenging task due to the\nlack of informative details. Recent studies have shown that knowledge\ndistillation approaches can effectively transfer knowledge from a\nhigh-resolution teacher model to a low-resolution student model by aligning\ncross-resolution representations. However, these approaches still face\nlimitations in adapting to the situation where the recognized objects exhibit\nsignificant representation discrepancies between training and testing images.\nIn this study, we propose a cross-resolution relational contrastive\ndistillation approach to facilitate low-resolution object recognition. Our\napproach enables the student model to mimic the behavior of a well-trained\nteacher model which delivers high accuracy in identifying high-resolution\nobjects. To extract sufficient knowledge, the student learning is supervised\nwith contrastive relational distillation loss, which preserves the similarities\nin various relational structures in contrastive representation space. In this\nmanner, the capability of recovering missing details of familiar low-resolution\nobjects can be effectively enhanced, leading to a better knowledge transfer.\nExtensive experiments on low-resolution object classification and\nlow-resolution face recognition clearly demonstrate the effectiveness and\nadaptability of our approach.\n","authors":["Kangkai Zhang","Shiming Ge","Ruixin Shi","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02555v1.pdf","comment":"This paper is accepted by IEEE Transactions on Circuits and Systems\n for Video Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2409.02530v1","updated":"2024-09-04T08:44:36Z","published":"2024-09-04T08:44:36Z","title":"Understanding eGFR Trajectories and Kidney Function Decline via Large\n Multimodal Models","summary":" The estimated Glomerular Filtration Rate (eGFR) is an essential indicator of\nkidney function in clinical practice. Although traditional equations and\nMachine Learning (ML) models using clinical and laboratory data can estimate\neGFR, accurately predicting future eGFR levels remains a significant challenge\nfor nephrologists and ML researchers. Recent advances demonstrate that Large\nLanguage Models (LLMs) and Large Multimodal Models (LMMs) can serve as robust\nfoundation models for diverse applications. This study investigates the\npotential of LMMs to predict future eGFR levels with a dataset consisting of\nlaboratory and clinical values from 50 patients. 
By integrating various\nprompting techniques and ensembles of LMMs, our findings suggest that these\nmodels, when combined with precise prompts and visual representations of eGFR\ntrajectories, offer predictive performance comparable to existing ML models.\nThis research extends the application of foundation models and suggests avenues\nfor future studies to harness these models in addressing complex medical\nforecasting challenges.\n","authors":["Chih-Yuan Li","Jun-Ting Wu","Chan Hsu","Ming-Yen Lin","Yihuang Kang"],"pdf_url":"https://arxiv.org/pdf/2409.02530v1.pdf","comment":"This preprint version includes corrections of typographical errors\n related to numerical values in Table 2, which were present in the version\n published at the BDH workshop in MIPR 2024. These corrections do not affect\n the overall conclusions of the study"},{"id":"http://arxiv.org/abs/2409.02529v1","updated":"2024-09-04T08:42:42Z","published":"2024-09-04T08:42:42Z","title":"Sample what you cant compress","summary":" For learned image representations, basic autoencoders often produce blurry\nresults. Reconstruction quality can be improved by incorporating additional\npenalties such as adversarial (GAN) and perceptual losses. Arguably, these\napproaches lack a principled interpretation. Concurrently, in generative\nsettings diffusion has demonstrated a remarkable ability to create crisp, high\nquality results and has solid theoretical underpinnings (from variational\ninference to direct study as the Fisher Divergence). Our work combines\nautoencoder representation learning with diffusion and is, to our knowledge,\nthe first to demonstrate the efficacy of jointly learning a continuous encoder\nand decoder under a diffusion-based loss. We demonstrate that this approach\nyields better reconstruction quality as compared to GAN-based autoencoders\nwhile being easier to tune. We also show that the resulting representation is\neasier to model with a latent diffusion model as compared to the representation\nobtained from a state-of-the-art GAN-based loss. Since our decoder is\nstochastic, it can generate details not encoded in the otherwise deterministic\nlatent representation; we therefore name our approach \"Sample what you can't\ncompress\", or SWYCC for short.\n","authors":["Vighnesh Birodkar","Gabriel Barcik","James Lyon","Sergey Ioffe","David Minnen","Joshua V. Dillon"],"pdf_url":"https://arxiv.org/pdf/2409.02529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02517v1","updated":"2024-09-04T08:25:54Z","published":"2024-09-04T08:25:54Z","title":"Training Universal Vocoders with Feature Smoothing-Based Augmentation\n Methods for High-Quality TTS Systems","summary":" While universal vocoders have achieved proficient waveform generation across\ndiverse voices, their integration into text-to-speech (TTS) tasks often results\nin degraded synthetic quality. To address this challenge, we present a novel\naugmentation technique for training universal vocoders. Our training scheme\nrandomly applies linear smoothing filters to input acoustic features,\nfacilitating vocoder generalization across a wide range of smoothings. It\nsignificantly mitigates the training-inference mismatch, enhancing the\nnaturalness of synthetic output even when the acoustic model produces overly\nsmoothed features. Notably, our method is applicable to any vocoder without\nrequiring architectural modifications or dependencies on specific acoustic\nmodels. 
The experimental results validate the superiority of our vocoder over\nconventional methods, achieving 11.99% and 12.05% improvements in mean opinion\nscores when integrated with Tacotron 2 and FastSpeech 2 TTS acoustic models,\nrespectively.\n","authors":["Jeongmin Liu","Eunwoo Song"],"pdf_url":"https://arxiv.org/pdf/2409.02517v1.pdf","comment":"4 pages, 4 figures, for demo samples, see\n https://sytronik.github.io/demos/voc_smth_aug/"},{"id":"http://arxiv.org/abs/2409.02512v1","updated":"2024-09-04T08:21:47Z","published":"2024-09-04T08:21:47Z","title":"Continual Diffuser (CoD): Mastering Continual Offline Reinforcement\n Learning with Experience Rehearsal","summary":" Artificial neural networks, especially recent diffusion-based models, have\nshown remarkable superiority in gaming, control, and QA systems, where the\ntraining tasks' datasets are usually static. However, in real-world\napplications, such as robotic control of reinforcement learning (RL), the tasks\nare changing, and new tasks arise in a sequential order. This situation poses\nthe new challenge of plasticity-stability trade-off for training an agent who\ncan adapt to task changes and retain acquired knowledge. In view of this, we\npropose a rehearsal-based continual diffusion model, called Continual Diffuser\n(CoD), to endow the diffuser with the capabilities of quick adaptation\n(plasticity) and lasting retention (stability). Specifically, we first\nconstruct an offline benchmark that contains 90 tasks from multiple domains.\nThen, we train the CoD on each task with sequential modeling and conditional\ngeneration for making decisions. Next, we preserve a small portion of previous\ndatasets as the rehearsal buffer and replay it to retain the acquired\nknowledge. Extensive experiments on a series of tasks show CoD can achieve a\npromising plasticity-stability trade-off and outperform existing\ndiffusion-based methods and other representative baselines on most tasks.\n","authors":["Jifeng Hu","Li Shen","Sili Huang","Zhejian Yang","Hechang Chen","Lichao Sun","Yi Chang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2409.02512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10093v2","updated":"2024-09-04T08:20:40Z","published":"2024-06-14T14:49:12Z","title":"BiKC: Keypose-Conditioned Consistency Policy for Bimanual Robotic\n Manipulation","summary":" Bimanual manipulation tasks typically involve multiple stages which require\nefficient interactions between two arms, posing step-wise and stage-wise\nchallenges for imitation learning systems. Specifically, failure and delay of\none step will broadcast through time, hinder success and efficiency of each\nsub-stage task, and thereby overall task performance. Although recent works\nhave made strides in addressing certain challenges, few approaches explicitly\nconsider the multi-stage nature of bimanual tasks while simultaneously\nemphasizing the importance of inference speed. In this paper, we introduce a\nnovel keypose-conditioned consistency policy tailored for bimanual\nmanipulation. It is a hierarchical imitation learning framework that consists\nof a high-level keypose predictor and a low-level trajectory generator. The\npredicted keyposes provide guidance for trajectory generation and also mark the\ncompletion of one sub-stage task. The trajectory generator is designed as a\nconsistency model trained from scratch without distillation, which generates\naction sequences conditioning on current observations and predicted keyposes\nwith fast inference speed. 
Simulated and real-world experimental results\ndemonstrate that the proposed approach surpasses baseline methods in terms of\nsuccess rate and operational efficiency. Codes are available at\nhttps://github.com/ManUtdMoon/BiKC.\n","authors":["Dongjie Yu","Hang Xu","Yizhou Chen","Yi Ren","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2406.10093v2.pdf","comment":"Accepted by The 16th International Workshop on the Algorithmic\n Foundations of Robotics (WAFR 2024)"},{"id":"http://arxiv.org/abs/2405.11449v3","updated":"2024-09-04T08:03:27Z","published":"2024-05-19T04:58:53Z","title":"NetMamba: Efficient Network Traffic Classification via Pre-training\n Unidirectional Mamba","summary":" Network traffic classification is a crucial research area aiming to enhance\nservice quality, streamline network management, and bolster cybersecurity. To\naddress the growing complexity of transmission encryption techniques, various\nmachine learning and deep learning methods have been proposed. However,\nexisting approaches face two main challenges. Firstly, they struggle with model\ninefficiency due to the quadratic complexity of the widely used Transformer\narchitecture. Secondly, they suffer from inadequate traffic representation\nbecause of discarding important byte information while retaining unwanted\nbiases. To address these challenges, we propose NetMamba, an efficient\nlinear-time state space model equipped with a comprehensive traffic\nrepresentation scheme. We adopt a specially selected and improved\nunidirectional Mamba architecture for the networking field, instead of the\nTransformer, to address efficiency issues. In addition, we design a traffic\nrepresentation scheme to extract valid information from massive traffic data\nwhile removing biased information. Evaluation experiments on six public\ndatasets encompassing three main classification tasks showcase NetMamba's\nsuperior classification performance compared to state-of-the-art baselines. It\nachieves an accuracy rate of nearly 99% (some over 99%) in all tasks.\nAdditionally, NetMamba demonstrates excellent efficiency, improving inference\nspeed by up to 60 times while maintaining comparably low memory usage.\nFurthermore, NetMamba exhibits superior few-shot learning abilities, achieving\nbetter classification performance with fewer labeled data. To the best of our\nknowledge, NetMamba is the first model to tailor the Mamba architecture for\nnetworking.\n","authors":["Tongze Wang","Xiaohui Xie","Wenduo Wang","Chuyi Wang","Youjian Zhao","Yong Cui"],"pdf_url":"https://arxiv.org/pdf/2405.11449v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02495v1","updated":"2024-09-04T07:46:28Z","published":"2024-09-04T07:46:28Z","title":"CoAst: Validation-Free Contribution Assessment for Federated Learning\n based on Cross-Round Valuation","summary":" In the federated learning (FL) process, since the data held by each\nparticipant is different, it is necessary to figure out which participant has a\nhigher contribution to the model performance. Effective contribution assessment\ncan help motivate data owners to participate in the FL training. Research works\nin this field can be divided into two directions based on whether a validation\ndataset is required. Validation-based methods need to use representative\nvalidation data to measure the model accuracy, which is difficult to obtain in\npractical FL scenarios. 
Existing validation-free methods assess the\ncontribution based on the parameters and gradients of local models and the\nglobal model in a single training round, which is easily compromised by the\nstochasticity of model training. In this work, we propose CoAst, a practical\nmethod to assess the FL participants' contribution without access to any\nvalidation data. The core idea of CoAst involves two aspects: one is to only\ncount the most important part of model parameters through a weights\nquantization, and the other is a cross-round valuation based on the similarity\nbetween the current local parameters and the global parameter updates in\nseveral subsequent communication rounds. Extensive experiments show that CoAst\nhas comparable assessment reliability to existing validation-based methods and\noutperforms existing validation-free methods.\n","authors":["Hao Wu","Likun Zhang","Shucheng Li","Fengyuan Xu","Sheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.02495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02492v1","updated":"2024-09-04T07:35:12Z","published":"2024-09-04T07:35:12Z","title":"Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of\n Data-Driven Optimization Routine","summary":" Diffusion tensor imaging (DTI) holds significant importance in clinical\ndiagnosis and neuroscience research. However, conventional model-based fitting\nmethods often suffer from sensitivity to noise, leading to decreased accuracy\nin estimating DTI parameters. While traditional data-driven deep learning\nmethods have shown potential in terms of accuracy and efficiency, their limited\ngeneralization to out-of-training-distribution data impedes their broader\napplication due to the diverse scan protocols used across centers, scanners,\nand studies. This work aims to tackle these challenges and promote the use of\nDTI by introducing a data-driven optimization-based method termed DoDTI. DoDTI\ncombines the weighted linear least squares fitting algorithm and regularization\nby denoising technique. The former fits DW images from diverse acquisition\nsettings into diffusion tensor field, while the latter applies a deep\nlearning-based denoiser to regularize the diffusion tensor field instead of the\nDW images, which is free from the limitation of fixed-channel assignment of the\nnetwork. The optimization object is solved using the alternating direction\nmethod of multipliers and then unrolled to construct a deep neural network,\nleveraging a data-driven strategy to learn network parameters. Extensive\nvalidation experiments are conducted utilizing both internally simulated\ndatasets and externally obtained in-vivo datasets. The results, encompassing\nboth qualitative and quantitative analyses, showcase that the proposed method\nattains state-of-the-art performance in DTI parameter estimation. 
Notably, it\ndemonstrates superior generalization, accuracy, and efficiency, rendering it\nhighly reliable for widespread application in the field.\n","authors":["Jialong Li","Zhicheng Zhang","Yunwei Chen","Qiqi Lu","Ye Wu","Xiaoming Liu","QianJin Feng","Yanqiu Feng","Xinyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11293v2","updated":"2024-09-04T07:27:39Z","published":"2023-11-19T10:43:43Z","title":"From Categories to Classifiers: Name-Only Continual Learning by\n Exploring the Web","summary":" Continual Learning (CL) often relies on the availability of extensive\nannotated datasets, an assumption that is unrealistically time-consuming and\ncostly in practice. We explore a novel paradigm termed name-only continual\nlearning where time and cost constraints prohibit manual annotation. In this\nscenario, learners adapt to new category shifts using only category names\nwithout the luxury of annotated training data. Our proposed solution leverages\nthe expansive and ever-evolving internet to query and download uncurated\nwebly-supervised data for image classification. We investigate the reliability\nof our web data and find them comparable, and in some cases superior, to\nmanually annotated datasets. Additionally, we show that by harnessing the web,\nwe can create support sets that surpass state-of-the-art name-only\nclassification that create support sets using generative models or image\nretrieval from LAION-5B, achieving up to 25% boost in accuracy. When applied\nacross varied continual learning contexts, our method consistently exhibits a\nsmall performance gap in comparison to models trained on manually annotated\ndatasets. We present EvoTrends, a class-incremental dataset made from the web\nto capture real-world trends, created in just minutes. Overall, this paper\nunderscores the potential of using uncurated webly-supervised data to mitigate\nthe challenges associated with manual data labeling in continual learning.\n","authors":["Ameya Prabhu","Hasan Abed Al Kader Hammoud","Ser-Nam Lim","Bernard Ghanem","Philip H. S. Torr","Adel Bibi"],"pdf_url":"https://arxiv.org/pdf/2311.11293v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02485v1","updated":"2024-09-04T07:23:12Z","published":"2024-09-04T07:23:12Z","title":"Adversarial Attacks on Machine Learning-Aided Visualizations","summary":" Research in ML4VIS investigates how to use machine learning (ML) techniques\nto generate visualizations, and the field is rapidly growing with high societal\nimpact. However, as with any computational pipeline that employs ML processes,\nML4VIS approaches are susceptible to a range of ML-specific adversarial\nattacks. These attacks can manipulate visualization generations, causing\nanalysts to be tricked and their judgments to be impaired. Due to a lack of\nsynthesis from both visualization and ML perspectives, this security aspect is\nlargely overlooked by the current ML4VIS literature. To bridge this gap, we\ninvestigate the potential vulnerabilities of ML-aided visualizations from\nadversarial attacks using a holistic lens of both visualization and ML\nperspectives. We first identify the attack surface (i.e., attack entry points)\nthat is unique in ML-aided visualizations. We then exemplify five different\nadversarial attacks. These examples highlight the range of possible attacks\nwhen considering the attack surface and multiple different adversary\ncapabilities. 
Our results show that adversaries can induce various attacks,\nsuch as creating arbitrary and deceptive visualizations, by systematically\nidentifying input attributes that are influential in ML inferences. Based on\nour observations of the attack surface characteristics and the attack examples,\nwe underline the importance of comprehensive studies of security issues and\ndefense mechanisms as a call of urgency for the ML4VIS community.\n","authors":["Takanori Fujiwara","Kostiantyn Kucher","Junpeng Wang","Rafael M. Martins","Andreas Kerren","Anders Ynnerman"],"pdf_url":"https://arxiv.org/pdf/2409.02485v1.pdf","comment":"This is the author's version of the article that has been accepted by\n the Journal of Visualization"},{"id":"http://arxiv.org/abs/2409.02482v1","updated":"2024-09-04T07:18:26Z","published":"2024-09-04T07:18:26Z","title":"Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes","summary":" High-quality real-time view synthesis methods are based on volume rendering,\nsplatting, or surface rendering. While surface-based methods generally are the\nfastest, they cannot faithfully model fuzzy geometry like hair. In turn,\nalpha-blending techniques excel at representing fuzzy materials but require an\nunbounded number of samples per ray (P1). Further overheads are induced by\nempty space skipping in volume rendering (P2) and sorting input primitives in\nsplatting (P3). These problems are exacerbated on low-performance graphics\nhardware, e.g. on mobile devices. We present a novel representation for\nreal-time view synthesis where the (P1) number of sampling locations is small\nand bounded, (P2) sampling locations are efficiently found via rasterization,\nand (P3) rendering is sorting-free. We achieve this by representing objects as\nsemi-transparent multi-layer meshes, rendered in fixed layer order from\noutermost to innermost. We model mesh layers as SDF shells with optimal spacing\nlearned during training. After baking, we fit UV textures to the corresponding\nmeshes. We show that our method can represent challenging fuzzy objects while\nachieving higher frame rates than volume-based and splatting-based methods on\nlow-end and mobile devices.\n","authors":["Stefano Esposito","Anpei Chen","Christian Reiser","Samuel Rota Bulò","Lorenzo Porzi","Katja Schwarz","Christian Richardt","Michael Zollhöfer","Peter Kontschieder","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2409.02482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02471v1","updated":"2024-09-04T06:43:17Z","published":"2024-09-04T06:43:17Z","title":"Demographic parity in regression and classification within the\n unawareness framework","summary":" This paper explores the theoretical foundations of fair regression under the\nconstraint of demographic parity within the unawareness framework, where\ndisparate treatment is prohibited, extending existing results where such\ntreatment is permitted. Specifically, we aim to characterize the optimal fair\nregression function when minimizing the quadratic loss. Our results reveal that\nthis function is given by the solution to a barycenter problem with optimal\ntransport costs. Additionally, we study the connection between optimal fair\ncost-sensitive classification, and optimal fair regression. We demonstrate that\nnestedness of the decision sets of the classifiers is both necessary and\nsufficient to establish a form of equivalence between classification and\nregression. 
Under this nestedness assumption, the optimal classifiers can be\nderived by applying thresholds to the optimal fair regression function;\nconversely, the optimal fair regression function is characterized by the family\nof cost-sensitive classifiers.\n","authors":["Vincent Divol","Solenne Gaucher"],"pdf_url":"https://arxiv.org/pdf/2409.02471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11008v2","updated":"2024-09-04T06:41:37Z","published":"2024-05-17T11:09:33Z","title":"A Systematic Review on Sleep Stage Classification and Sleep Disorder\n Detection Using Artificial Intelligence","summary":" Sleep is vital for people's physical and mental health, and sound sleep can\nhelp them focus on daily activities. Therefore, a sleep study that includes\nsleep patterns and sleep disorders is crucial to enhancing our knowledge about\nindividuals' health status. This study aims to provide a comprehensive,\nsystematic review of the recent literature to analyze the different approaches\nand their outcomes in sleep studies, which includes works on \"sleep stages\nclassification\" and \"sleep disorder detection\" using AI. In this review, 183\narticles were initially selected from different journals, among which 80\nrecords were enlisted for explicit review, ranging from 2016 to 2023. Brain\nwaves were the most commonly employed body parameters for sleep staging and\ndisorder studies (almost 29% of the research used brain activity signals\nexclusively, and 77% combined with the other signals). The convolutional neural\nnetwork (CNN), the most widely used of the 34 distinct artificial intelligence\nmodels, comprised 27%. The other models included the long short-term memory\n(LSTM), support vector machine (SVM), random forest (RF), and recurrent neural\nnetwork (RNN), which consisted of 11%, 6%, 6%, and 5% sequentially. For\nperformance metrics, accuracy was widely used for a maximum of 83.75% of the\ncases, the F1 score of 45%, Kappa of 36.25%, Sensitivity of 31.25%, and\nSpecificity of 30% of cases, along with the other metrics. This article would\nhelp physicians and researchers get the gist of AI's contribution to sleep\nstudies and the feasibility of their intended work.\n","authors":["Tayab Uddin Wara","Ababil Hossain Fahad","Adri Shankar Das","Md. Mehedi Hasan Shawon"],"pdf_url":"https://arxiv.org/pdf/2405.11008v2.pdf","comment":"39 pages, 11 Figures, 8 Tables"},{"id":"http://arxiv.org/abs/2404.10155v3","updated":"2024-09-04T06:10:47Z","published":"2024-04-15T22:02:58Z","title":"The Fault in our Stars: Quality Assessment of Code Generation Benchmarks","summary":" Large Language Models (LLMs) are gaining popularity among software engineers.\nA crucial aspect of developing effective code generation LLMs is to evaluate\nthese models using a robust benchmark. Evaluation benchmarks with quality\nissues can provide a false sense of performance. In this work, we conduct the\nfirst-of-its-kind study of the quality of prompts within benchmarks used to\ncompare the performance of different code generation models. To conduct this\nstudy, we analyzed 3,566 prompts from 9 code generation benchmarks to identify\nquality issues in them. We also investigated whether fixing the identified\nquality issues in the benchmarks' prompts affects a model's performance. We\nalso studied memorization issues of the evaluation dataset, which can put into\nquestion a benchmark's trustworthiness. 
We found that code generation\nevaluation benchmarks mainly focused on Python and coding exercises and had\nvery limited contextual dependencies to challenge the model. These datasets and\nthe developers' prompts suffer from quality issues like spelling and\ngrammatical errors, unclear sentences to express developers' intent, and not\nusing proper documentation style. Fixing all these issues in the benchmarks can\nlead to a better performance for Python code generation, but not a significant\nimprovement was observed for Java code generation. We also found evidence that\nGPT-3.5-Turbo and CodeGen-2.5 models may have data contamination issues.\n","authors":["Mohammed Latif Siddiq","Simantika Dristi","Joy Saha","Joanna C. S. Santos"],"pdf_url":"https://arxiv.org/pdf/2404.10155v3.pdf","comment":"Accepted at the 24th IEEE International Conference on Source Code\n Analysis and Manipulation(SCAM 2024) Research Track"},{"id":"http://arxiv.org/abs/2305.18420v2","updated":"2024-09-04T05:03:06Z","published":"2023-05-28T19:40:46Z","title":"Sample Complexity of Variance-reduced Distributionally Robust Q-learning","summary":" Dynamic decision-making under distributional shifts is of fundamental\ninterest in theory and applications of reinforcement learning: The distribution\nof the environment in which the data is collected can differ from that of the\nenvironment in which the model is deployed. This paper presents two novel\nmodel-free algorithms, namely the distributionally robust Q-learning and its\nvariance-reduced counterpart, that can effectively learn a robust policy\ndespite distributional shifts. These algorithms are designed to efficiently\napproximate the $q$-function of an infinite-horizon $\\gamma$-discounted robust\nMarkov decision process with Kullback-Leibler ambiguity set to an entry-wise\n$\\epsilon$-degree of precision. Further, the variance-reduced distributionally\nrobust Q-learning combines the synchronous Q-learning with variance-reduction\ntechniques to enhance its performance. Consequently, we establish that it\nattains a minimax sample complexity upper bound of $\\tilde\nO(|\\mathbf{S}||\\mathbf{A}|(1-\\gamma)^{-4}\\epsilon^{-2})$, where $\\mathbf{S}$\nand $\\mathbf{A}$ denote the state and action spaces. This is the first\ncomplexity result that is independent of the ambiguity size $\\delta$, thereby\nproviding new complexity theoretic insights. Additionally, a series of\nnumerical experiments confirm the theoretical findings and the efficiency of\nthe algorithms in handling distributional shifts.\n","authors":["Shengbo Wang","Nian Si","Jose Blanchet","Zhengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.18420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02446v1","updated":"2024-09-04T04:56:41Z","published":"2024-09-04T04:56:41Z","title":"ForeCal: Random Forest-based Calibration for DNNs","summary":" Deep neural network(DNN) based classifiers do extremely well in\ndiscriminating between observations, resulting in higher ROC AUC and accuracy\nmetrics, but their outputs are often miscalibrated with respect to true event\nlikelihoods. Post-hoc calibration algorithms are often used to calibrate the\noutputs of these classifiers. Methods like Isotonic regression, Platt scaling,\nand Temperature scaling have been shown to be effective in some cases but are\nlimited by their parametric assumptions and/or their inability to capture\ncomplex non-linear relationships. We propose ForeCal - a novel post-hoc\ncalibration algorithm based on Random forests. 
ForeCal exploits two unique\nproperties of Random forests: the ability to enforce weak monotonicity and\nrange-preservation. It is more powerful in achieving calibration than current\nstate-of-the-art methods, is non-parametric, and can incorporate exogenous\ninformation as features to learn a better calibration function. Through\nexperiments on 43 diverse datasets from the UCI ML repository, we show that\nForeCal outperforms existing methods in terms of Expected Calibration\nError(ECE) with minimal impact on the discriminative power of the base DNN as\nmeasured by AUC.\n","authors":["Dhruv Nigam"],"pdf_url":"https://arxiv.org/pdf/2409.02446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02431v1","updated":"2024-09-04T04:18:25Z","published":"2024-09-04T04:18:25Z","title":"Adversarial Learning for Neural PDE Solvers with Sparse Data","summary":" Neural network solvers for partial differential equations (PDEs) have made\nsignificant progress, yet they continue to face challenges related to data\nscarcity and model robustness. Traditional data augmentation methods, which\nleverage symmetry or invariance, impose strong assumptions on physical systems\nthat often do not hold in dynamic and complex real-world applications. To\naddress this research gap, this study introduces a universal learning strategy\nfor neural network PDEs, named Systematic Model Augmentation for Robust\nTraining (SMART). By focusing on challenging and improving the model's\nweaknesses, SMART reduces generalization error during training under\ndata-scarce conditions, leading to significant improvements in prediction\naccuracy across various PDE scenarios. The effectiveness of the proposed method\nis demonstrated through both theoretical analysis and extensive\nexperimentation. The code will be available.\n","authors":["Yunpeng Gong","Yongjie Hou","Zhenzhong Wang","Zexin Lin","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.02431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02430v1","updated":"2024-09-04T04:17:57Z","published":"2024-09-04T04:17:57Z","title":"Transfer-based Adversarial Poisoning Attacks for Online (MIMO-)Deep\n Receviers","summary":" Recently, the design of wireless receivers using deep neural networks (DNNs),\nknown as deep receivers, has attracted extensive attention for ensuring\nreliable communication in complex channel environments. To adapt quickly to\ndynamic channels, online learning has been adopted to update the weights of\ndeep receivers with over-the-air data (e.g., pilots). However, the fragility of\nneural models and the openness of wireless channels expose these systems to\nmalicious attacks. To this end, understanding these attack methods is essential\nfor robust receiver design.In this paper, we propose a transfer-based\nadversarial poisoning attack method for online receivers.Without knowledge of\nthe attack target, adversarial perturbations are injected to the pilots,\npoisoning the online deep receiver and impairing its ability to adapt to\ndynamic channels and nonlinear effects. In particular, our attack method\ntargets Deep Soft Interference Cancellation (DeepSIC)[1] using online\nmeta-learning.As a classical model-driven deep receiver, DeepSIC incorporates\nwireless domain knowledge into its architecture. 
This integration allows it to\nadapt efficiently to time-varying channels with only a small number of pilots,\nachieving optimal performance in a multi-input and multi-output (MIMO)\nscenario.The deep receiver in this scenario has a number of applications in the\nfield of wireless communication, which motivates our study of the attack\nmethods targeting it.Specifically, we demonstrate the effectiveness of our\nattack in simulations on synthetic linear, synthetic nonlinear, static, and\nCOST 2100 channels. Simulation results indicate that the proposed poisoning\nattack significantly reduces the performance of online receivers in rapidly\nchanging scenarios.\n","authors":["Kunze Wu","Weiheng Jiang","Dusit Niyato","Yinghuan Li","Chuang Luo"],"pdf_url":"https://arxiv.org/pdf/2409.02430v1.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.02428v1","updated":"2024-09-04T04:15:14Z","published":"2024-09-04T04:15:14Z","title":"Large Language Models as Efficient Reward Function Searchers for\n Custom-Environment Multi-Objective Reinforcement Learning","summary":" Leveraging large language models (LLMs) for designing reward functions\ndemonstrates significant potential. However, achieving effective design and\nimprovement of reward functions in reinforcement learning (RL) tasks with\ncomplex custom environments and multiple requirements presents considerable\nchallenges. In this paper, we enable LLMs to be effective white-box searchers,\nhighlighting their advanced semantic understanding capabilities. Specifically,\nwe generate reward components for each explicit user requirement and employ the\nreward critic to identify the correct code form. Then, LLMs assign weights to\nthe reward components to balance their values and iteratively search and\noptimize these weights based on the context provided by the training log\nanalyzer, while adaptively determining the search step size. We applied the\nframework to an underwater information collection RL task without direct human\nfeedback or reward examples (zero-shot). The reward critic successfully correct\nthe reward code with only one feedback for each requirement, effectively\npreventing irreparable errors that can occur when reward function feedback is\nprovided in aggregate. The effective initialization of weights enables the\nacquisition of different reward functions within the Pareto solution set\nwithout weight search. Even in the case where a weight is 100 times off, fewer\nthan four iterations are needed to obtain solutions that meet user\nrequirements. The framework also works well with most prompts utilizing GPT-3.5\nTurbo, since it does not require advanced numerical understanding or\ncalculation.\n","authors":["Guanwen Xie","Jingzehua Xu","Yiyuan Yang","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02426v1","updated":"2024-09-04T04:14:02Z","published":"2024-09-04T04:14:02Z","title":"Diffusion Models Learn Low-Dimensional Distributions via Subspace\n Clustering","summary":" Recent empirical studies have demonstrated that diffusion models can\neffectively learn the image distribution and generate new samples. Remarkably,\nthese models can achieve this even with a small number of training samples\ndespite a large image dimension, circumventing the curse of dimensionality. 
In\nthis work, we provide theoretical insights into this phenomenon by leveraging\nkey empirical observations: (i) the low intrinsic dimensionality of image data,\n(ii) a union of manifold structure of image data, and (iii) the low-rank\nproperty of the denoising autoencoder in trained diffusion models. These\nobservations motivate us to assume the underlying data distribution of image\ndata as a mixture of low-rank Gaussians and to parameterize the denoising\nautoencoder as a low-rank model according to the score function of the assumed\ndistribution. With these setups, we rigorously show that optimizing the\ntraining loss of diffusion models is equivalent to solving the canonical\nsubspace clustering problem over the training samples. Based on this\nequivalence, we further show that the minimal number of samples required to\nlearn the underlying distribution scales linearly with the intrinsic dimensions\nunder the above data and model assumptions. This insight sheds light on why\ndiffusion models can break the curse of dimensionality and exhibit the phase\ntransition in learning distributions. Moreover, we empirically establish a\ncorrespondence between the subspaces and the semantic representations of image\ndata, facilitating image editing. We validate these results with corroborated\nexperimental results on both simulated distributions and image datasets.\n","authors":["Peng Wang","Huijie Zhang","Zekai Zhang","Siyi Chen","Yi Ma","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02426v1.pdf","comment":"39 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.02425v1","updated":"2024-09-04T04:12:22Z","published":"2024-09-04T04:12:22Z","title":"Deep Adaptive Interest Network: Personalized Recommendation with\n Context-Aware Learning","summary":" In personalized recommendation systems, accurately capturing users' evolving\ninterests and combining them with contextual information is a critical research\narea. This paper proposes a novel model called the Deep Adaptive Interest\nNetwork (DAIN), which dynamically models users' interests while incorporating\ncontext-aware learning mechanisms to achieve precise and adaptive personalized\nrecommendations. DAIN leverages deep learning techniques to build an adaptive\ninterest network structure that can capture users' interest changes in\nreal-time while further optimizing recommendation results by integrating\ncontextual information. Experiments conducted on several public datasets\ndemonstrate that DAIN excels in both recommendation performance and\ncomputational efficiency. This research not only provides a new solution for\npersonalized recommendation systems but also offers fresh insights into the\napplication of context-aware learning in recommendation systems.\n","authors":["Shuaishuai Huang","Haowei Yang","You Yao","Xueting Lin","Yuming Tu"],"pdf_url":"https://arxiv.org/pdf/2409.02425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08627v3","updated":"2024-09-04T03:53:59Z","published":"2023-12-08T04:27:11Z","title":"Predicting and Interpreting Energy Barriers of Metallic Glasses with\n Graph Neural Networks","summary":" Metallic Glasses (MGs) are widely used materials that are stronger than steel\nwhile being shapeable as plastic. While understanding the structure-property\nrelationship of MGs remains a challenge in materials science, studying their\nenergy barriers (EBs) as an intermediary step shows promise. In this work, we\nutilize Graph Neural Networks (GNNs) to model MGs and study EBs. 
We contribute\na new dataset for EB prediction and a novel Symmetrized GNN (SymGNN) model that\nis E(3)-invariant in expectation. SymGNN handles invariance by aggregating over\northogonal transformations of the graph structure. When applied to EB\nprediction, SymGNN are more accurate than molecular dynamics (MD)\nlocal-sampling methods and other machine-learning models. Compared to precise\nMD simulations, SymGNN reduces the inference time on new MGs from roughly 41\ndays to less than one second. We apply explanation algorithms to reveal the\nrelationship between structures and EBs. The structures that we identify\nthrough explanations match the medium-range order (MRO) hypothesis and possess\nunique topological properties. Our work enables effective prediction and\ninterpretation of MG EBs, bolstering material science research.\n","authors":["Haoyu Li","Shichang Zhang","Longwen Tang","Mathieu Bauchy","Yizhou Sun"],"pdf_url":"https://arxiv.org/pdf/2401.08627v3.pdf","comment":"ICML 2024. Code available at https://github.com/haoyuli02/SymGNN"},{"id":"http://arxiv.org/abs/2409.02416v1","updated":"2024-09-04T03:41:44Z","published":"2024-09-04T03:41:44Z","title":"Relative-Translation Invariant Wasserstein Distance","summary":" We introduce a new family of distances, relative-translation invariant\nWasserstein distances ($RW_p$), for measuring the similarity of two probability\ndistributions under distribution shift. Generalizing it from the classical\noptimal transport model, we show that $RW_p$ distances are also real distance\nmetrics defined on the quotient set $\\mathcal{P}_p(\\mathbb{R}^n)/\\sim$ and\ninvariant to distribution translations. When $p=2$, the $RW_2$ distance enjoys\nmore exciting properties, including decomposability of the optimal transport\nmodel, translation-invariance of the $RW_2$ distance, and a Pythagorean\nrelationship between $RW_2$ and the classical quadratic Wasserstein distance\n($W_2$). Based on these properties, we show that a distribution shift, measured\nby $W_2$ distance, can be explained in the bias-variance perspective. In\naddition, we propose a variant of the Sinkhorn algorithm, named $RW_2$ Sinkhorn\nalgorithm, for efficiently calculating $RW_2$ distance, coupling solutions, as\nwell as $W_2$ distance. We also provide the analysis of numerical stability and\ntime complexity for the proposed algorithm. Finally, we validate the $RW_2$\ndistance metric and the algorithm performance with three experiments. We\nconduct one numerical validation for the $RW_2$ Sinkhorn algorithm and show two\nreal-world applications demonstrating the effectiveness of using $RW_2$ under\ndistribution shift: digits recognition and similar thunderstorm detection. 
The\nexperimental results report that our proposed algorithm significantly improves\nthe computational efficiency of Sinkhorn in certain practical applications, and\nthe $RW_2$ distance is robust to distribution translations compared with\nbaselines.\n","authors":["Binshuai Wang","Qiwei Di","Ming Yin","Mengdi Wang","Quanquan Gu","Peng Wei"],"pdf_url":"https://arxiv.org/pdf/2409.02416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02413v1","updated":"2024-09-04T03:39:23Z","published":"2024-09-04T03:39:23Z","title":"Abstractive Text Summarization: State of the Art, Challenges, and\n Improvements","summary":" Specifically focusing on the landscape of abstractive text summarization, as\nopposed to extractive techniques, this survey presents a comprehensive\noverview, delving into state-of-the-art techniques, prevailing challenges, and\nprospective research directions. We categorize the techniques into traditional\nsequence-to-sequence models, pre-trained large language models, reinforcement\nlearning, hierarchical methods, and multi-modal summarization. Unlike prior\nworks that did not examine complexities, scalability and comparisons of\ntechniques in detail, this review takes a comprehensive approach encompassing\nstate-of-the-art methods, challenges, solutions, comparisons, limitations and\ncharts out future improvements - providing researchers an extensive overview to\nadvance abstractive summarization research. We provide vital comparison tables\nacross techniques categorized - offering insights into model complexity,\nscalability and appropriate applications. The paper highlights challenges such\nas inadequate meaning representation, factual consistency, controllable text\nsummarization, cross-lingual summarization, and evaluation metrics, among\nothers. Solutions leveraging knowledge incorporation and other innovative\nstrategies are proposed to address these challenges. The paper concludes by\nhighlighting emerging research areas like factual inconsistency,\ndomain-specific, cross-lingual, multilingual, and long-document summarization,\nas well as handling noisy data. Our objective is to provide researchers and\npractitioners with a structured overview of the domain, enabling them to better\nunderstand the current landscape and identify potential areas for further\nresearch and improvement.\n","authors":["Hassan Shakil","Ahmad Farooq","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2409.02413v1.pdf","comment":"9 Tables, 7 Figures"},{"id":"http://arxiv.org/abs/2104.12678v6","updated":"2024-09-04T03:27:11Z","published":"2021-04-26T16:11:47Z","title":"Semi-Decentralized Federated Edge Learning for Fast Convergence on\n Non-IID Data","summary":" Federated edge learning (FEEL) has emerged as an effective approach to reduce\nthe large communication latency in Cloud-based machine learning solutions,\nwhile preserving data privacy. Unfortunately, the learning performance of FEEL\nmay be compromised due to limited training data in a single edge cluster. In\nthis paper, we investigate a novel framework of FEEL, namely semi-decentralized\nfederated edge learning (SD-FEEL). By allowing model aggregation across\ndifferent edge clusters, SD-FEEL enjoys the benefit of FEEL in reducing the\ntraining latency, while improving the learning performance by accessing richer\ntraining data from multiple edge clusters. 
A training algorithm for SD-FEEL\nwith three main procedures in each round is presented, including local model\nupdates, intra-cluster and inter-cluster model aggregations, which is proved to\nconverge on non-independent and identically distributed (non-IID) data. We also\ncharacterize the interplay between the network topology of the edge servers and\nthe communication overhead of inter-cluster model aggregation on the training\nperformance. Experiment results corroborate our analysis and demonstrate the\neffectiveness of SD-FEEL in achieving faster convergence than traditional\nfederated learning architectures. Besides, guidelines on choosing critical\nhyper-parameters of the training algorithm are also provided.\n","authors":["Yuchang Sun","Jiawei Shao","Yuyi Mao","Jessie Hui Wang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2104.12678v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00908v2","updated":"2024-09-04T03:26:58Z","published":"2024-09-02T02:40:42Z","title":"EnsLoss: Stochastic Calibrated Loss Ensembles for Preventing Overfitting\n in Classification","summary":" Empirical risk minimization (ERM) with a computationally feasible surrogate\nloss is a widely accepted approach for classification. Notably, the convexity\nand calibration (CC) properties of a loss function ensure consistency of ERM in\nmaximizing accuracy, thereby offering a wide range of options for surrogate\nlosses. In this article, we propose a novel ensemble method, namely EnsLoss,\nwhich extends the ensemble learning concept to combine loss functions within\nthe ERM framework. A key feature of our method is the consideration of\npreserving the \"legitimacy\" of the combined losses, i.e., ensuring the CC\nproperties. Specifically, we first transform the CC conditions of losses into\nloss-derivatives, thereby bypassing the need for explicit loss functions and\ndirectly generating calibrated loss-derivatives. Therefore, inspired by\nDropout, EnsLoss enables loss ensembles through one training process with\ndoubly stochastic gradient descent (i.e., random batch samples and random\ncalibrated loss-derivatives). We theoretically establish the statistical\nconsistency of our approach and provide insights into its benefits. The\nnumerical effectiveness of EnsLoss compared to fixed loss methods is\ndemonstrated through experiments on a broad range of 14 OpenML tabular datasets\nand 46 image datasets with various deep learning architectures. Python\nrepository and source code are available on GitHub at\nhttps://github.com/statmlben/ensloss.\n","authors":["Ben Dai"],"pdf_url":"https://arxiv.org/pdf/2409.00908v2.pdf","comment":"31 pages; 4 figures"},{"id":"http://arxiv.org/abs/2408.08998v2","updated":"2024-09-04T03:26:09Z","published":"2024-08-16T20:00:08Z","title":"A Confidence Interval for the $\\ell_2$ Expected Calibration Error","summary":" Recent advances in machine learning have significantly improved prediction\naccuracy in various applications. However, ensuring the calibration of\nprobabilistic predictions remains a significant challenge. Despite efforts to\nenhance model calibration, the rigorous statistical evaluation of model\ncalibration remains less explored. In this work, we develop confidence\nintervals for the $\\ell_2$ Expected Calibration Error (ECE). We consider\ntop-1-to-$k$ calibration, which includes both the popular notion of confidence\ncalibration as well as full calibration.
For a debiased estimator of the ECE,\nwe show asymptotic normality, but with different convergence rates and\nasymptotic variances for calibrated and miscalibrated models. We develop\nmethods to construct asymptotically valid confidence intervals for the ECE,\naccounting for this behavior as well as non-negativity. Our theoretical\nfindings are supported through extensive experiments, showing that our methods\nproduce valid confidence intervals with shorter lengths compared to those\nobtained by resampling-based methods.\n","authors":["Yan Sun","Pratik Chaudhari","Ian J. Barnett","Edgar Dobriban"],"pdf_url":"https://arxiv.org/pdf/2408.08998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02410v1","updated":"2024-09-04T03:25:48Z","published":"2024-09-04T03:25:48Z","title":"Adaptive Class Emergence Training: Enhancing Neural Network Stability\n and Generalization through Progressive Target Evolution","summary":" Recent advancements in artificial intelligence, particularly deep neural\nnetworks, have pushed the boundaries of what is achievable in complex tasks.\nTraditional methods for training neural networks in classification problems\noften rely on static target outputs, such as one-hot encoded vectors, which can\nlead to unstable optimization and difficulties in handling non-linearities\nwithin data. In this paper, we propose a novel training methodology that\nprogressively evolves the target outputs from a null vector to one-hot encoded\nvectors throughout the training process. This gradual transition allows the\nnetwork to adapt more smoothly to the increasing complexity of the\nclassification task, maintaining an equilibrium state that reduces the risk of\noverfitting and enhances generalization. Our approach, inspired by concepts\nfrom structural equilibrium in finite element analysis, has been validated\nthrough extensive experiments on both synthetic and real-world datasets. The\nresults demonstrate that our method achieves faster convergence, improved\naccuracy, and better generalization, especially in scenarios with high data\ncomplexity and noise. This progressive training framework offers a robust\nalternative to classical methods, opening new perspectives for more efficient\nand stable neural network training.\n","authors":["Jaouad Dabounou"],"pdf_url":"https://arxiv.org/pdf/2409.02410v1.pdf","comment":"15 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.00025v2","updated":"2024-09-04T03:11:02Z","published":"2024-08-16T19:22:02Z","title":"A Novel Approach to Classify Power Quality Signals Using Vision\n Transformers","summary":" With the rapid integration of electronically interfaced renewable energy\nresources and loads into smart grids, there is increasing interest in power\nquality disturbances (PQD) classification to enhance the security and\nefficiency of these grids. This paper introduces a new approach to PQD\nclassification based on the Vision Transformer (ViT) model. When a PQD occurs,\nthe proposed approach first converts the power quality signal into an image and\nthen utilizes a pre-trained ViT to accurately determine the class of the PQD.\nUnlike most previous works, which were limited to a few disturbance classes or\nsmall datasets, the proposed method is trained and tested on a large dataset\nwith 17 disturbance classes. 
Our experimental results show that the proposed\nViT-based approach achieves PQD classification precision and recall of 98.28%\nand 97.98%, respectively, outperforming recently proposed techniques applied to\nthe same dataset.\n","authors":["Ahmad Mohammad Saber","Alaa Selim","Mohamed M. Hammad","Amr Youssef","Deepa Kundur","Ehab El-Saadany"],"pdf_url":"https://arxiv.org/pdf/2409.00025v2.pdf","comment":"IECON 2024-50th Annual Conference of the IEEE Industrial Electronics\n Society, Chicago, U.S.A, 2024, pp. 1-6"},{"id":"http://arxiv.org/abs/2409.02404v1","updated":"2024-09-04T03:06:13Z","published":"2024-09-04T03:06:13Z","title":"Learning Privacy-Preserving Student Networks via\n Discriminative-Generative Distillation","summary":" While deep models have proved successful in learning rich knowledge from\nmassive well-annotated data, they may pose a privacy leakage risk in practical\ndeployment. It is necessary to find an effective trade-off between high utility\nand strong privacy. In this work, we propose a discriminative-generative\ndistillation approach to learn privacy-preserving deep models. Our key idea is\ntaking models as a bridge to distill knowledge from private data and then\ntransfer it to learn a student network via two streams. First, the discriminative\nstream trains a baseline classifier on private data and an ensemble of teachers\non multiple disjoint private subsets, respectively. Then, the generative stream\ntakes the classifier as a fixed discriminator and trains a generator in a\ndata-free manner. After that, the generator is used to generate massive\nsynthetic data which are further applied to train a variational autoencoder\n(VAE). Among these synthetic data, a few of them are fed into the teacher\nensemble to query labels via differentially private aggregation, while most of\nthem are embedded into the trained VAE for reconstructing synthetic data.\nFinally, semi-supervised student learning is performed to simultaneously\nhandle two tasks: knowledge transfer from the teachers with distillation on few\nprivately labeled synthetic data, and knowledge enhancement with tangent-normal\nadversarial regularization on many triples of reconstructed synthetic data. In\nthis way, our approach can control query cost over private data and mitigate\naccuracy degradation in a unified manner, leading to a privacy-preserving\nstudent model. Extensive experiments and analysis clearly show the\neffectiveness of the proposed approach.\n","authors":["Shiming Ge","Bochao Liu","Pengju Wang","Yong Li","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02404v1.pdf","comment":"This paper is accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2406.06479v3","updated":"2024-09-04T02:57:47Z","published":"2024-06-10T17:20:13Z","title":"Graph-Based Bidirectional Transformer Decision Threshold Adjustment\n Algorithm for Class-Imbalanced Molecular Data","summary":" Data sets with imbalanced class sizes, where one class size is much smaller\nthan that of others, occur exceedingly often in many applications, including\nthose with biological foundations, such as disease diagnosis and drug\ndiscovery. Therefore, it is extremely important to be able to identify data\nelements of classes of various sizes, as a failure to do so can result in heavy\ncosts. Nonetheless, many data classification procedures do not perform well on\nimbalanced data sets as they often fail to detect elements belonging to\nunderrepresented classes.
In this work, we propose the BTDT-MBO algorithm,\nincorporating Merriman-Bence-Osher (MBO) approaches and a bidirectional\ntransformer, as well as distance correlation and decision threshold\nadjustments, for data classification tasks on highly imbalanced molecular data\nsets, where the sizes of the classes vary greatly. The proposed technique not\nonly integrates adjustments in the classification threshold for the MBO\nalgorithm in order to help deal with the class imbalance, but also uses a\nbidirectional transformer procedure based on an attention mechanism for\nself-supervised learning. In addition, the model implements distance\ncorrelation as a weight function for the similarity graph-based framework on\nwhich the adjusted MBO algorithm operates. The proposed method is validated\nusing six molecular data sets and compared to other related techniques. The\ncomputational experiments show that the proposed technique is superior to\ncompeting approaches even in the case of a high class imbalance ratio.\n","authors":["Nicole Hayes","Ekaterina Merkurjev","Guo-Wei Wei"],"pdf_url":"https://arxiv.org/pdf/2406.06479v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11783v2","updated":"2024-09-04T02:45:12Z","published":"2023-03-19T08:19:10Z","title":"CCPL: Cross-modal Contrastive Protein Learning","summary":" Effective protein representation learning is crucial for predicting protein\nfunctions. Traditional methods often pretrain protein language models on large,\nunlabeled amino acid sequences, followed by finetuning on labeled data. While\neffective, these methods underutilize the potential of protein structures,\nwhich are vital for function determination. Common structural representation\ntechniques rely heavily on annotated data, limiting their generalizability.\nMoreover, structural pretraining methods, similar to natural language\npretraining, can distort actual protein structures. In this work, we introduce\na novel unsupervised protein structure representation pretraining method,\ncross-modal contrastive protein learning (CCPL). CCPL leverages a robust\nprotein language model and uses unsupervised contrastive alignment to enhance\nstructure learning, incorporating self-supervised structural constraints to\nmaintain intrinsic structural information. We evaluated our model across\nvarious benchmarks, demonstrating the framework's superiority.\n","authors":["Jiangbin Zheng","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2303.11783v2.pdf","comment":"Accepted to ICPR 2024"},{"id":"http://arxiv.org/abs/2409.02392v1","updated":"2024-09-04T02:41:04Z","published":"2024-09-04T02:41:04Z","title":"Building Math Agents with Multi-Turn Iterative Preference Learning","summary":" Recent studies have shown that large language models' (LLMs) mathematical\nproblem-solving capabilities can be enhanced by integrating external tools,\nsuch as code interpreters, and employing multi-turn Chain-of-Thought (CoT)\nreasoning. While current methods focus on synthetic data generation and\nSupervised Fine-Tuning (SFT), this paper studies the complementary direct\npreference learning approach to further improve model performance. However,\nexisting direct preference learning algorithms are originally designed for the\nsingle-turn chat task, and do not fully address the complexities of multi-turn\nreasoning and external tool integration required for tool-integrated\nmathematical reasoning tasks. 
To fill in this gap, we introduce a multi-turn\ndirect preference learning framework, tailored for this context, that leverages\nfeedback from code interpreters and optimizes trajectory-level preferences.\nThis framework includes multi-turn DPO and multi-turn KTO as specific\nimplementations. The effectiveness of our framework is validated through\ntraining of various language models using an augmented prompt set from the\nGSM8K and MATH datasets. Our results demonstrate substantial improvements: a\nsupervised fine-tuned Gemma-1.1-it-7B model's performance increased from 77.5%\nto 83.9% on GSM8K and from 46.1% to 51.2% on MATH. Similarly, a Gemma-2-it-9B\nmodel improved from 84.1% to 86.3% on GSM8K and from 51.0% to 54.5% on MATH.\n","authors":["Wei Xiong","Chengshuai Shi","Jiaming Shen","Aviv Rosenberg","Zhen Qin","Daniele Calandriello","Misha Khalman","Rishabh Joshi","Bilal Piot","Mohammad Saleh","Chi Jin","Tong Zhang","Tianqi Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02392v1.pdf","comment":"A multi-turn direct preference learning framework for tool-integrated\n reasoning tasks"},{"id":"http://arxiv.org/abs/2409.01128v2","updated":"2024-09-04T02:40:52Z","published":"2024-09-02T10:07:24Z","title":"Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in\n Federated Class Continual Learning","summary":" Federated Class Continual Learning (FCCL) merges the challenges of\ndistributed client learning with the need for seamless adaptation to new\nclasses without forgetting old ones. The key challenge in FCCL is catastrophic\nforgetting, an issue that has been explored to some extent in Continual\nLearning (CL). However, due to privacy preservation requirements, some\nconventional methods, such as experience replay, are not directly applicable to\nFCCL. Existing FCCL methods mitigate forgetting by generating historical data\nthrough federated training of GANs or data-free knowledge distillation.\nHowever, these approaches often suffer from unstable training of generators or\nlow-quality generated data, limiting their guidance for the model. To address\nthis challenge, we propose a novel method of data replay based on diffusion\nmodels. Instead of training a diffusion model, we employ a pre-trained\nconditional diffusion model to reverse-engineer each class, searching the\ncorresponding input conditions for each class within the model's input space,\nsignificantly reducing computational resources and time consumption while\nensuring effective generation. Furthermore, we enhance the classifier's domain\ngeneralization ability on generated and real data through contrastive learning,\nindirectly improving the representational capability of generated data for real\ndata. Comprehensive experiments demonstrate that our method significantly\noutperforms existing baselines. 
Code is available at\nhttps://github.com/jinglin-liang/DDDR.\n","authors":["Jinglin Liang","Jin Zhong","Hanlin Gu","Zhongqi Lu","Xingxing Tang","Gang Dai","Shuangping Huang","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01128v2.pdf","comment":"Accepted by ECCV 2024 Oral"},{"id":"http://arxiv.org/abs/2409.02388v1","updated":"2024-09-04T02:31:53Z","published":"2024-09-04T02:31:53Z","title":"Gaussian Rate-Distortion-Perception Coding and Entropy-Constrained\n Scalar Quantization","summary":" This paper investigates the best known bounds on the quadratic Gaussian\ndistortion-rate-perception function with limited common randomness for the\nKullback-Leibler divergence-based perception measure, as well as their\ncounterparts for the squared Wasserstein-2 distance-based perception measure,\nrecently established by Xie et al. These bounds are shown to be nondegenerate\nin the sense that they cannot be deduced from each other via a refined version\nof Talagrand's transportation inequality. On the other hand, an improved lower\nbound is established when the perception measure is given by the squared\nWasserstein-2 distance. In addition, it is revealed by exploiting the\nconnection between rate-distortion-perception coding and entropy-constrained\nscalar quantization that all the aforementioned bounds are generally not tight\nin the weak perception constraint regime.\n","authors":["Li Xie","Liangyan Li","Jun Chen","Lei Yu","Zhongshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11055v2","updated":"2024-09-04T02:24:25Z","published":"2023-05-18T15:50:33Z","title":"Small noise analysis for Tikhonov and RKHS regularizations","summary":" Regularization plays a pivotal role in ill-posed machine learning and inverse\nproblems. However, the fundamental comparative analysis of various\nregularization norms remains open. We establish a small noise analysis\nframework to assess the effects of norms in Tikhonov and RKHS regularizations,\nin the context of ill-posed linear inverse problems with Gaussian noise. This\nframework studies the convergence rates of regularized estimators in the small\nnoise limit and reveals the potential instability of the conventional\nL2-regularizer. We solve such instability by proposing an innovative class of\nadaptive fractional RKHS regularizers, which covers the L2 Tikhonov and RKHS\nregularizations by adjusting the fractional smoothness parameter. A surprising\ninsight is that over-smoothing via these fractional RKHSs consistently yields\noptimal convergence rates, but the optimal hyper-parameter may decay too fast\nto be selected in practice.\n","authors":["Quanjun Lang","Fei Lu"],"pdf_url":"https://arxiv.org/pdf/2305.11055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01731v2","updated":"2024-09-04T02:23:10Z","published":"2024-09-03T09:14:21Z","title":"Stacked ensemble\\-based mutagenicity prediction model using multiple\n modalities with graph attention network","summary":" Mutagenicity is a concern due to its association with genetic mutations which\ncan result in a variety of negative consequences, including the development of\ncancer. Earlier identification of mutagenic compounds in the drug development\nprocess is therefore crucial for preventing the progression of unsafe\ncandidates and reducing development costs. While computational techniques,\nespecially machine learning models have become increasingly prevalent for this\nendpoint, they rely on a single modality. 
In this work, we introduce a novel\nstacked ensemble based mutagenicity prediction model which incorporates multiple\nmodalities such as simplified molecular input line entry system (SMILES) and\nmolecular graph. These modalities capture diverse information about molecules\nsuch as substructural, physicochemical, geometrical and topological. To derive\nsubstructural, geometrical and physicochemical information, we use SMILES,\nwhile topological information is extracted through a graph attention network\n(GAT) via molecular graph. Our model uses a stacked ensemble of machine\nlearning classifiers to make predictions using these multiple features. We\nemploy the explainable artificial intelligence (XAI) technique SHAP (Shapley\nAdditive Explanations) to determine the significance of each classifier and the\nmost relevant features in the prediction. We demonstrate that our method\nsurpasses SOTA methods on two standard datasets across various metrics.\nNotably, we achieve an area under the curve of 95.21\\% on the Hansen benchmark\ndataset, affirming the efficacy of our method in predicting mutagenicity. We\nbelieve that this research will captivate the interest of both clinicians and\ncomputational biologists engaged in translational research.\n","authors":["Tanya Liyaqat","Tanvir Ahmad","Mohammad Kashif","Chandni Saxena"],"pdf_url":"https://arxiv.org/pdf/2409.01731v2.pdf","comment":"Submitted to a journal"},{"id":"http://arxiv.org/abs/2406.09246v2","updated":"2024-09-04T02:14:57Z","published":"2024-06-13T15:46:55Z","title":"OpenVLA: An Open-Source Vision-Language-Action Model","summary":" Large policies pretrained on a combination of Internet-scale vision-language\ndata and diverse robot demonstrations have the potential to change how we teach\nrobots new skills: rather than training new behaviors from scratch, we can\nfine-tune such vision-language-action (VLA) models to obtain robust,\ngeneralizable policies for visuomotor control. Yet, widespread adoption of VLAs\nfor robotics has been challenging as 1) existing VLAs are largely closed and\ninaccessible to the public, and 2) prior work fails to explore methods for\nefficiently fine-tuning VLAs for new tasks, a key component for adoption.\nAddressing these challenges, we introduce OpenVLA, a 7B-parameter open-source\nVLA trained on a diverse collection of 970k real-world robot demonstrations.\nOpenVLA builds on a Llama 2 language model combined with a visual encoder that\nfuses pretrained features from DINOv2 and SigLIP. As a product of the added\ndata diversity and new model components, OpenVLA demonstrates strong results\nfor generalist manipulation, outperforming closed models such as RT-2-X (55B)\nby 16.5% in absolute task success rate across 29 tasks and multiple robot\nembodiments, with 7x fewer parameters. We further show that we can effectively\nfine-tune OpenVLA for new settings, with especially strong generalization\nresults in multi-task environments involving multiple objects and strong\nlanguage grounding abilities, and outperform expressive from-scratch imitation\nlearning methods such as Diffusion Policy by 20.4%. We also explore compute\nefficiency; as a separate contribution, we show that OpenVLA can be fine-tuned\non consumer GPUs via modern low-rank adaptation methods and served efficiently\nvia quantization without a hit to downstream success rate.
Finally, we release\nmodel checkpoints, fine-tuning notebooks, and our PyTorch codebase with\nbuilt-in support for training VLAs at scale on Open X-Embodiment datasets.\n","authors":["Moo Jin Kim","Karl Pertsch","Siddharth Karamcheti","Ted Xiao","Ashwin Balakrishna","Suraj Nair","Rafael Rafailov","Ethan Foster","Grace Lam","Pannag Sanketi","Quan Vuong","Thomas Kollar","Benjamin Burchfiel","Russ Tedrake","Dorsa Sadigh","Sergey Levine","Percy Liang","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2406.09246v2.pdf","comment":"Website: https://openvla.github.io/"},{"id":"http://arxiv.org/abs/2404.04298v2","updated":"2024-09-04T02:00:58Z","published":"2024-04-04T20:27:37Z","title":"SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated\n Responses","summary":" Can LLMs consistently improve their previous outputs for better results? For\nthis to be true, LLMs would need to be better at discriminating among\npreviously-generated alternatives, than generating initial responses. We\nexplore the validity of this hypothesis in practice. We first formulate a\nunified framework that allows us to compare the generative and discriminative\ncapability of any model on any task. In our resulting experimental analysis of\nseveral open-source and industrial LLMs, we observe that models are not\nreliably better at discriminating among previously-generated alternatives than\ngenerating initial responses. This finding challenges the notion that LLMs may\nbe able to enhance their performance only through their own judgment.\n","authors":["Dongwei Jiang","Jingyu Zhang","Orion Weller","Nathaniel Weir","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.04298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02374v1","updated":"2024-09-04T01:47:01Z","published":"2024-09-04T01:47:01Z","title":"Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable\n Image Editing","summary":" Recently, diffusion models have emerged as a powerful class of generative\nmodels. Despite their success, there is still limited understanding of their\nsemantic spaces. This makes it challenging to achieve precise and disentangled\nimage generation without additional training, especially in an unsupervised\nway. In this work, we improve the understanding of their semantic spaces from\nintriguing observations: among a certain range of noise levels, (1) the learned\nposterior mean predictor (PMP) in the diffusion model is locally linear, and\n(2) the singular vectors of its Jacobian lie in low-dimensional semantic\nsubspaces. We provide a solid theoretical basis to justify the linearity and\nlow-rankness in the PMP. These insights allow us to propose an unsupervised,\nsingle-step, training-free LOw-rank COntrollable image editing (LOCO Edit)\nmethod for precise local editing in diffusion models. LOCO Edit identified\nediting directions with nice properties: homogeneity, transferability,\ncomposability, and linearity. These properties of LOCO Edit benefit greatly\nfrom the low-dimensional semantic subspace. Our method can further be extended\nto unsupervised or text-supervised editing in various text-to-image diffusion\nmodels (T-LOCO Edit). Finally, extensive empirical experiments demonstrate the\neffectiveness and efficiency of LOCO Edit. 
The codes will be released at\nhttps://github.com/ChicyChen/LOCO-Edit.\n","authors":["Siyi Chen","Huijie Zhang","Minzhe Guo","Yifu Lu","Peng Wang","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02363v1","updated":"2024-09-04T01:18:55Z","published":"2024-09-04T01:18:55Z","title":"Optimal Neural Network Approximation for High-Dimensional Continuous\n Functions","summary":" Recently, the authors of Shen Yang Zhang (JMLR, 2022) developed a neural\nnetwork with width $36d(2d + 1)$ and depth $11$, which utilizes a special\nactivation function called the elementary universal activation function, to\nachieve the super approximation property for functions in $C([a,b]^d)$. That\nis, the constructed network only requires a fixed number of neurons to\napproximate a $d$-variate continuous function on a $d$-dimensional hypercube\nwith arbitrary accuracy. Their network uses $\\mathcal{O}(d^2)$ fixed neurons.\nOne natural question to address is whether we can reduce the number of these\nneurons in such a network. By leveraging a variant of the Kolmogorov\nSuperposition Theorem, our analysis shows that there is a neural network\ngenerated by the elementary universal activation function with only $366d +365$\nfixed, intrinsic (non-repeated) neurons that attains this super approximation\nproperty. Furthermore, we present a family of continuous functions that\nrequires at least width $d$, and therefore at least $d$ intrinsic neurons, to\nachieve arbitrary accuracy in its approximation. This shows that the\nrequirement of $\\mathcal{O}(d)$ intrinsic neurons is optimal in the sense that\nit grows linearly with the input dimension $d$, unlike some approximation\nmethods where parameters may grow exponentially with $d$.\n","authors":["Ayan Maiti","Michelle Michelle","Haizhao Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13493v2","updated":"2024-09-04T01:00:12Z","published":"2024-08-24T06:32:30Z","title":"Thresholded Lexicographic Ordered Multiobjective Reinforcement Learning","summary":" Lexicographic multi-objective problems, which impose a lexicographic\nimportance order over the objectives, arise in many real-life scenarios.\nExisting Reinforcement Learning work directly addressing lexicographic tasks\nhas been scarce. The few proposed approaches were all noted to be heuristics\nwithout theoretical guarantees as the Bellman equation is not applicable to\nthem. Additionally, the practical applicability of these prior approaches also\nsuffers from various issues such as not being able to reach the goal state.\nWhile some of these issues have been known before, in this work we investigate\nfurther shortcomings, and propose fixes for improving practical performance in\nmany cases. We also present a policy optimization approach using our\nLexicographic Projection Optimization (LPO) algorithm that has the potential to\naddress these theoretical and practical concerns. Finally, we demonstrate our\nproposed algorithms on benchmark problems.\n","authors":["Alperen Tercan","Vinayak S. 
Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.13493v2.pdf","comment":"Full version of ECAI 2024 paper"},{"id":"http://arxiv.org/abs/2408.15221v2","updated":"2024-09-04T00:58:59Z","published":"2024-08-27T17:33:30Z","title":"LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet","summary":" Recent large language model (LLM) defenses have greatly improved models'\nability to refuse harmful queries, even when adversarially attacked. However,\nLLM defenses are primarily evaluated against automated adversarial attacks in a\nsingle turn of conversation, an insufficient threat model for real-world\nmalicious use. We demonstrate that multi-turn human jailbreaks uncover\nsignificant vulnerabilities, exceeding 70% attack success rate (ASR) on\nHarmBench against defenses that report single-digit ASRs with automated\nsingle-turn attacks. Human jailbreaks also reveal vulnerabilities in machine\nunlearning defenses, successfully recovering dual-use biosecurity knowledge\nfrom unlearned models. We compile these results into Multi-Turn Human\nJailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks.\nWe publicly release MHJ alongside a compendium of jailbreak tactics developed\nacross dozens of commercial red teaming engagements, supporting research\ntowards stronger LLM defenses.\n","authors":["Nathaniel Li","Ziwen Han","Ian Steneker","Willow Primack","Riley Goodside","Hugh Zhang","Zifan Wang","Cristina Menghini","Summer Yue"],"pdf_url":"https://arxiv.org/pdf/2408.15221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02349v1","updated":"2024-09-04T00:35:55Z","published":"2024-09-04T00:35:55Z","title":"Machine Learning Applications to Computational Plasma Physics and\n Reduced-Order Plasma Modeling: A Perspective","summary":" Machine learning (ML) provides a broad spectrum of tools and architectures\nthat enable the transformation of data from simulations and experiments into\nuseful and explainable science, thereby augmenting domain knowledge.\nFurthermore, ML-enhanced numerical modelling can revamp scientific computing\nfor real-world complex engineering systems, creating unique opportunities to\nexamine the operation of the technologies in detail and automate their\noptimization and control. In recent years, ML applications have seen\nsignificant growth across various scientific domains, particularly in fluid\nmechanics, where ML has shown great promise in enhancing computational modeling\nof fluid flows. In contrast, ML applications in numerical plasma physics\nresearch remain relatively limited in scope and extent. Despite this, the close\nrelationship between fluid mechanics and plasma physics presents a valuable\nopportunity to create a roadmap for transferring ML advances in fluid flow\nmodeling to computational plasma physics. This Perspective aims to outline such\na roadmap. We begin by discussing some general fundamental aspects of ML,\nincluding the various categories of ML algorithms and the different types of\nproblems that can be solved with the help of ML. With regard to each problem\ntype, we then present specific examples from the use of ML in computational\nfluid dynamics, reviewing several insightful prior efforts. We also review\nrecent ML applications in plasma physics for each problem type. The paper\ndiscusses promising future directions and development pathways for ML in plasma\nmodelling within the different application areas. 
Additionally, we point out\nprominent challenges that must be addressed to realize ML's full potential in\ncomputational plasma physics, including the need for cost-effective\nhigh-fidelity simulation tools for extensive data generation.\n","authors":["Farbod Faraji","Maryam Reza"],"pdf_url":"https://arxiv.org/pdf/2409.02349v1.pdf","comment":"42 pages, 20 figures"},{"id":"http://arxiv.org/abs/2409.02347v1","updated":"2024-09-04T00:24:57Z","published":"2024-09-04T00:24:57Z","title":"Understanding the Role of Functional Diversity in Weight-Ensembling with\n Ingredient Selection and Multidimensional Scaling","summary":" Weight-ensembles are formed when the parameters of multiple neural networks\nare directly averaged into a single model. They have demonstrated\ngeneralization capability in-distribution (ID) and out-of-distribution (OOD)\nwhich is not completely understood, though they are thought to successfully\nexploit functional diversity allotted by each distinct model. Given a\ncollection of models, it is also unclear which combination leads to the optimal\nweight-ensemble; the SOTA is a linear-time ``greedy\" method. We introduce two\nnovel weight-ensembling approaches to study the link between performance\ndynamics and the nature of how each method decides to apply the\nfunctionally diverse components, akin to diversity-encouragement in the\nprediction-ensemble literature. We develop a visualization tool to explain how\neach algorithm explores various domains defined via pairwise-distances to\nfurther investigate selection and algorithms' convergence. Empirical analyses\nshed perspectives which reinforce how high-diversity enhances weight-ensembling\nwhile qualifying the extent to which diversity alone improves accuracy. We also\ndemonstrate that sampling positionally distinct models can contribute just as\nmeaningfully to improvements in a weight-ensemble.\n","authors":["Alex Rojas","David Alvarez-Melis"],"pdf_url":"https://arxiv.org/pdf/2409.02347v1.pdf","comment":"Published at the ICML 2024 (Vienna, Austria) Workshop on Foundation\n Models in the Wild"},{"id":"http://arxiv.org/abs/2408.06266v4","updated":"2024-09-04T00:22:45Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%.
Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02346v1","updated":"2024-09-04T00:20:55Z","published":"2024-09-04T00:20:55Z","title":"Robust Federated Finetuning of Foundation Models via Alternating\n Minimization of LoRA","summary":" Parameter-Efficient Fine-Tuning (PEFT) has risen as an innovative training\nstrategy that updates only a select few model parameters, significantly\nlowering both computational and memory demands. PEFT also helps to decrease\ndata transfer in federated learning settings, where communication depends on\nthe size of updates. In this work, we explore the constraints of previous\nstudies that integrate a well-known PEFT method named LoRA with federated\nfine-tuning, then introduce RoLoRA, a robust federated fine-tuning framework\nthat utilizes an alternating minimization approach for LoRA, providing greater\nrobustness against decreasing fine-tuning parameters and increasing data\nheterogeneity. Our results indicate that RoLoRA not only presents the\ncommunication benefits but also substantially enhances the robustness and\neffectiveness in multiple federated fine-tuning scenarios.\n","authors":["Shuangyi Chen","Yue Ju","Hardik Dalal","Zhongwen Zhu","Ashish Khisti"],"pdf_url":"https://arxiv.org/pdf/2409.02346v1.pdf","comment":"Presented at ES-FOMO-II@ICML2024"},{"id":"http://arxiv.org/abs/2409.02343v1","updated":"2024-09-04T00:10:36Z","published":"2024-09-04T00:10:36Z","title":"NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for\n Retrieval","summary":" $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval)\nfrom pre-trained embedding models is the predominant retrieval method for text\nand images, as well as Retrieval-Augmented Generation (RAG) pipelines. In\npractice, application developers often fine-tune the embeddings to improve\ntheir accuracy on the dataset and query workload in hand. Existing approaches\neither fine-tune the pre-trained model itself or, more efficiently, but at the\ncost of accuracy, train adaptor models to transform the output of the\npre-trained model. We present NUDGE, a family of novel non-parametric embedding\nfine-tuning approaches that are significantly more accurate and efficient than\nboth sets of existing approaches. NUDGE directly modifies the embeddings of\ndata records to maximize the accuracy of $k$-NN retrieval. We present a\nthorough theoretical and experimental study of NUDGE's non-parametric approach.\nWe show that even though the underlying problem is NP-Hard, constrained\nvariations can be solved efficiently. These constraints additionally ensure\nthat the changes to the embeddings are modest, avoiding large distortions to\nthe semantics learned during pre-training. In experiments across five\npre-trained models and nine standard text and image retrieval datasets, NUDGE\nruns in minutes and often improves NDCG@10 by more than 10% over existing\nfine-tuning methods. 
On average, NUDGE provides 3.3x and 4.3x higher increase\nin accuracy and runs 200x and 3x faster, respectively, over fine-tuning the\npre-trained model and training adaptors.\n","authors":["Sepanta Zeighami","Zac Wellmer","Aditya Parameswaran"],"pdf_url":"https://arxiv.org/pdf/2409.02343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02342v1","updated":"2024-09-04T00:06:23Z","published":"2024-09-04T00:06:23Z","title":"Optimal sampling for least-squares approximation","summary":" Least-squares approximation is one of the most important methods for\nrecovering an unknown function from data. While in many applications the data\nis fixed, in many others there is substantial freedom to choose where to\nsample. In this paper, we review recent progress on optimal sampling for\n(weighted) least-squares approximation in arbitrary linear spaces. We introduce\nthe Christoffel function as a key quantity in the analysis of (weighted)\nleast-squares approximation from random samples, then show how it can be used\nto construct sampling strategies that possess near-optimal sample complexity:\nnamely, the number of samples scales log-linearly in $n$, the dimension of the\napproximation space. We discuss a series of variations, extensions and further\ntopics, and throughout highlight connections to approximation theory, machine\nlearning, information-based complexity and numerical linear algebra. Finally,\nmotivated by various contemporary applications, we consider a generalization of\nthe classical setting where the samples need not be pointwise samples of a\nscalar-valued function, and the approximation space need not be linear. We show\nthat even in this significantly more general setting suitable generalizations\nof the Christoffel function still determine the sample complexity. This\nprovides a unified procedure for designing improved sampling strategies for\ngeneral recovery problems. This article is largely self-contained, and intended\nto be accessible to nonspecialists.\n","authors":["Ben Adcock"],"pdf_url":"https://arxiv.org/pdf/2409.02342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02078v2","updated":"2024-09-04T00:06:20Z","published":"2023-12-04T17:41:52Z","title":"From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video\n Solution to Enhance Community Safety","summary":" This article adopts and evaluates an AI-enabled Smart Video Solution (SVS)\ndesigned to enhance safety in the real world. The system integrates with\nexisting infrastructure camera networks, leveraging recent advancements in AI\nfor easy adoption. Prioritizing privacy and ethical standards, pose based data\nis used for downstream AI tasks such as anomaly detection. Cloud-based\ninfrastructure and mobile app are deployed, enabling real-time alerts within\ncommunities. The SVS employs innovative data representation and visualization\ntechniques, such as the Occupancy Indicator, Statistical Anomaly Detection,\nBird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance\npublic safety. Evaluation of the SVS demonstrates its capacity to convert\ncomplex computer vision outputs into actionable insights for stakeholders,\ncommunity partners, law enforcement, urban planners, and social scientists.\nThis article presents a comprehensive real-world deployment and evaluation of\nthe SVS, implemented in a community college environment across 16 cameras. 
The\nsystem integrates AI-driven visual processing, supported by statistical\nanalysis, database management, cloud communication, and user notifications.\nAdditionally, the article evaluates the end-to-end latency from the moment an\nAI algorithm detects anomalous behavior in real-time at the camera level to the\ntime stakeholders receive a notification. The results demonstrate the system's\nrobustness, effectively managing 16 CCTV cameras with a consistent throughput\nof 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end\nlatency of 26.76 seconds between anomaly detection and alert issuance.\n","authors":["Shanle Yao","Babak Rahimi Ardabili","Armin Danesh Pazho","Ghazal Alinezhad Noghre","Christopher Neff","Lauren Bourque","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2312.02078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02339v1","updated":"2024-09-04T00:01:15Z","published":"2024-09-04T00:01:15Z","title":"Data-driven 2D stationary quantum droplets and wave propagations in the\n amended GP equation with two potentials via deep neural networks learning","summary":" In this paper, we develop a systematic deep learning approach to solve\ntwo-dimensional (2D) stationary quantum droplets (QDs) and investigate their\nwave propagation in the 2D amended Gross-Pitaevskii equation with\nLee-Huang-Yang correction and two kinds of potentials. Firstly, we use the\ninitial-value iterative neural network (IINN) algorithm for 2D stationary\nquantum droplets of stationary equations. Then the learned stationary QDs are\nused as the initial value conditions for physics-informed neural networks\n(PINNs) to explore their evolutions in some space-time region. Especially,\nwe consider two types of potentials, one is the 2D quadruple-well Gaussian\npotential and the other is the PT-symmetric HO-Gaussian potential, which lead\nto spontaneous symmetry breaking and the generation of multi-component QDs. The\nused deep learning method can also be applied to study wave propagations of\nother nonlinear physical models.\n","authors":["Jin Song","Zhenya Yan"],"pdf_url":"https://arxiv.org/pdf/2409.02339v1.pdf","comment":"17 pages, 12 figures (Proc. R. Soc. A, accepted for publication).\n arXiv admin note: text overlap with arXiv:2409.01124"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.02889v1","updated":"2024-09-04T17:25:21Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness.
LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v1.pdf","comment":"19 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.02845v1","updated":"2024-09-04T16:17:41Z","published":"2024-09-04T16:17:41Z","title":"Multi-Track MusicLDM: Towards Versatile Music Generation with Latent\n Diffusion Model","summary":" Diffusion models have shown promising results in cross-modal generation tasks\ninvolving audio and music, such as text-to-sound and text-to-music generation.\nThese text-controlled music generation models typically focus on generating\nmusic by capturing global musical attributes like genre and mood. However,\nmusic composition is a complex, multilayered task that often involves musical\narrangement as an integral part of the process. This process involves composing\neach instrument to align with existing ones in terms of beat, dynamics,\nharmony, and melody, requiring greater precision and control over tracks than\ntext prompts usually provide. In this work, we address these challenges by\nextending the MusicLDM, a latent diffusion model for music, into a multi-track\ngenerative model. By learning the joint probability of tracks sharing a\ncontext, our model is capable of generating music across several tracks that\ncorrespond well to each other, either conditionally or unconditionally.\nAdditionally, our model is capable of arrangement generation, where the model\ncan generate any subset of tracks given the others (e.g., generating a piano\ntrack complementing given bass and drum tracks). We compared our model with an\nexisting multi-track generative model and demonstrated that our model achieves\nconsiderable improvements across objective metrics for both total and\narrangement generation tasks.\n","authors":["Tornike Karchkhadze","Mohammad Rasool Izadi","Ke Chen","Gerard Assayag","Shlomo Dubnov"],"pdf_url":"https://arxiv.org/pdf/2409.02845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02828v1","updated":"2024-09-04T15:50:16Z","published":"2024-09-04T15:50:16Z","title":"ExpLLM: Towards Chain of Thought for Facial Expression Recognition","summary":" Facial expression recognition (FER) is a critical task in multimedia with\nsignificant implications across various domains. However, analyzing the causes\nof facial expressions is essential for accurately recognizing them. Current\napproaches, such as those based on facial action units (AUs), typically provide\nAU names and intensities but lack insight into the interactions and\nrelationships between AUs and the overall expression. In this paper, we propose\na novel method called ExpLLM, which leverages large language models to generate\nan accurate chain of thought (CoT) for facial expression recognition.\nSpecifically, we have designed the CoT mechanism from three key perspectives:\nkey observations, overall emotional interpretation, and conclusion. The key\nobservations describe the AU's name, intensity, and associated emotions. 
The\noverall emotional interpretation provides an analysis based on multiple AUs and\ntheir interactions, identifying the dominant emotions and their relationships.\nFinally, the conclusion presents the final expression label derived from the\npreceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed\nto construct this expression CoT and generate instruction-description data for\ntraining our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets\ndemonstrate that ExpLLM outperforms current state-of-the-art FER methods.\nExpLLM also surpasses the latest GPT-4o in expression CoT generation,\nparticularly in recognizing micro-expressions where GPT-4o frequently fails.\n","authors":["Xing Lan","Jian Xue","Ji Qi","Dongmei Jiang","Ke Lu","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2409.02828v1.pdf","comment":"project page: https://starhiking.github.io/ExpLLM_Page/"},{"id":"http://arxiv.org/abs/2409.02657v1","updated":"2024-09-04T12:30:25Z","published":"2024-09-04T12:30:25Z","title":"PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for\n One-Shot Talking Head Generation","summary":" While previous audio-driven talking head generation (THG) methods generate\nhead poses from driving audio, the generated poses or lips cannot match the\naudio well or are not editable. In this study, we propose \\textbf{PoseTalk}, a\nTHG system that can freely generate lip-synchronized talking head videos with\nfree head poses conditioned on text prompts and audio. The core insight of our\nmethod is using head pose to connect visual, linguistic, and audio signals.\nFirst, we propose to generate poses from both audio and text prompts, where the\naudio offers short-term variations and rhythm correspondence of the head\nmovements and the text prompts describe the long-term semantics of head\nmotions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to\ngenerate motion latent from text prompts and audio cues in a pose latent space.\nSecond, we observe a loss-imbalance problem: the loss for the lip region\ncontributes less than 4\\% of the total reconstruction loss caused by both pose\nand lip, making optimization lean towards head movements rather than lip\nshapes. To address this issue, we propose a refinement-based learning strategy\nto synthesize natural talking videos using two cascaded networks, i.e.,\nCoarseNet, and RefineNet. The CoarseNet estimates coarse motions to produce\nanimated images in novel poses and the RefineNet focuses on learning finer lip\nmotions by progressively estimating lip motions from low-to-high resolutions,\nyielding improved lip-synchronization performance. Experiments demonstrate our\npose prediction strategy achieves better pose diversity and realness compared\nto text-only or audio-only, and our video generator model outperforms\nstate-of-the-art methods in synthesizing talking videos with natural head\nmotions. Project: https://junleen.github.io/projects/posetalk.\n","authors":["Jun Ling","Yiwen Wang","Han Xue","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2409.02657v1.pdf","comment":"7+5 pages, 15 figures"},{"id":"http://arxiv.org/abs/2409.02555v1","updated":"2024-09-04T09:21:13Z","published":"2024-09-04T09:21:13Z","title":"Low-Resolution Object Recognition with Cross-Resolution Relational\n Contrastive Distillation","summary":" Recognizing objects in low-resolution images is a challenging task due to the\nlack of informative details. 
Recent studies have shown that knowledge\ndistillation approaches can effectively transfer knowledge from a\nhigh-resolution teacher model to a low-resolution student model by aligning\ncross-resolution representations. However, these approaches still face\nlimitations in adapting to the situation where the recognized objects exhibit\nsignificant representation discrepancies between training and testing images.\nIn this study, we propose a cross-resolution relational contrastive\ndistillation approach to facilitate low-resolution object recognition. Our\napproach enables the student model to mimic the behavior of a well-trained\nteacher model which delivers high accuracy in identifying high-resolution\nobjects. To extract sufficient knowledge, the student learning is supervised\nwith contrastive relational distillation loss, which preserves the similarities\nin various relational structures in contrastive representation space. In this\nmanner, the capability of recovering missing details of familiar low-resolution\nobjects can be effectively enhanced, leading to a better knowledge transfer.\nExtensive experiments on low-resolution object classification and\nlow-resolution face recognition clearly demonstrate the effectiveness and\nadaptability of our approach.\n","authors":["Kangkai Zhang","Shiming Ge","Ruixin Shi","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02555v1.pdf","comment":"This paper is accepted by IEEE Transactions on Circuits and Systems\n for Video Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2409.02453v1","updated":"2024-09-04T05:19:57Z","published":"2024-09-04T05:19:57Z","title":"FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video\n Reconstruction in Resource and Timing Constrained Network Settings","summary":" Despite the growing adoption of video processing via Internet of Things (IoT)\ndevices due to their cost-effectiveness, transmitting captured data to nearby\nservers poses challenges due to varying timing constraints and scarcity of\nnetwork bandwidth. Existing video compression methods face difficulties in\nrecovering compressed data when incomplete data is provided. Here, we introduce\n\\emph{\\project}, a deep-learning based solution that utilizes previously\nreceived data to predict the missing segments of a frame, enabling the\nreconstruction of a frame from partially received data.\n","authors":["John Li","Shehab Sarar Ahmed","Deepak Nair"],"pdf_url":"https://arxiv.org/pdf/2409.02453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15461v2","updated":"2024-09-04T02:45:56Z","published":"2024-08-28T00:54:51Z","title":"Hand1000: Generating Realistic Hands from Text with Only 1,000 Images","summary":" Text-to-image generation models have achieved remarkable advancements in\nrecent years, aiming to produce realistic images from textual descriptions.\nHowever, these models often struggle with generating anatomically accurate\nrepresentations of human hands. The resulting images frequently exhibit issues\nsuch as incorrect numbers of fingers, unnatural twisting or interlacing of\nfingers, or blurred and indistinct hands. These issues stem from the inherent\ncomplexity of hand structures and the difficulty in aligning textual\ndescriptions with precise visual depictions of hands. To address these\nchallenges, we propose a novel approach named Hand1000 that enables the\ngeneration of realistic hand images with target gesture using only 1,000\ntraining samples. 
The training of Hand1000 is divided into three stages with\nthe first stage aiming to enhance the model's understanding of hand anatomy by\nusing a pre-trained hand gesture recognition model to extract gesture\nrepresentation. The second stage further optimizes text embedding by\nincorporating the extracted hand gesture representation, to improve alignment\nbetween the textual descriptions and the generated hand images. The third stage\nutilizes the optimized embedding to fine-tune the Stable Diffusion model to\ngenerate realistic hand images. In addition, we construct the first publicly\navailable dataset specifically designed for text-to-hand image generation.\nBased on the existing hand gesture recognition dataset, we adopt advanced image\ncaptioning models and LLaMA3 to generate high-quality textual descriptions\nenriched with detailed gesture information. Extensive experiments demonstrate\nthat Hand1000 significantly outperforms existing models in producing\nanatomically correct hand images while faithfully representing other details in\nthe text, such as faces, clothing, and colors.\n","authors":["Haozhuo Zhang","Bin Zhu","Yu Cao","Yanbin Hao"],"pdf_url":"https://arxiv.org/pdf/2408.15461v2.pdf","comment":"Project page https://haozhuo-zhang.github.io/Hand1000-project-page/"},{"id":"http://arxiv.org/abs/2409.02376v1","updated":"2024-09-04T01:54:20Z","published":"2024-09-04T01:54:20Z","title":"Coral Model Generation from Single Images for Virtual Reality\n Applications","summary":" With the rapid development of VR technology, the demand for high-quality 3D\nmodels is increasing. Traditional methods struggle with efficiency and quality\nin large-scale customization. This paper introduces a deep-learning framework\nthat generates high-precision 3D coral models from a single image. Using the\nCoral dataset, the framework extracts geometric and texture features, performs\n3D reconstruction, and optimizes design and material blending. Advanced\noptimization and polygon count control ensure shape accuracy, detail retention,\nand flexible output for various complexities, catering to high-quality\nrendering and real-time interaction needs.The project incorporates Explainable\nAI (XAI) to transform AI-generated models into interactive \"artworks,\" best\nviewed in VR and XR. This enhances model interpretability and human-machine\ncollaboration. Real-time feedback in VR interactions displays information like\ncoral species and habitat, enriching user experience. The generated models\nsurpass traditional methods in detail, visual quality, and efficiency. This\nresearch offers an intelligent approach to 3D content creation for VR, lowering\nproduction barriers, and promoting widespread VR applications. Additionally,\nintegrating XAI provides new insights into AI-generated visual content and\nadvances research in 3D vision interpretability.\n","authors":["Jie Fu","Shun Fu","Mick Grierson"],"pdf_url":"https://arxiv.org/pdf/2409.02376v1.pdf","comment":"In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts\n 2024) arXiv:2406.14485"},{"id":"http://arxiv.org/abs/2408.11593v3","updated":"2024-09-04T01:25:55Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. 
However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v3.pdf","comment":"Accepted by NCMMSC2024"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| 
z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 68 + +
+
+
+ + ☆ RoboTwin: Dual-Arm Robot Benchmark with Generative Digital Twins (early + version) + + +
+ Effective collaboration of dual-arm robots and their tool use capabilities +are increasingly important areas in the advancement of robotics. These skills +play a significant role in expanding robots' ability to operate in diverse +real-world environments. However, progress is impeded by the scarcity of +specialized training data. This paper introduces RoboTwin, a novel benchmark +dataset combining real-world teleoperated data with synthetic data from digital +twins, designed for dual-arm robotic scenarios. Using the COBOT Magic platform, +we have collected diverse data on tool usage and human-robot interaction. We +present an innovative approach to creating digital twins using AI-generated +content, transforming 2D images into detailed 3D models. Furthermore, we +utilize large language models to generate expert-level training data and +task-specific pose sequences oriented toward functionality. Our key +contributions are: 1) the RoboTwin benchmark dataset, 2) an efficient +real-to-simulation pipeline, and 3) the use of language models for automatic +expert-level data generation. These advancements are designed to address the +shortage of robotic training data, potentially accelerating the development of +more capable and versatile robotic systems for a wide range of real-world +applications. The project page is available at +https://robotwin-benchmark.github.io/early-version/ + 
+
+ comment: Project page: https://robotwin-benchmark.github.io/early-version/ +
+
+
+
+
+ + ☆ Masked Diffusion Models are Secretly Time-Agnostic Masked Models and + Exploit Inaccurate Categorical Sampling + + +
+ Masked diffusion models (MDMs) have emerged as a popular research topic for +generative modeling of discrete data, thanks to their superior performance over +other discrete diffusion models, and are rivaling the auto-regressive models +(ARMs) for language modeling tasks. The recent effort in simplifying the masked +diffusion framework further leads to alignment with continuous-space diffusion +models and more principled training and sampling recipes. In this paper, +however, we reveal that both training and sampling of MDMs are theoretically +free from the time variable, arguably the key signature of diffusion models, +and are instead equivalent to masked models. The connection on the sampling +aspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we +show that the FHS is theoretically equivalent to MDMs' original generation +process while significantly alleviating the time-consuming categorical sampling +and achieving a 20$\times$ speedup. In addition, our investigation challenges +previous claims that MDMs can surpass ARMs in generative perplexity. We +identify, for the first time, an underlying numerical issue, even with the +32-bit floating-point precision, which results in inaccurate categorical +sampling. We show that the numerical issue lowers the effective temperature +both theoretically and empirically, leading to unfair assessments of MDMs' +generation results in the previous literature. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ☆ LongCite: Enabling LLMs to Generate Fine-grained Citations in + Long-context QA + + +
+ Though current long-context large language models (LLMs) have demonstrated +impressive capacities in answering user questions based on extensive text, the +lack of citations in their responses makes user verification difficult, leading +to concerns about their trustworthiness due to their potential hallucinations. +In this work, we aim to enable long-context LLMs to generate responses with +fine-grained sentence-level citations, improving their faithfulness and +verifiability. We first introduce LongBench-Cite, an automated benchmark for +assessing current LLMs' performance in Long-Context Question Answering with +Citations (LQAC), revealing considerable room for improvement. To this end, we +propose CoF (Coarse to Fine), a novel pipeline that utilizes off-the-shelf LLMs +to automatically generate long-context QA instances with precise sentence-level +citations, and leverage this pipeline to construct LongCite-45k, a large-scale +SFT dataset for LQAC. Finally, we train LongCite-8B and LongCite-9B using the +LongCite-45k dataset, successfully enabling their generation of accurate +responses and fine-grained sentence-level citations in a single output. The +evaluation results on LongBench-Cite show that our trained models achieve +state-of-the-art citation quality, surpassing advanced proprietary models +including GPT-4o. + +
+
+
+
+
+ + ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language +Models~(MLLMs) is crucial for video understanding, high-resolution image +understanding, and multi-modal agents. This involves a series of systematic +optimizations, including model architecture, data construction and training +strategy, particularly addressing challenges such as \textit{degraded +performance with more images} and \textit{high computational costs}. In this +paper, we adapt the model architecture to a hybrid of Mamba and Transformer +blocks, approach data construction with both temporal and spatial dependencies +among multiple images and employ a progressive training strategy. The released +model \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge +\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first +hybrid MLLM, which achieved a better balance between efficiency and +effectiveness. LongLLaVA not only achieves competitive results across various +benchmarks, but also maintains high throughput and low memory consumption. +Especially, it could process nearly a thousand images on a single A100 80GB +GPU, showing promising application prospects for a wide range of tasks. + +
+
+ comment: 19 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ Configurable Foundation Models: Building LLMs from a Modular Perspective + + +
+ Advancements in LLMs have recently unveiled challenges tied to computational +efficiency and continual scalability due to their requirements of huge +parameters, making the applications and evolution of these models on devices +with limited computation resources and scenarios requiring various abilities +increasingly cumbersome. Inspired by modularity within the human brain, there +is a growing tendency to decompose LLMs into numerous functional modules, +allowing for inference with part of modules and dynamic assembly of modules to +tackle complex tasks, such as mixture-of-experts. To highlight the inherent +efficiency and composability of the modular approach, we coin the term brick to +represent each functional module, designating the modularized structure as +configurable foundation models. In this paper, we offer a comprehensive +overview and investigation of the construction, utilization, and limitation of +configurable foundation models. We first formalize modules into emergent bricks +- functional neuron partitions that emerge during the pre-training phase, and +customized bricks - bricks constructed via additional post-training to improve +the capabilities and knowledge of LLMs. Based on diverse functional bricks, we +further present four brick-oriented operations: retrieval and routing, merging, +updating, and growing. These operations allow for dynamic configuration of LLMs +based on instructions to handle complex tasks. To verify our perspective, we +conduct an empirical analysis on widely-used LLMs. We find that the FFN layers +follow modular patterns with functional specialization of neurons and +functional neuron partitions. Finally, we highlight several open issues and +directions for future research. Overall, this paper aims to offer a fresh +modular perspective on existing LLM research and inspire the future creation of +more efficient and scalable foundational models. + +
+
+
+
+
+ + ☆ Historical German Text Normalization Using Type- and Token-Based + Language Modeling + + +
+ Historical variations in spelling pose a challenge for full-text search or +natural language processing on historical digitized texts. To minimize the gap +between historical orthography and contemporary spelling, an +automatic orthographic normalization of the historical source material is +usually pursued. This report proposes a normalization system for German literary texts +from c. 1700-1900, trained on a parallel corpus. The proposed system makes use +of a machine learning approach using Transformer language models, combining an +encoder-decoder model to normalize individual word types, and a pre-trained +causal language model to adjust these normalizations within their context. An +extensive evaluation shows that the proposed system provides state-of-the-art +accuracy, comparable with a much larger fully end-to-end sentence-based +normalization system, fine-tuning a pre-trained Transformer large language +model. However, the normalization of historical text remains a challenge due to +the difficulty models have in generalizing, and the lack of extensive high-quality +parallel data. + 
+
+ comment: 27 pages, 3 figures +
+
+
+
+
+ + ☆ R2GQA: Retriever-Reader-Generator Question Answering System to Support + Students Understanding Legal Regulations in Higher Education + + +
+ In this article, we propose the R2GQA system, a Retriever-Reader-Generator +Question Answering system, consisting of three main components: Document +Retriever, Machine Reader, and Answer Generator. The Retriever module employs +advanced information retrieval techniques to extract the context of articles +from a dataset of legal regulation documents. The Machine Reader module +utilizes state-of-the-art natural language understanding algorithms to +comprehend the retrieved documents and extract answers. Finally, the Generator +module synthesizes the extracted answers into concise and informative responses +to questions of students regarding legal regulations. Furthermore, we built the +ViRHE4QA dataset in the domain of university training regulations, comprising +9,758 question-answer pairs with a rigorous construction process. This is the +first Vietnamese dataset in the higher regulations domain with various types of +answers, both extractive and abstractive. In addition, the R2GQA system is the +first system to offer abstractive answers in Vietnamese. This paper discusses +the design and implementation of each module within the R2GQA system on the +ViRHE4QA dataset, highlighting their functionalities and interactions. +Furthermore, we present experimental results demonstrating the effectiveness +and utility of the proposed system in supporting the comprehension of students +of legal regulations in higher education settings. In general, the R2GQA system +and the ViRHE4QA dataset promise to contribute significantly to related +research and help students navigate complex legal documents and regulations, +empowering them to make informed decisions and adhere to institutional policies +effectively. Our dataset is available for research purposes. + +
+
+
+
+
+ + ☆ Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency + Discussions by Few-Shot Learning with Large Language Models + + +
+ This study performs analysis of Predictive statements, Hope speech, and +Regret Detection behaviors within cryptocurrency-related discussions, +leveraging advanced natural language processing techniques. We introduce a +novel classification scheme named "Prediction statements," categorizing +comments into Predictive Incremental, Predictive Decremental, Predictive +Neutral, or Non-Predictive categories. Employing GPT-4o, a cutting-edge large +language model, we explore sentiment dynamics across five prominent +cryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis +reveals distinct patterns in predictive sentiments, with Matic demonstrating a +notably higher propensity for optimistic predictions. Additionally, we +investigate hope and regret sentiments, uncovering nuanced interplay between +these emotions and predictive behaviors. Despite encountering limitations +related to data volume and resource availability, our study reports valuable +discoveries concerning investor behavior and sentiment trends within the +cryptocurrency market, informing strategic decision-making and future research +endeavors. + +
+
+
+
+
+ + ☆ CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the + Mathematics Reasoning of Large Multimodal Models + + +
+ Large language models (LLMs) have obtained promising results in mathematical +reasoning, which is a foundational skill for human intelligence. Most previous +studies focus on improving and measuring the performance of LLMs based on +textual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few +researchers have released English multimodal math datasets (e.g., MATHVISTA and +MATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In +this paper, we release a Chinese multimodal math (CMM-Math) dataset, including +benchmark and training parts, to evaluate and enhance the mathematical +reasoning of LMMs. CMM-Math contains over 28,000 high-quality samples, +featuring a variety of problem types (e.g., multiple-choice, fill-in-the-blank, +and so on) with detailed solutions across 12 grade levels from elementary to +high school in China. Specifically, the visual context may be present in the +questions or opinions, which makes this dataset more challenging. Through +comprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math +dataset face challenges, emphasizing the necessity for further improvements in +LMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to +handle the problems with mixed input of multiple images and text segments. We +train our model using three stages, including foundational pre-training, +foundational fine-tuning, and mathematical fine-tuning. The extensive +experiments indicate that our model effectively improves math reasoning +performance by comparing it with the SOTA LMMs over three multimodal +mathematical datasets. + +
+
+
+
+
+ + ☆ MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding + Benchmark + + +
+ This paper introduces MMMU-Pro, a robust version of the Massive +Multi-discipline Multimodal Understanding and Reasoning (MMMU) benchmark. +MMMU-Pro rigorously assesses multimodal models' true understanding and +reasoning capabilities through a three-step process based on MMMU: (1) +filtering out questions answerable by text-only models, (2) augmenting +candidate options, and (3) introducing a vision-only input setting where +questions are embedded within images. This setting challenges AI to truly "see" +and "read" simultaneously, testing a fundamental human cognitive skill of +seamlessly integrating visual and textual information. Results show that model +performance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8% +to 26.9% across models. We explore the impact of OCR prompts and Chain of +Thought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT +generally improves performance. MMMU-Pro provides a more rigorous evaluation +tool, closely mimicking real-world scenarios and offering valuable directions +for future research in multimodal AI. + +
+
+
+
+
+ + ☆ Towards a Unified View of Preference Learning for Large Language Models: + A Survey + + +
+ Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of +the crucial factors to achieve success is aligning the LLM's output with human +preferences. This alignment process often requires only a small amount of data +to efficiently enhance the LLM's performance. While effective, research in this +area spans multiple domains, and the methods involved are relatively complex to +understand. The relationships between different methods have been +under-explored, limiting the development of the preference alignment. In light +of this, we break down the existing popular alignment strategies into different +components and provide a unified framework to study the current alignment +strategies, thereby establishing connections among them. In this survey, we +decompose all the strategies in preference learning into four components: +model, data, feedback, and algorithm. This unified view offers an in-depth +understanding of existing alignment algorithms and also opens up possibilities +to synergize the strengths of different strategies. Furthermore, we present +detailed working examples of prevalent existing algorithms to facilitate a +comprehensive understanding for the readers. Finally, based on our unified +perspective, we explore the challenges and future research directions for +aligning large language models with human preferences. + +
+
+ comment: Initial Commit, 21 pages +
+
+
+
+
+ + ☆ A Comparative Study of Pre-training and Self-training + + +
+ Pre-training and self-training are two approaches to semi-supervised +learning. The comparison between pre-training and self-training has been +explored. However, previous works have led to conflicting findings: self-training +outperforms pre-training on some tasks in computer vision, whereas +pre-training outperforms self-training on some tasks in +natural language processing, under incomparable experimental settings. +We propose, comparatively and exhaustively, an ensemble method to empirically +study all feasible training paradigms combining pre-training, self-training, +and fine-tuning within consistent foundational settings comparable to data +augmentation. We conduct experiments on six datasets, four data augmentation techniques, +and imbalanced data for sentiment analysis and natural language inference +tasks. Our findings confirm that the pre-training and fine-tuning paradigm +yields the best overall performance. Moreover, self-training offers no +additional benefits when combined with semi-supervised pre-training. + 
+
+ comment: 19 pages, 2 figures, 9 tables +
+
+
+
+
+ + ☆ Pooling And Attention: What Are Effective Designs For LLm-Based + Embedding Models? + + +
+ The significant advancements of Large Language Models (LLMs) in generative +tasks have led to a growing body of work exploring LLM-based embedding models. +While these models, employing different pooling and attention strategies, have +achieved state-of-the-art performance on public embedding benchmarks, questions +still arise about what constitutes an effective design for LLM-based embedding +models. However, these models are often trained on different datasets, using +different LLM base models or training settings. Moreover, evaluations on public +embedding benchmarks often fail to report statistical significance, making it +difficult to determine which designs truly contribute to final performance. +This complicates the process for practitioners seeking optimal training recipes +for LLM-based embedding models. In this study, we conduct a large-scale +experiment by training a series of LLM-based embedding models using the same +training data and base model but differing in their pooling and attention +strategies. The results show that there is no one-size-fits-all solution: while +bidirectional attention and an additional trainable pooling layer outperform in +text similarity and information retrieval tasks, they do not significantly +surpass simpler designs like EOS-last token pooling and default causal +attention in clustering and classification tasks. Furthermore, we propose a new +pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs +of all hidden layers, rather than just the last layer, using a cross-attention +network. This method proves to be statistically superior in text similarity and +retrieval tasks compared to existing pooling methods. Overall, this paper sheds +light on effective training strategies for LLM-based embedding models. + +
+
+ comment: https://github.com/yixuantt/PoolingAndAttn +
+
+
+
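<p> A minimal sketch, not from the paper, of the two simplest pooling choices compared above: last-token (EOS-style) pooling versus mean pooling over the final hidden states. The array shapes and toy inputs are illustrative assumptions. </p>
<pre><code>
import numpy as np

def last_token_pool(hidden, mask):
    # hidden: (batch, seq_len, dim); mask: (batch, seq_len) with 1 for real tokens
    lengths = mask.sum(axis=1).astype(int)          # number of real tokens per sequence
    idx = lengths - 1                               # position of the final (EOS-like) token
    return hidden[np.arange(hidden.shape[0]), idx]  # (batch, dim)

def mean_pool(hidden, mask):
    # average only over non-padding positions
    mask = mask[..., None]                          # (batch, seq_len, 1)
    summed = (hidden * mask).sum(axis=1)
    counts = mask.sum(axis=1)
    return summed / counts                          # (batch, dim)

# toy example: batch of 2 sequences, lengths 3 and 2, hidden size 4
rng = np.random.default_rng(0)
hidden = rng.normal(size=(2, 3, 4))
mask = np.array([[1, 1, 1], [1, 1, 0]], dtype=float)
print(last_token_pool(hidden, mask).shape)  # (2, 4)
print(mean_pool(hidden, mask).shape)        # (2, 4)
</code></pre>
<p> The Multi-Layers Trainable Pooling proposed in the paper additionally attends over all hidden layers with a trainable cross-attention module; that variant would sit on top of representations like these. </p>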
+
+ + ☆ Pre-training data selection for biomedical domain adaptation using + journal impact metrics + + +
+ Domain adaptation is a widely used method in natural language processing +(NLP) to improve the performance of a language model within a specific domain. +This method is particularly common in the biomedical domain, which sees regular +publication of numerous scientific articles. PubMed, a significant corpus of +text, is frequently used in the biomedical domain. The primary objective of +this study is to explore whether refining a pre-training dataset using specific +quality metrics for scientific papers can enhance the performance of the +resulting model. To accomplish this, we employ two straightforward journal +impact metrics and conduct experiments by continually pre-training BERT on +various subsets of the complete PubMed training set, and then evaluate the +resulting models on biomedical language understanding tasks from the BLURB +benchmark. Our results show that pruning using journal impact metrics is not +efficient. However, we also show that pre-training using fewer abstracts (but with +the same number of training steps) does not necessarily decrease the resulting +model's performance. + 
+
+
+
+
+ + ☆ Alignment-Aware Model Extraction Attacks on Large Language Models + + +
+ Model extraction attacks (MEAs) on large language models (LLMs) have received +increasing research attention lately. Existing attack methods on LLMs inherit +the extraction strategies from those designed for deep neural networks (DNNs) +yet neglect the inconsistency of training tasks between MEA and LLMs' +alignments. As such, they result in poor attack performances. To tackle this +issue, we present Locality Reinforced Distillation (LoRD), a novel model +extraction attack algorithm specifically for LLMs. In particular, we design a +policy-gradient-style training task, which utilizes victim models' responses as +a signal to guide the crafting of preference for the local model. Theoretical +analysis has shown that i) LoRD's convergence procedure in MEAs is consistent +with the alignments of LLMs, and ii) LoRD can reduce query complexity while +mitigating watermark protection through exploration-based stealing. Extensive +experiments on domain-specific extractions demonstrate the superiority of our +method by examining the extraction of various state-of-the-art commercial LLMs. + +
+
+ comment: Source code: https://github.com/liangzid/alignmentExtraction +
+
+
+
+
+ + ☆ A Data Selection Approach for Enhancing Low Resource Machine Translation + Using Cross-Lingual Sentence Representations + + +
+ Machine translation in low-resource language pairs faces significant +challenges due to the scarcity of parallel corpora and linguistic resources. +This study focuses on the case of English-Marathi language pairs, where +existing datasets are notably noisy, impeding the performance of machine +translation models. To mitigate the impact of data quality issues, we propose a +data filtering approach based on cross-lingual sentence representations. Our +methodology leverages a multilingual SBERT model to filter out problematic +translations in the training data. Specifically, we employ an IndicSBERT +similarity model to assess the semantic equivalence between original and +translated sentences, allowing us to retain linguistically correct translations +while discarding instances with substantial deviations. The results demonstrate +a significant improvement in translation quality over the baseline +post-filtering with IndicSBERT. This illustrates how cross-lingual sentence +representations can reduce errors in machine translation scenarios with limited +resources. By integrating multilingual sentence BERT models into the +translation pipeline, this research contributes to advancing machine +translation techniques in low-resource environments. The proposed method not +only addresses the challenges in English-Marathi language pairs but also +provides a valuable framework for enhancing translation quality in other +low-resource language translation tasks. + +
+
+ comment: Accepted at I2CT 2024 +
+
+
+
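<p> A hedged sketch of the filtering idea described above: score each source/target pair with a multilingual sentence encoder and keep only high-similarity pairs. The checkpoint name and the 0.7 threshold are placeholders, not the paper's exact settings. </p>
<pre><code>
from sentence_transformers import SentenceTransformer, util

# Placeholder checkpoint: the paper uses an IndicSBERT-style model;
# substitute the multilingual checkpoint you actually have access to.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

def filter_parallel_corpus(src_sentences, tgt_sentences, threshold=0.7):
    """Keep only sentence pairs whose cross-lingual cosine similarity is high enough."""
    src_emb = model.encode(src_sentences, convert_to_tensor=True)
    tgt_emb = model.encode(tgt_sentences, convert_to_tensor=True)
    kept = []
    for i in range(len(src_sentences)):
        score = util.cos_sim(src_emb[i], tgt_emb[i]).item()
        if score >= threshold:   # discard pairs with substantial semantic deviation
            kept.append((src_sentences[i], tgt_sentences[i], score))
    return kept

# toy English-Marathi pair; a real pipeline would stream the full training corpus
pairs = filter_parallel_corpus(
    ["The weather is nice today."],
    ["आज हवामान छान आहे."],
)
print(len(pairs))
</code></pre>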
+
+ + ☆ Detecting Calls to Action in Multimodal Content: Analysis of the 2021 + German Federal Election Campaign on Instagram + + +
+ This study investigates the automated classification of Calls to Action +(CTAs) within the 2021 German Instagram election campaign to advance the +understanding of mobilization in social media contexts. We analyzed over 2,208 +Instagram stories and 712 posts using fine-tuned BERT models and OpenAI's GPT-4 +models. The fine-tuned BERT model incorporating synthetic training data +achieved a macro F1 score of 0.93, demonstrating a robust classification +performance. Our analysis revealed that 49.58% of Instagram posts and 10.64% of +stories contained CTAs, highlighting significant differences in mobilization +strategies between these content types. Additionally, we found that FDP and the +Greens had the highest prevalence of CTAs in posts, whereas CDU and CSU led in +story CTAs. + +
+
+ comment: Accepted Archival Paper for the CPSS Workshop at KONVENS 2024. Camera + Ready Submission +
+
+
+
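<p> For readers unfamiliar with the reported metric, a tiny example of computing macro F1 for a binary call-to-action classifier with scikit-learn; the labels are invented. </p>
<pre><code>
from sklearn.metrics import f1_score, classification_report

# Toy gold and predicted labels (1 = contains a CTA, 0 = no CTA);
# real data would come from annotated posts and stories.
y_true = [1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
y_pred = [1, 0, 0, 1, 0, 0, 1, 0, 1, 0]

# Macro F1 averages the per-class F1 scores, so the minority class counts equally.
print("macro F1:", f1_score(y_true, y_pred, average="macro"))
print(classification_report(y_true, y_pred, digits=3))
</code></pre>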
+
+ + ☆ Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for + Problem-Solving Improvement of LLMs + + +
+ Large Language Models (LLMs) have demonstrated remarkable efficiency in +tackling various tasks based on human instructions, but recent studies reveal +that these models often fail to achieve satisfactory results on questions +involving reasoning, such as mathematics or physics questions. This phenomenon +is usually attributed to the uncertainty regarding whether these models could +genuinely comprehend the knowledge embedded in the text or merely learn to +replicate the token distribution without a true understanding of the content. +In this paper, we delve into this problem and aim to enhance the reasoning +capabilities of LLMs. First, we investigate if the model has genuine reasoning +capabilities by visualizing the text generation process at the attention and +representation level. Then, we formulate the reasoning process of LLMs into a +causal framework, which provides a formal explanation of the problems we +observe in the visualization. Finally, building upon this causal framework, we +propose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient +fine-tuning (PEFT) method to enhance the model's reasoning capabilities by +encouraging the model to extract the general problem-solving skills and apply +these skills to different questions. Experiments show that our method +outperforms the baseline consistently across multiple benchmarks, and with only +1.2M tunable parameters, we achieve better or comparable results to other +fine-tuning methods. This demonstrates the effectiveness and efficiency of our +method in improving the overall accuracy and reliability of LLMs. + +
+
+
+
+
+ + ☆ Creating Domain-Specific Translation Memories for Machine Translation + Fine-tuning: The TRENCARD Bilingual Cardiology Corpus + + +
+ This article investigates how translation memories (TM) can be created by +translators or other language professionals in order to compile domain-specific +parallel corpora, which can then be used in different scenarios, such as +machine translation training and fine-tuning, TM leveraging, and/or large +language model fine-tuning. The article introduces a semi-automatic TM +preparation methodology that primarily leverages the translation tools used by +translators, favoring data quality and translator control. This +semi-automatic methodology is then used to build a cardiology-based Turkish -> +English corpus from bilingual abstracts of Turkish cardiology journals. The +resulting corpus, called the TRENCARD Corpus, has approximately 800,000 source words +and 50,000 sentences. Using this methodology, translators can build their +custom TMs in a reasonable time and use them in tasks requiring bilingual +data. + 
+
+
+
+
+ + ☆ OpenFact at CheckThat! 2024: Combining Multiple Attack Methods for + Effective Adversarial Text Generation + + +
+ This paper presents the experiments and results for the CheckThat! Lab at +CLEF 2024 Task 6: Robustness of Credibility Assessment with Adversarial +Examples (InCrediblAE). The primary objective of this task was to generate +adversarial examples in five problem domains in order to evaluate the +robustness of widely used text classification methods (fine-tuned BERT, BiLSTM, +and RoBERTa) when applied to credibility assessment issues. + This study explores the application of ensemble learning to enhance +adversarial attacks on natural language processing (NLP) models. We +systematically tested and refined several adversarial attack methods, including +BERT-Attack, Genetic algorithms, TextFooler, and CLARE, on five datasets across +various misinformation tasks. By developing modified versions of BERT-Attack +and hybrid methods, we achieved significant improvements in attack +effectiveness. Our results demonstrate the potential of modification and +combining multiple methods to create more sophisticated and effective +adversarial attack strategies, contributing to the development of more robust +and secure systems. + +
+
+ comment: CLEF 2024 - Conference and Labs of the Evaluation Forum +
+
+
+
+
+ + ☆ A Survey on Emergent Language + + +
+ The field of emergent language represents a novel area of research within the +domain of artificial intelligence, particularly within the context of +multi-agent reinforcement learning. Although the concept of studying language +emergence is not new, early approaches were primarily concerned with explaining +human language formation, with little consideration given to its potential +utility for artificial agents. In contrast, studies based on reinforcement +learning aim to develop communicative capabilities in agents that are +comparable to or even superior to human language. Thus, they extend beyond the +learned statistical representations that are common in natural language +processing research. This gives rise to a number of fundamental questions, from +the prerequisites for language emergence to the criteria for measuring its +success. This paper addresses these questions by providing a comprehensive +review of 181 scientific publications on emergent language in artificial +intelligence. Its objective is to serve as a reference for researchers +interested in or proficient in the field. Consequently, the main contributions +are the definition and overview of the prevailing terminology, the analysis of +existing evaluation methods and metrics, and the description of the identified +research gaps. + +
+
+
+
+
+ + ☆ PUB: Plot Understanding Benchmark and Dataset for Evaluating Large + Language Models on Synthetic Visual Data Interpretation + + +
+ The ability of large language models (LLMs) to interpret visual +representations of data is crucial for advancing their application in data +analysis and decision-making processes. This paper presents a novel synthetic +dataset designed to evaluate the proficiency of LLMs in interpreting various +forms of data visualizations, including plots like time series, histograms, +violins, boxplots, and clusters. Our dataset is generated using controlled +parameters to ensure comprehensive coverage of potential real-world scenarios. +We employ multimodal text prompts with questions related to visual data in +images to benchmark several state-of-the-art models like ChatGPT or Gemini, +assessing their understanding and interpretative accuracy. + To ensure data integrity, our benchmark dataset is generated automatically, +making it entirely new and free from prior exposure to the models being tested. +This strategy allows us to evaluate the models' ability to truly interpret and +understand the data, eliminating possibility of pre-learned responses, and +allowing for an unbiased evaluation of the models' capabilities. We also +introduce quantitative metrics to assess the performance of the models, +providing a robust and comprehensive evaluation tool. + Benchmarking several state-of-the-art LLMs with this dataset reveals varying +degrees of success, highlighting specific strengths and weaknesses in +interpreting diverse types of visual data. The results provide valuable +insights into the current capabilities of LLMs and identify key areas for +improvement. This work establishes a foundational benchmark for future research +and development aimed at enhancing the visual interpretative abilities of +language models. In the future, improved LLMs with robust visual interpretation +skills can significantly aid in automated data analysis, scientific research, +educational tools, and business intelligence applications. + +
+
+
+
+
+ + ☆ An Analysis of Linear Complexity Attention Substitutes with BEST-RQ + + +
+ Self-Supervised Learning (SSL) has proven to be effective in various domains, +including speech processing. However, SSL is expensive in both computation and +memory. This is in part due to the quadratic complexity of multi-head +self-attention (MHSA). Alternatives for MHSA have been proposed and used in the +speech domain, but have yet to be investigated properly in an SSL setting. In +this work, we study the effects of replacing MHSA with recent state-of-the-art +alternatives that have linear complexity, namely, HyperMixing, Fastformer, +SummaryMixing, and Mamba. We evaluate these methods by looking at the speed, +the amount of VRAM consumed, and the performance on the SSL MP3S benchmark. +Results show that these linear alternatives maintain competitive performance +compared to MHSA while, on average, decreasing VRAM consumption by around 20% +to 60% and increasing speed by 7% to 65% for input sequences ranging from 20 +to 80 seconds. + 
+
+ comment: Accepted at the IEEE Spoken Language Technology Workshop 2024 +
+
+
+
+
+ + ☆ More is More: Addition Bias in Large Language Models + + +
+ In this paper, we investigate the presence of additive bias in Large Language +Models (LLMs), drawing a parallel to the cognitive bias observed in humans +where individuals tend to favor additive over subtractive changes. Using a +series of controlled experiments, we tested various LLMs, including GPT-3.5 +Turbo, Claude 3.5 Sonnet, Mistral, Math$\Sigma$tral, and Llama 3.1, on tasks +designed to measure their propensity for additive versus subtractive +modifications. Our findings demonstrate a significant preference for additive +changes across all tested models. For example, in a palindrome creation task, +Llama 3.1 favored adding letters 97.85% of the time over removing them. +Similarly, in a Lego tower balancing task, GPT-3.5 Turbo chose to add a brick +76.38% of the time rather than remove one. In a text summarization task, +Mistral 7B produced longer summaries in 59.40% to 75.10% of cases when asked to +improve its own or others' writing. These results indicate that, similar to +humans, LLMs exhibit a marked additive bias, which might have implications when +LLMs are used on a large scale. Additive bias might increase resource use and +environmental impact, leading to higher economic costs due to overconsumption +and waste. This bias should be considered in the development and application of +LLMs to ensure balanced and efficient problem-solving approaches. + 
+
+ comment: 25 pages, 8 figures +
+
+
+
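<p> A rough sketch of how additive versus subtractive edits could be tallied in a task like the palindrome experiment above; the length-based classification rule and the example pairs are assumptions, not the paper's exact protocol. </p>
<pre><code>
def classify_edit(original: str, revised: str) -> str:
    """Label a revision as additive, subtractive, or neither based on length change."""
    if len(revised) > len(original):
        return "additive"
    if len(original) > len(revised):
        return "subtractive"
    return "same-length"

def additive_rate(pairs):
    """Share of additive edits among all edits that changed the length."""
    labels = [classify_edit(o, r) for o, r in pairs]
    additive = labels.count("additive")
    decided = additive + labels.count("subtractive")
    return additive / decided if decided else 0.0

# toy model outputs for a "turn this word into a palindrome" style task
pairs = [("cat", "catac"), ("level", "level"), ("solos", "solo"), ("rac", "racar")]
print(f"additive rate: {additive_rate(pairs):.2f}")   # 2 of 3 decided edits add letters
</code></pre>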
+
+ + ☆ Language is Scary when Over-Analyzed: Unpacking Implied Misogynistic + Reasoning with Argumentation Theory-Driven Prompts + + +
+ We propose misogyny detection as an Argumentative Reasoning task and we +investigate the capacity of large language models (LLMs) to understand the +implicit reasoning used to convey misogyny in both Italian and English. The +central aim is to generate the missing reasoning link between a message and the +implied meanings encoding the misogyny. Our study uses argumentation theory as +a foundation to form a collection of prompts in both zero-shot and few-shot +settings. These prompts integrate different techniques, including +chain-of-thought reasoning and augmented knowledge. Our findings show that LLMs +fall short on reasoning capabilities about misogynistic comments and that they +mostly rely on their implicit knowledge derived from internalized common +stereotypes about women to generate implied assumptions, rather than on +inductive reasoning. + +
+
+
+
+
+ + ☆ Word and Phrase Features in Graph Convolutional Network for Automatic + Question Classification + + +
+ Effective question classification is crucial for AI-driven educational tools, +enabling adaptive learning systems to categorize questions by skill area, +difficulty level, and competence. This classification not only supports +educational diagnostics and analytics but also enhances complex tasks like +information retrieval and question answering by associating questions with +relevant categories. Traditional methods, often based on word embeddings and +conventional classifiers, struggle to capture the nuanced relationships in +natural language, leading to suboptimal performance. To address this, we +propose a novel approach leveraging graph convolutional networks (GCNs), named +Phrase Question-Graph Convolutional Network (PQ-GCN) to better model the +inherent structure of questions. By representing questions as graphs -- where +nodes signify words or phrases and edges denote syntactic or semantic +relationships -- our method allows GCNs to learn from the interconnected nature +of language more effectively. Additionally, we explore the incorporation of +phrase-based features to enhance classification accuracy, especially in +low-resource settings. Our findings demonstrate that GCNs, augmented with these +features, offer a promising solution for more accurate and context-aware +question classification, bridging the gap between graph neural network research +and practical educational applications. + +
+
+
+
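<p> A minimal numpy sketch of a single graph-convolution layer of the kind PQ-GCN builds on, using the standard normalized-adjacency propagation rule; the tiny word/phrase graph and the feature sizes are made up for illustration. </p>
<pre><code>
import numpy as np

def gcn_layer(A, H, W):
    """One GCN propagation step: ReLU(D^-1/2 (A + I) D^-1/2 H W)."""
    A_hat = A + np.eye(A.shape[0])              # add self-loops
    d = A_hat.sum(axis=1)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(d))
    A_norm = D_inv_sqrt @ A_hat @ D_inv_sqrt    # symmetric normalization
    return np.maximum(A_norm @ H @ W, 0.0)      # ReLU activation

# Toy graph: 4 nodes (words/phrases of one question) linked by syntactic/semantic edges.
A = np.array([[0, 1, 0, 1],
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 0, 1, 0]], dtype=float)
H = np.random.default_rng(0).normal(size=(4, 8))   # initial node features
W = np.random.default_rng(1).normal(size=(8, 3))   # 3 illustrative question classes
question_vec = gcn_layer(A, H, W).mean(axis=0)     # pool node states into one question vector
print(question_vec)
</code></pre>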
+
+ + ☆ A Comparative Study on Large Language Models for Log Parsing + + +
+ Background: Log messages provide valuable information about the status of +software systems. This information is provided in an unstructured fashion and +automated approaches are applied to extract relevant parameters. To ease this +process, log parsing can be applied, which transforms log messages into +structured log templates. Recent advances in language models have led to +several studies that apply ChatGPT to the task of log parsing with promising +results. However, the performance of other state-of-the-art large language +models (LLMs) on the log parsing task remains unclear. + Aims: In this study, we investigate the current capability of +state-of-the-art LLMs to perform log parsing. + Method: We select six recent LLMs, including two paid proprietary models (GPT-3.5, +Claude 2.1) and four free-to-use open models, and compare their performance on +system logs obtained from a selection of mature open-source projects. We design +two different prompting approaches and apply the LLMs to 1,354 log templates +across 16 different projects. We evaluate their effectiveness in terms of the number of +correctly identified templates and the syntactic similarity between the +generated templates and the ground truth. + Results: We found that free-to-use models are able to compete with paid +models, with CodeLlama extracting 10% more log templates correctly than +GPT-3.5. Moreover, we provide qualitative insights into the usability of +language models (e.g., how easy it is to use their responses). + Conclusions: Our results reveal that some of the smaller, free-to-use LLMs +can considerably assist log parsing compared to their paid proprietary +competitors, especially code-specialized models. + 
+
+ comment: Accepted for publication in the 18th ACM/IEEE International Symposium + on Empirical Software Engineering and Measurement (ESEM '24) +
+
+
+
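<p> A small sketch of the two evaluation views mentioned above, exact template matches and a simple syntactic-similarity score; the gold and predicted templates are invented, and the "[*]" marker is just a stand-in for a variable slot. </p>
<pre><code>
from difflib import SequenceMatcher

def template_accuracy(predicted, gold):
    """Fraction of log templates that exactly match the ground truth."""
    hits = sum(1 for p, g in zip(predicted, gold) if p == g)
    return hits / len(gold)

def mean_similarity(predicted, gold):
    """Average character-level similarity between predicted and gold templates."""
    return sum(SequenceMatcher(None, p, g).ratio() for p, g in zip(predicted, gold)) / len(gold)

gold = ["Connection from [*] closed", "User [*] logged in", "Failed to open file [*]"]
pred = ["Connection from [*] closed", "User [*] logged in", "Failed to open [*] [*]"]

print("exact template accuracy:", template_accuracy(pred, gold))   # 2 of 3
print("mean syntactic similarity:", round(mean_similarity(pred, gold), 3))
</code></pre>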
+
+ + ☆ DetectiveQA: Evaluating Long-Context Reasoning on Detective Novels + + +
+ With the rapid advancement of Large Language Models (LLMs), long-context +information understanding and processing have become a hot topic in academia +and industry. However, benchmarks for evaluating the ability of LLMs to handle +long-context information do not seem to have kept pace with the development of +LLMs. Despite the emergence of various long-context evaluation benchmarks, the +types of capability assessed are still limited, without new capability +dimensions. In this paper, we introduce DetectiveQA, a narrative reasoning +benchmark featuring an average context length of over 100K tokens. +DetectiveQA focuses on evaluating the long-context reasoning ability of LLMs, +which not only requires a full understanding of context but also requires +extracting important evidence from the context and reasoning over the +extracted evidence to answer the given questions. This is a new dimension of +capability evaluation, which is more in line with the current intelligence +level of LLMs. We use detective novels as data sources, which naturally have +various reasoning elements. Finally, we manually annotated 600 questions in +Chinese and then also provided an English edition of the context information +and questions. We evaluate many long-context LLMs on DetectiveQA, including +commercial and open-source models, and the results indicate that existing +long-context LLMs still require significant advancements to effectively process +true long-context dependency questions. + 
+
+
+
+
+ + ☆ What is lost in Normalization? Exploring Pitfalls in Multilingual ASR + Model Evaluations EMNLP 2024 + + +
+ This paper explores the pitfalls in evaluating multilingual automatic speech +recognition (ASR) models, with a particular focus on Indic language scripts. We +investigate the text normalization routine employed by leading ASR models, +including OpenAI Whisper, Meta's MMS, Seamless, and Assembly AI's Conformer, +and their unintended consequences on performance metrics. Our research reveals +that current text normalization practices, while aiming to standardize ASR +outputs for fair comparison, by removing inconsistencies such as variations in +spelling, punctuation, and special characters, are fundamentally flawed when +applied to Indic scripts. Through empirical analysis using text similarity +scores and in-depth linguistic examination, we demonstrate that these flaws +lead to artificially inflated performance metrics for Indic languages. We +conclude by proposing a shift towards developing normalization routines that +leverage native linguistic expertise, ensuring more robust and accurate +evaluations of multilingual ASR models. + +
+
+ comment: Submitted to EMNLP 2024 +
+
+
+
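<p> An illustrative sketch, not any model's actual normalizer, of how an English-centric "remove special characters" step can erase Indic-script text entirely and make downstream error metrics meaningless; the Hindi strings and the regex are assumptions. </p>
<pre><code>
import re

def naive_normalize(text: str) -> str:
    """English-centric cleanup: lowercase and drop anything outside [a-z0-9 ]."""
    text = text.lower()
    return re.sub(r"[^a-z0-9 ]+", " ", text).strip()

reference = "मौसम आज अच्छा है"        # Hindi reference transcript
hypothesis = "मौसम आज अच्छा हैं"      # one extra nasalization mark, an ASR-style error

# After naive normalization both strings collapse to empty text, so any
# downstream WER/CER computation sees a perfect (and meaningless) match.
print(repr(naive_normalize(reference)))   # ''
print(repr(naive_normalize(hypothesis)))  # ''
</code></pre>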
+
+ + ☆ Large Language Models as Efficient Reward Function Searchers for + Custom-Environment Multi-Objective Reinforcement Learning + + +
+ Leveraging large language models (LLMs) for designing reward functions +demonstrates significant potential. However, achieving effective design and +improvement of reward functions in reinforcement learning (RL) tasks with +complex custom environments and multiple requirements presents considerable +challenges. In this paper, we enable LLMs to be effective white-box searchers, +highlighting their advanced semantic understanding capabilities. Specifically, +we generate reward components for each explicit user requirement and employ the +reward critic to identify the correct code form. Then, LLMs assign weights to +the reward components to balance their values and iteratively search and +optimize these weights based on the context provided by the training log +analyzer, while adaptively determining the search step size. We applied the +framework to an underwater information collection RL task without direct human +feedback or reward examples (zero-shot). The reward critic successfully corrects +the reward code with only one round of feedback per requirement, effectively +preventing irreparable errors that can occur when reward function feedback is +provided in aggregate. The effective initialization of weights enables the +acquisition of different reward functions within the Pareto solution set +without weight search. Even in the case where a weight is 100 times off, fewer +than four iterations are needed to obtain solutions that meet user +requirements. The framework also works well with most prompts utilizing GPT-3.5 +Turbo, since it does not require advanced numerical understanding or +calculation. + 
+
+
+
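<p> A hedged sketch of the weighted-sum structure implied above, with one reward component per user requirement and weights that an LLM searcher would iteratively re-propose; the component functions, state/action dictionaries, and weight values are placeholders. </p>
<pre><code>
def make_total_reward(components, weights):
    """Combine per-requirement reward components into a single scalar reward."""
    def total_reward(state, action):
        return sum(w * f(state, action) for f, w in zip(components, weights))
    return total_reward

# Placeholder components for an underwater data-collection style task.
def coverage_reward(state, action):
    return state.get("new_cells_visited", 0)

def energy_penalty(state, action):
    return -abs(action.get("thrust", 0.0))

def collision_penalty(state, action):
    return -10.0 if state.get("collision", False) else 0.0

components = [coverage_reward, energy_penalty, collision_penalty]
weights = [1.0, 0.2, 1.0]      # an LLM searcher would iteratively re-propose these values
reward_fn = make_total_reward(components, weights)

print(reward_fn({"new_cells_visited": 3, "collision": False}, {"thrust": 0.5}))  # 2.9
</code></pre>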
+
+ + ☆ Abstractive Text Summarization: State of the Art, Challenges, and + Improvements + + +
+ Specifically focusing on the landscape of abstractive text summarization, as +opposed to extractive techniques, this survey presents a comprehensive +overview, delving into state-of-the-art techniques, prevailing challenges, and +prospective research directions. We categorize the techniques into traditional +sequence-to-sequence models, pre-trained large language models, reinforcement +learning, hierarchical methods, and multi-modal summarization. Unlike prior +works that did not examine complexities, scalability and comparisons of +techniques in detail, this review takes a comprehensive approach encompassing +state-of-the-art methods, challenges, solutions, comparisons, limitations and +charts out future improvements - providing researchers an extensive overview to +advance abstractive summarization research. We provide vital comparison tables +across techniques categorized - offering insights into model complexity, +scalability and appropriate applications. The paper highlights challenges such +as inadequate meaning representation, factual consistency, controllable text +summarization, cross-lingual summarization, and evaluation metrics, among +others. Solutions leveraging knowledge incorporation and other innovative +strategies are proposed to address these challenges. The paper concludes by +highlighting emerging research areas like factual inconsistency, +domain-specific, cross-lingual, multilingual, and long-document summarization, +as well as handling noisy data. Our objective is to provide researchers and +practitioners with a structured overview of the domain, enabling them to better +understand the current landscape and identify potential areas for further +research and improvement. + +
+
+ comment: 9 Tables, 7 Figures +
+
+
+
+
+ + ☆ Determination of language families using deep learning + + +
+ We use a c-GAN (convolutional generative adversarial network) to analyze
+transliterated text fragments of extant languages, dead but comprehensible
+languages, and one dead undeciphered language (Cypro-Minoan) in order to
+establish linguistic affinities. The paper is agnostic with respect to
+translation and/or decipherment. However, there is hope that the proposed
+approach can be useful for decipherment with more sophisticated neural network
+techniques.
+
+
+ comment: First draft. Comments are welcome +
+
+
+
+
+ + ☆ Large Language Models and Cognitive Science: A Comprehensive Review of + Similarities, Differences, and Challenges + + +
+ This comprehensive review explores the intersection of Large Language Models +(LLMs) and cognitive science, examining similarities and differences between +LLMs and human cognitive processes. We analyze methods for evaluating LLMs +cognitive abilities and discuss their potential as cognitive models. The review +covers applications of LLMs in various cognitive fields, highlighting insights +gained for cognitive science research. We assess cognitive biases and +limitations of LLMs, along with proposed methods for improving their +performance. The integration of LLMs with cognitive architectures is examined, +revealing promising avenues for enhancing artificial intelligence (AI) +capabilities. Key challenges and future research directions are identified, +emphasizing the need for continued refinement of LLMs to better align with +human cognition. This review provides a balanced perspective on the current +state and future potential of LLMs in advancing our understanding of both +artificial and human intelligence. + +
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ☆ STAB: Speech Tokenizer Assessment Benchmark + + +
+ Representing speech as discrete tokens provides a framework for transforming +speech into a format that closely resembles text, thus enabling the use of +speech as an input to the widely successful large language models (LLMs). +Currently, while several speech tokenizers have been proposed, there is +ambiguity regarding the properties that are desired from a tokenizer for +specific downstream tasks and its overall generalizability. Evaluating the +performance of tokenizers across different downstream tasks is a +computationally intensive effort that poses challenges for scalability. To +circumvent this requirement, we present STAB (Speech Tokenizer Assessment +Benchmark), a systematic evaluation framework designed to assess speech +tokenizers comprehensively and shed light on their inherent characteristics. +This framework provides a deeper understanding of the underlying mechanisms of +speech tokenization, thereby offering a valuable resource for expediting the +advancement of future tokenizer models and enabling comparative analysis using +a standardized benchmark. We evaluate the STAB metrics and correlate this with +downstream task performance across a range of speech tasks and tokenizer +choices. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ How Privacy-Savvy Are Large Language Models? A Case Study on Compliance + and Privacy Technical Review + + +
+ The recent advances in large language models (LLMs) have significantly +expanded their applications across various fields such as language generation, +summarization, and complex question answering. However, their application to +privacy compliance and technical privacy reviews remains under-explored, +raising critical concerns about their ability to adhere to global privacy +standards and protect sensitive user data. This paper seeks to address this gap +by providing a comprehensive case study evaluating LLMs' performance in +privacy-related tasks such as privacy information extraction (PIE), legal and +regulatory key point detection (KPD), and question answering (QA) with respect +to privacy policies and data protection regulations. We introduce a Privacy +Technical Review (PTR) framework, highlighting its role in mitigating privacy +risks during the software development life-cycle. Through an empirical +assessment, we investigate the capacity of several prominent LLMs, including +BERT, GPT-3.5, GPT-4, and custom models, in executing privacy compliance checks +and technical privacy reviews. Our experiments benchmark the models across +multiple dimensions, focusing on their precision, recall, and F1-scores in +extracting privacy-sensitive information and detecting key regulatory +compliance points. While LLMs show promise in automating privacy reviews and +identifying regulatory discrepancies, significant gaps persist in their ability +to fully comply with evolving legal standards. We provide actionable +recommendations for enhancing LLMs' capabilities in privacy compliance, +emphasizing the need for robust model improvements and better integration with +legal and regulatory requirements. This study underscores the growing +importance of developing privacy-aware LLMs that can both support businesses in +compliance efforts and safeguard user privacy rights. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Do Large Language Models Possess Sensitive to Sentiment? + + +
+ Large Language Models (LLMs) have recently displayed their extraordinary
+capabilities in language understanding. However, how to comprehensively assess
+the sentiment capabilities of LLMs continues to be a challenge. This paper
+investigates the ability of LLMs to detect and react to sentiment in the text
+modality. As the integration of LLMs into diverse applications is on the rise,
+it becomes highly critical to comprehend their sensitivity to emotional tone,
+as it can influence the user experience and the efficacy of sentiment-driven
+tasks. We conduct a series of experiments to evaluate the performance of
+several prominent LLMs in identifying and responding appropriately to
+sentiments like positive, negative, and neutral emotions. The models' outputs
+are analyzed across various sentiment benchmarks, and their responses are
+compared with human evaluations. Our findings indicate that although LLMs show
+a basic sensitivity to sentiment, there are substantial variations in their
+accuracy and consistency, emphasizing the need for further enhancements in
+their training processes to better capture subtle emotional cues. For example,
+in some cases the models might wrongly classify a strongly positive sentiment
+as neutral, or fail to recognize sarcasm or irony in the text. Such
+misclassifications highlight the complexity of sentiment analysis and the areas
+where the models need to be refined. Moreover, different LLMs might perform
+differently on the same set of data, depending on their architecture and
+training datasets. This variance calls for a more in-depth study of the factors
+that contribute to the performance differences and how they can be optimized.
+
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ Diversify-verify-adapt: Efficient and Robust Retrieval-Augmented + Ambiguous Question Answering + + +
+ The retrieval augmented generation (RAG) framework addresses ambiguity in
+user queries in QA systems by retrieving passages that cover all plausible
+interpretations and generating comprehensive responses based on the passages.
+However, our preliminary studies reveal that a single retrieval process often
+suffers from low-quality results, as the retrieved passages frequently fail to
+capture all plausible interpretations. Although the iterative RAG approach has
+been proposed to address this problem, it comes at the cost of significantly
+reduced efficiency. To address these issues, we propose the
+diversify-verify-adapt (DIVA) framework. DIVA first diversifies the retrieved
+passages to encompass diverse interpretations. Subsequently, DIVA verifies the
+quality of the passages and adapts the most suitable approach tailored to their
+quality. This approach improves QA systems' accuracy and robustness by handling
+the low-quality retrieval issue in ambiguous questions, while enhancing
+efficiency.
+
+
+
+
+
+ + ☆ NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for + Retrieval + + +
+ $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval) +from pre-trained embedding models is the predominant retrieval method for text +and images, as well as Retrieval-Augmented Generation (RAG) pipelines. In +practice, application developers often fine-tune the embeddings to improve +their accuracy on the dataset and query workload in hand. Existing approaches +either fine-tune the pre-trained model itself or, more efficiently, but at the +cost of accuracy, train adaptor models to transform the output of the +pre-trained model. We present NUDGE, a family of novel non-parametric embedding +fine-tuning approaches that are significantly more accurate and efficient than +both sets of existing approaches. NUDGE directly modifies the embeddings of +data records to maximize the accuracy of $k$-NN retrieval. We present a +thorough theoretical and experimental study of NUDGE's non-parametric approach. +We show that even though the underlying problem is NP-Hard, constrained +variations can be solved efficiently. These constraints additionally ensure +that the changes to the embeddings are modest, avoiding large distortions to +the semantics learned during pre-training. In experiments across five +pre-trained models and nine standard text and image retrieval datasets, NUDGE +runs in minutes and often improves NDCG@10 by more than 10% over existing +fine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase +in accuracy and runs 200x and 3x faster, respectively, over fine-tuning the +pre-trained model and training adaptors. + +
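+ As a rough sketch of the general idea described above (bounded, direct updates
+to the data embeddings rather than to any model parameters), the snippet below
+nudges each training query's relevant record toward that query under cosine
+similarity while capping the total change per record. It is not the constrained
+optimization actually solved in the paper; the function name, update rule, and
+hyperparameters are illustrative assumptions.
+
+import numpy as np
+
+def nudge_embeddings(data_emb, query_emb, relevant_idx, max_delta=0.1, lr=0.05, steps=20):
+    # Directly adjust data-record embeddings so that each training query ranks its
+    # relevant record higher under cosine similarity; the per-record change is kept
+    # within max_delta to avoid distorting the pretrained semantics.
+    emb = data_emb / np.linalg.norm(data_emb, axis=1, keepdims=True)
+    q = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
+    original = emb.copy()
+    for _ in range(steps):
+        for qi, ri in zip(q, relevant_idx):
+            emb[ri] = emb[ri] + lr * qi                      # pull the record toward its query
+            emb[ri] = emb[ri] / np.linalg.norm(emb[ri])
+            delta = emb[ri] - original[ri]
+            norm = np.linalg.norm(delta)
+            if norm > max_delta:                             # project back into the trust region
+                emb[ri] = original[ri] + delta * (max_delta / norm)
+                emb[ri] = emb[ri] / np.linalg.norm(emb[ri])
+    return emb
+
+rng = np.random.default_rng(0)
+data, queries = rng.normal(size=(5, 16)), rng.normal(size=(2, 16))
+updated = nudge_embeddings(data, queries, relevant_idx=[0, 3])  # queries 0/1 -> records 0/3
+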
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (eg CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with \textbf{five} image classification datasets. The code is +available (https://github.com/batmanlab/Ladder). + +
+
+
+
+
+ + ♻ ☆ The Need for Guardrails with Large Language Models in Medical + Safety-Critical Settings: An Artificial Intelligence Application in the + Pharmacovigilance Ecosystem + + +
+ Large language models (LLMs) are useful tools with the capacity for +performing specific types of knowledge work at an effective scale. However, LLM +deployments in high-risk and safety-critical domains pose unique challenges, +notably the issue of ``hallucination,'' where LLMs can generate fabricated +information. This is particularly concerning in settings such as drug safety, +where inaccuracies could lead to patient harm. To mitigate these risks, we have +developed and demonstrated a proof of concept suite of guardrails specifically +designed to mitigate certain types of hallucinations and errors for drug +safety, and potentially applicable to other medical safety-critical contexts. +These guardrails include mechanisms to detect anomalous documents to prevent +the ingestion of inappropriate data, identify incorrect drug names or adverse +event terms, and convey uncertainty in generated content. We integrated these +guardrails with an LLM fine-tuned for a text-to-text task, which involves +converting both structured and unstructured data within adverse event reports +into natural language. This method was applied to translate individual case +safety reports, demonstrating effective application in a pharmacovigilance +processing task. Our guardrail framework offers a set of tools with broad +applicability across various domains, ensuring LLMs can be safely used in +high-risk situations by eliminating the occurrence of key errors, including the +generation of incorrect pharmacovigilance-related terms, thus adhering to +stringent regulatory and quality standards in medical safety-critical +environments. + +
+
+ comment: 27 pages, 6 figures, 4 tables and supplementary material provided +
+
+
+
+
+ + ♻ ☆ Simple and Scalable Strategies to Continually Pre-train Large Language + Models + + +
+ Large language models (LLMs) are routinely pre-trained on billions of tokens, +only to start the process over again once new data becomes available. A much +more efficient solution is to continually pre-train these models, saving +significant compute compared to re-training. However, the distribution shift +induced by new data typically results in degraded performance on previous data +or poor adaptation to the new data. In this work, we show that a simple and +scalable combination of learning rate (LR) re-warming, LR re-decaying, and +replay of previous data is sufficient to match the performance of fully +re-training from scratch on all available data, as measured by the final loss +and the average score on several language model (LM) evaluation benchmarks. +Specifically, we show this for a weak but realistic distribution shift between +two commonly used LLM pre-training datasets (English$\rightarrow$English) and a +stronger distribution shift (English$\rightarrow$German) at the $405$M +parameter model scale with large dataset sizes (hundreds of billions of +tokens). Selecting the weak but realistic shift for larger-scale experiments, +we also find that our continual learning strategies match the re-training +baseline for a 10B parameter LLM. Our results demonstrate that LLMs can be +successfully updated via simple and scalable continual learning strategies, +matching the re-training baseline using only a fraction of the compute. +Finally, inspired by previous work, we propose alternatives to the cosine +learning rate schedule that help circumvent forgetting induced by LR re-warming +and that are not bound to a fixed token budget. + +
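+ A minimal sketch of the learning-rate component described above: when
+continuing pre-training on new data, the LR is re-warmed to a peak value and
+then re-decayed (cosine decay is used here for simplicity); replay of previous
+data would be handled separately in the data loader. The function name and
+hyperparameters are illustrative, not the paper's exact settings.
+
+import math
+
+def rewarmed_cosine_lr(step, total_steps, peak_lr=3e-4, min_lr=3e-5, warmup_steps=1000):
+    # LR schedule for a continual pre-training phase: linear re-warming from
+    # min_lr up to peak_lr, then cosine re-decay back down to min_lr.
+    if step < warmup_steps:
+        return min_lr + (peak_lr - min_lr) * step / warmup_steps
+    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
+    return min_lr + 0.5 * (peak_lr - min_lr) * (1 + math.cos(math.pi * progress))
+
+# example: inspect the schedule at a few points of a 100k-step continuation run
+for s in (0, 500, 1000, 50_000, 100_000):
+    print(s, round(rewarmed_cosine_lr(s, 100_000), 6))
+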
+
+
+
+
+ + ♻ ☆ LongRecipe: Recipe for Efficient Long Context Generalization in Large + Language Models + + +
+ Large language models (LLMs) face significant challenges in handling
+long-context tasks because of their limited effective context window size
+during pretraining, which restricts their ability to generalize over extended
+sequences. Meanwhile, extending the context window in LLMs through
+post-pretraining is highly resource-intensive. To address this, we introduce
+LongRecipe, an efficient training strategy for extending the context window of
+LLMs, including impactful token analysis, position index transformation, and
+training optimization strategies. It simulates long-sequence inputs while
+maintaining training efficiency and significantly improves the model's
+understanding of long-range dependencies. Experiments on three types of LLMs
+show that LongRecipe can utilize long sequences while requiring only 30% of the
+target context window size, and reduces computational training resources by
+over 85% compared to full-sequence training. Furthermore, LongRecipe also
+preserves the original LLM's capabilities in general tasks. Ultimately, we can
+extend the effective context window of open-source LLMs from 8k to 128k,
+achieving performance close to GPT-4 with just one day of dedicated training
+using a single GPU with 80G memory. Our code is released at
+https://github.com/zhiyuanhubj/LongRecipe.
+
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ Revisiting Character-level Adversarial Attacks for Language Models ICML 2024 + + +
+ Adversarial attacks in Natural Language Processing apply perturbations at the
+character or token level. Token-level attacks, gaining prominence for their use
+of gradient-based methods, are susceptible to altering sentence semantics,
+leading to invalid adversarial examples. While character-level attacks easily
+maintain semantics, they have received less attention as they cannot easily
+adopt popular gradient-based methods, and are thought to be easy to defend
+against. Challenging these beliefs, we introduce Charmer, an efficient
+query-based adversarial attack capable of achieving high attack success rate
+(ASR) while generating highly similar adversarial examples. Our method
+successfully targets both small (BERT) and large (Llama 2) models.
+Specifically, on BERT with SST-2, Charmer improves the ASR by 4.84 percentage
+points and the USE similarity by 8 percentage points with respect to the
+previous state of the art. Our implementation is available at
+https://github.com/LIONS-EPFL/Charmer.
+
+
+ comment: Accepted in ICML 2024 +
+
+
+
+
+ + ♻ ☆ LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language + Models + + +
+ Large Language Models (LLMs) have demonstrated notable capabilities across +various tasks, showcasing complex problem-solving abilities. Understanding and +executing complex rules, along with multi-step planning, are fundamental to +logical reasoning and critical for practical LLM agents and decision-making +systems. However, evaluating LLMs as effective rule-based executors and +planners remains underexplored. In this paper, we introduce LogicGame, a novel +benchmark designed to evaluate the comprehensive rule understanding, execution, +and planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame +provides diverse games that contain a series of rules with an initial state, +requiring models to comprehend and apply predefined regulations to solve +problems. We create simulated scenarios in which models execute or plan +operations to achieve specific outcomes. These game scenarios are specifically +designed to distinguish logical reasoning from mere knowledge by relying +exclusively on predefined rules. This separation allows for a pure assessment +of rule-based reasoning capabilities. The evaluation considers not only final +outcomes but also intermediate steps, providing a comprehensive assessment of +model performance. Moreover, these intermediate steps are deterministic and can +be automatically verified. LogicGame defines game scenarios with varying +difficulty levels, from simple rule applications to complex reasoning chains, +in order to offer a precise evaluation of model performance on rule +understanding and multi-step execution. Utilizing LogicGame, we test various +LLMs and identify notable shortcomings in their rule-based logical reasoning +abilities. + +
+
+
+
+
+ + ♻ ☆ AI-generated text boundary detection with RoFT + + +
+ Due to the rapid development of large language models, people increasingly +often encounter texts that may start as written by a human but continue as +machine-generated. Detecting the boundary between human-written and +machine-generated parts of such texts is a challenging problem that has not +received much attention in literature. We attempt to bridge this gap and +examine several ways to adapt state of the art artificial text detection +classifiers to the boundary detection setting. We push all detectors to their +limits, using the Real or Fake text benchmark that contains short texts on +several topics and includes generations of various language models. We use this +diversity to deeply examine the robustness of all detectors in cross-domain and +cross-model settings to provide baselines and insights for future research. In +particular, we find that perplexity-based approaches to boundary detection tend +to be more robust to peculiarities of domain-specific data than supervised +fine-tuning of the RoBERTa model; we also find which features of the text +confuse boundary detection algorithms and negatively influence their +performance in cross-domain settings. + +
+
+ comment: Our official repository: + https://github.com/SilverSolver/ai_boundary_detection +
+
+
+
+
+ + ♻ ☆ Negation Blindness in Large Language Models: Unveiling the NO Syndrome + in Image Generation + + +
+ Foundational Large Language Models (LLMs) have changed the way we perceive +technology. They have been shown to excel in tasks ranging from poem writing +and coding to essay generation and puzzle solving. With the incorporation of +image generation capability, they have become more comprehensive and versatile +AI tools. At the same time, researchers are striving to identify the +limitations of these tools to improve them further. Currently identified flaws +include hallucination, biases, and bypassing restricted commands to generate +harmful content. In the present work, we have identified a fundamental +limitation related to the image generation ability of LLMs, and termed it The +NO Syndrome. This negation blindness refers to LLMs inability to correctly +comprehend NO related natural language prompts to generate the desired images. +Interestingly, all tested LLMs including GPT-4, Gemini, and Copilot were found +to be suffering from this syndrome. To demonstrate the generalization of this +limitation, we carried out simulation experiments and conducted entropy-based +and benchmark statistical analysis tests on various LLMs in multiple languages, +including English, Hindi, and French. We conclude that the NO syndrome is a +significant flaw in current LLMs that needs to be addressed. A related finding +of this study showed a consistent discrepancy between image and textual +responses as a result of this NO syndrome. We posit that the introduction of a +negation context-aware reinforcement learning based feedback loop between the +LLMs textual response and generated image could help ensure the generated text +is based on both the LLMs correct contextual understanding of the negation +query and the generated visual output. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Seeing Like an AI: How LLMs Apply (and Misapply) Wikipedia Neutrality + Norms + + +
+ Large language models (LLMs) are trained on broad corpora and then used in +communities with specialized norms. Is providing LLMs with community rules +enough for models to follow these norms? We evaluate LLMs' capacity to detect +(Task 1) and correct (Task 2) biased Wikipedia edits according to Wikipedia's +Neutral Point of View (NPOV) policy. LLMs struggled with bias detection, +achieving only 64% accuracy on a balanced dataset. Models exhibited contrasting +biases (some under- and others over-predicted bias), suggesting distinct priors +about neutrality. LLMs performed better at generation, removing 79% of words +removed by Wikipedia editors. However, LLMs made additional changes beyond +Wikipedia editors' simpler neutralizations, resulting in high-recall but +low-precision editing. Interestingly, crowdworkers rated AI rewrites as more +neutral (70%) and fluent (61%) than Wikipedia-editor rewrites. Qualitative +analysis found LLMs sometimes applied NPOV more comprehensively than Wikipedia +editors but often made extraneous non-NPOV-related changes (such as grammar). +LLMs may apply rules in ways that resonate with the public but diverge from +community experts. While potentially effective for generation, LLMs may reduce +editor agency and increase moderation workload (e.g., verifying additions). +Even when rules are easy to articulate, having LLMs apply them like community +members may still be difficult. + +
+
+
+
+
+ + ♻ ☆ A Causal Explainable Guardrails for Large Language Models + + +
+ Large Language Models (LLMs) have shown impressive performance in natural +language tasks, but their outputs can exhibit undesirable attributes or biases. +Existing methods for steering LLMs toward desired attributes often assume +unbiased representations and rely solely on steering prompts. However, the +representations learned from pre-training can introduce semantic biases that +influence the steering process, leading to suboptimal results. We propose +LLMGuardrail, a novel framework that incorporates causal analysis and +adversarial learning to obtain unbiased steering representations in LLMs. +LLMGuardrail systematically identifies and blocks the confounding effects of +biases, enabling the extraction of unbiased steering representations. +Additionally, it includes an explainable component that provides insights into +the alignment between the generated output and the desired direction. +Experiments demonstrate LLMGuardrail's effectiveness in steering LLMs toward +desired attributes while mitigating biases. Our work contributes to the +development of safe and reliable LLMs that align with desired attributes. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Parallel Speculative Decoding with Adaptive Draft Length + + +
+ Speculative decoding (SD), where an extra draft model is employed to provide
+multiple draft tokens first and then the original target model verifies these
+tokens in parallel, has shown great power for LLM inference acceleration.
+However, existing SD methods suffer from the mutual waiting problem, i.e., the
+target model gets stuck when the draft model is guessing tokens, and vice
+versa. This problem is directly caused by the asynchronous execution of the
+draft model and the target model, and is exacerbated by the fixed draft length
+in speculative decoding. To address these challenges, we propose a conceptually
+simple, flexible, and general framework to boost speculative decoding, namely
+Parallel spEculative decoding with Adaptive dRaft Length (PEARL). Specifically,
+PEARL proposes pre-verify to verify the first draft token in advance during the
+drafting phase, and post-verify to generate more draft tokens during the
+verification phase. PEARL parallelizes the drafting and verification phases by
+applying the two strategies, and achieves adaptive draft lengths for different
+scenarios, which effectively alleviates the mutual waiting problem. Moreover,
+we theoretically demonstrate that the mean number of accepted tokens of PEARL
+exceeds that of existing draft-then-verify works. Experiments on various text
+generation benchmarks demonstrate the effectiveness of PEARL, leading to a
+superior speedup of up to 3.79$\times$ and 1.52$\times$ compared to
+auto-regressive decoding and vanilla speculative decoding, respectively.
+
+
+
+
+
+ + ♻ ☆ HIRO: Hierarchical Information Retrieval Optimization + + +
+ Retrieval-Augmented Generation (RAG) has revolutionized natural language +processing by dynamically integrating external knowledge into Large Language +Models (LLMs), addressing their limitation of static training datasets. Recent +implementations of RAG leverage hierarchical data structures, which organize +documents at various levels of summarization and information density. This +complexity, however, can cause LLMs to "choke" on information overload, +necessitating more sophisticated querying mechanisms. In this context, we +introduce Hierarchical Information Retrieval Optimization (HIRO), a novel +querying approach that employs a Depth-First Search (DFS)-based recursive +similarity score calculation and branch pruning. This method uniquely minimizes +the context delivered to the LLM without informational loss, effectively +managing the challenge of excessive data. HIRO's refined approach is validated +by a 10.85% improvement in performance on the NarrativeQA dataset. + +
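+ A minimal sketch of the querying idea described above: a depth-first traversal
+of a hierarchical document tree that scores nodes against the query recursively
+and prunes branches whose summary node scores too low, so only relevant leaf
+passages reach the LLM. The tree class, cosine scorer, and threshold below are
+illustrative assumptions, not HIRO's exact procedure.
+
+import numpy as np
+
+class Node:
+    def __init__(self, text, embedding, children=None):
+        self.text = text
+        self.embedding = np.asarray(embedding, dtype=float)
+        self.children = children or []
+
+def cosine(a, b):
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+
+def hiro_retrieve(node, query_emb, threshold=0.3, results=None):
+    # DFS: score each node against the query, prune branches whose summary node
+    # scores below the threshold, and collect the surviving leaf passages.
+    if results is None:
+        results = []
+    score = cosine(node.embedding, query_emb)
+    if score < threshold:            # prune the whole branch
+        return results
+    if not node.children:            # leaf passage survives pruning
+        results.append((score, node.text))
+    for child in node.children:
+        hiro_retrieve(child, query_emb, threshold, results)
+    return sorted(results, reverse=True)
+
+leaf = Node("a passage", [0.9, 0.1])
+root = Node("summary", [0.8, 0.2], [leaf])
+print(hiro_retrieve(root, np.array([1.0, 0.0])))
+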
+
+
+
+
+ + ♻ ☆ What Formal Languages Can Transformers Express? A Survey + + +
+ As transformers have gained prominence in natural language processing, some +researchers have investigated theoretically what problems they can and cannot +solve, by treating problems as formal languages. Exploring such questions can +help clarify the power of transformers relative to other models of computation, +their fundamental capabilities and limits, and the impact of architectural +choices. Work in this subarea has made considerable progress in recent years. +Here, we undertake a comprehensive survey of this work, documenting the diverse +assumptions that underlie different results and providing a unified framework +for harmonizing seemingly contradictory findings. + +
+
+ comment: One minor correction in {\S}5.1 +
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions, such as +search agents, within this expanding field. + +
+
+ comment: updated to version 3 +
+
+
+
+
+ + ♻ ☆ Towards a Universal Method for Meaningful Signal Detection + + +
+ It is known that human speech and certain animal vocalizations can convey +meaningful content because we can decipher the content that a given utterance +does convey. This paper explores an alternative approach to determining whether +a signal is meaningful, one that analyzes only the signal itself and is +independent of what the conveyed meaning might be. We devise a method that +takes a waveform as input and outputs a score indicating its degree of +`meaningfulness`. We cluster contiguous portions of the input to minimize the +total description length, and then take the length of the code of the assigned +cluster labels as meaningfulness score. We evaluate our method empirically, +against several baselines, and show that it is the only one to give a high +score to human speech in various languages and with various speakers, a +moderate score to animal vocalizations from birds and orcas, and a low score to +ambient noise from various sources. + +
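+ A very rough sketch of the scoring recipe described above: segment the
+waveform into contiguous frames, cluster them, and use the code length of the
+resulting label sequence as the score. The fixed framing, k-means clustering,
+and order-0 entropy code length below are simplifications of the paper's
+description-length formulation and are assumptions for illustration only.
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def meaningfulness_score(waveform, frame_len=400, n_clusters=16):
+    # Cluster contiguous frames and return the estimated code length (in bits)
+    # of the cluster-label sequence under a simple order-0 entropy model.
+    n_frames = len(waveform) // frame_len
+    frames = waveform[: n_frames * frame_len].reshape(n_frames, frame_len)
+    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(frames)
+    _, counts = np.unique(labels, return_counts=True)
+    probs = counts / counts.sum()
+    return float(n_frames * -(probs * np.log2(probs)).sum())
+
+signal = np.sin(2 * np.pi * 220 * np.linspace(0, 1, 16000))  # toy input waveform
+print(meaningfulness_score(signal))
+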
+
+
+
+
+ + ♻ ☆ Open Implementation and Study of BEST-RQ for Speech Processing ICASSP 2024 + + +
+ Self-Supervised Learning (SSL) has proven to be useful in various speech
+tasks. However, these methods are generally very demanding in terms of data,
+memory, and computational resources. BERT-based Speech pre-Training with
+Random-projection Quantizer (BEST-RQ) is an SSL method that has shown great
+performance on Automatic Speech Recognition (ASR) while being simpler than
+other SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,
+details are lacking in the original paper, such as the amount of GPU/TPU hours
+used in pre-training, and there is no official easy-to-use open-source
+implementation. Furthermore, BEST-RQ has not been evaluated on downstream tasks
+other than ASR and speech translation. In this work, we describe a
+re-implementation of a random-projection quantizer and perform a preliminary
+study with a comparison to wav2vec 2.0 on four downstream tasks. We discuss the
+details and differences of our implementation. We show that a random-projection
+quantizer can achieve similar downstream performance as wav2vec 2.0 while
+decreasing training time by more than a factor of two.
+
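+ For context, the random-projection quantizer at the heart of BEST-RQ can be
+sketched in a few lines: speech features are projected by a frozen random
+matrix and assigned to the nearest entry of a frozen random codebook, producing
+discrete targets for masked prediction. Dimensions and the cosine-style nearest
+neighbour below are illustrative; consult the paper or the authors' code for
+the exact recipe.
+
+import numpy as np
+
+class RandomProjectionQuantizer:
+    # Frozen random projection + frozen random codebook, in the spirit of BEST-RQ.
+    def __init__(self, feat_dim=80, code_dim=16, codebook_size=8192, seed=0):
+        rng = np.random.default_rng(seed)
+        self.projection = rng.normal(size=(feat_dim, code_dim))
+        codebook = rng.normal(size=(codebook_size, code_dim))
+        self.codebook = codebook / np.linalg.norm(codebook, axis=1, keepdims=True)
+
+    def __call__(self, features):
+        # features: (time, feat_dim) -> discrete target ids: (time,)
+        projected = features @ self.projection
+        projected = projected / np.linalg.norm(projected, axis=1, keepdims=True)
+        return np.argmax(projected @ self.codebook.T, axis=1)
+
+quantizer = RandomProjectionQuantizer()
+targets = quantizer(np.random.default_rng(1).normal(size=(200, 80)))  # pseudo log-mel frames
+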
+
+ comment: Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio, + Speech and Beyond (SASB 2024) +
+
+
+
+
+ + ♻ ☆ Prompt Compression with Context-Aware Sentence Encoding for Fast and + Improved LLM Inference + + +
+ Large language models (LLMs) have triggered a new stream of research focusing +on compressing the context length to reduce the computational cost while +ensuring the retention of helpful information for LLMs to answer the given +question. Token-based removal methods are one of the most prominent approaches +in this direction, but risk losing the semantics of the context caused by +intermediate token removal, especially under high compression ratios, while +also facing challenges in computational efficiency. In this work, we propose +context-aware prompt compression (CPC), a sentence-level prompt compression +technique where its key innovation is a novel context-aware sentence encoder +that provides a relevance score for each sentence for a given question. To +train this encoder, we generate a new dataset consisting of questions, +positives, and negative pairs where positives are sentences relevant to the +question, while negatives are irrelevant context sentences. We train the +encoder in a contrastive setup to learn context-aware sentence representations. +Our method considerably outperforms prior works on prompt compression on +benchmark datasets and is up to 10.93x faster at inference compared to the best +token-level compression method. We also find better improvement for shorter +length constraints in most benchmarks, showing the effectiveness of our +proposed solution in the compression of relevant information in a shorter +context. Finally, we release the code and the dataset for quick reproducibility +and further development: https://github.com/Workday/cpc. + +
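+ A minimal sketch of sentence-level prompt compression in the spirit of the
+method above: score each context sentence for relevance to the question and
+keep only the top-scoring ones, in their original order, within a budget. The
+paper trains a contrastive, context-aware sentence encoder for the scoring
+step; TF-IDF cosine similarity is used below purely as a stand-in scorer.
+
+import re
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def compress_prompt(context: str, question: str, keep: int = 3) -> str:
+    # Keep the `keep` sentences most relevant to the question, preserving order.
+    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", context) if s.strip()]
+    vectorizer = TfidfVectorizer().fit(sentences + [question])
+    scores = cosine_similarity(vectorizer.transform(sentences),
+                               vectorizer.transform([question])).ravel()
+    top = sorted(sorted(range(len(sentences)), key=lambda i: -scores[i])[:keep])
+    return " ".join(sentences[i] for i in top)
+
+print(compress_prompt("Cats sleep a lot. Paris is in France. France uses the euro.",
+                      "Which country is Paris in?", keep=1))
+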
+
+
+
+
+ + ♻ ☆ CADGE: Context-Aware Dialogue Generation Enhanced with Graph-Structured + Knowledge Aggregation + + +
+ Commonsense knowledge is crucial to many natural language processing tasks. +Existing works usually incorporate graph knowledge with conventional graph +neural networks (GNNs), leading to the text and graph knowledge encoding +processes being separated in a serial pipeline. We argue that these separate +representation learning stages may be suboptimal for neural networks to learn +the overall context contained in both types of input knowledge. In this paper, +we propose a novel context-aware graph-attention model (Context-aware GAT), +which can effectively incorporate global features of relevant knowledge graphs +based on a context-enhanced knowledge aggregation process. Specifically, our +framework leverages a novel representation learning approach to process +heterogeneous features - combining flattened graph knowledge with text. To the +best of our knowledge, this is the first attempt at hierarchically applying +graph knowledge aggregation on a connected subgraph in addition to contextual +information to support commonsense dialogue generation. This framework shows +superior performance compared to conventional GNN-based language frameworks. +Both automatic and human evaluation demonstrates that our proposed model has +significant performance uplifts over state-of-the-art baselines. + +
+
+ comment: Accepted by INLG 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Sindhi Word Segmentation using Subword Representation Learning + and Position-aware Self-attention + + +
+ Sindhi word segmentation is a challenging task due to space omission and +insertion issues. The Sindhi language itself adds to this complexity. It's +cursive and consists of characters with inherent joining and non-joining +properties, independent of word boundaries. Existing Sindhi word segmentation +methods rely on designing and combining hand-crafted features. However, these +methods have limitations, such as difficulty handling out-of-vocabulary words, +limited robustness for other languages, and inefficiency with large amounts of +noisy or raw text. Neural network-based models, in contrast, can automatically +capture word boundary information without requiring prior knowledge. In this +paper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses +word segmentation as a sequence labeling task. The SGNWS model incorporates +subword representation learning through a bidirectional long short-term memory +encoder, position-aware self-attention, and a conditional random field. Our +empirical results demonstrate that the SGNWS model achieves state-of-the-art +performance in Sindhi word segmentation on six datasets. + +
+
+ comment: Journal Paper, 14 pages +
+
+
+
+
+ + ♻ ☆ A Sentence is Worth a Thousand Pictures: Can Large Language Models + Understand Hum4n L4ngu4ge and the W0rld behind W0rds? + + +
+ Modern Artificial Intelligence applications show great potential for +language-related tasks that rely on next-word prediction. The current +generation of Large Language Models (LLMs) have been linked to claims about +human-like linguistic performance and their applications are hailed both as a +step towards artificial general intelligence and as a major advance in +understanding the cognitive, and even neural basis of human language. To assess +these claims, first we analyze the contribution of LLMs as theoretically +informative representations of a target cognitive system vs. atheoretical +mechanistic tools. Second, we evaluate the models' ability to see the bigger +picture, through top-down feedback from higher levels of processing, which +requires grounding in previous expectations and past world experience. We +hypothesize that since models lack grounded cognition, they cannot take +advantage of these features and instead solely rely on fixed associations +between represented words and word vectors. To assess this, we designed and ran +a novel 'leet task' (l33t t4sk), which requires decoding sentences in which +letters are systematically replaced by numbers. The results suggest that humans +excel in this task whereas models struggle, confirming our hypothesis. We +interpret the results by identifying the key abilities that are still missing +from the current state of development of these models, which require solutions +that go beyond increased system scaling. + +
+
+
+
+
+ + ♻ ☆ Exploring Interpretability of Independent Components of Word Embeddings + with Automated Word Intruder Test LREC + + +
+ Independent Component Analysis (ICA) is an algorithm originally developed for +finding separate sources in a mixed signal, such as a recording of multiple +people in the same room speaking at the same time. Unlike Principal Component +Analysis (PCA), ICA permits the representation of a word as an unstructured set +of features, without any particular feature being deemed more significant than +the others. In this paper, we used ICA to analyze word embeddings. We have +found that ICA can be used to find semantic features of the words, and these +features can easily be combined to search for words that satisfy the +combination. We show that most of the independent components represent such +features. To quantify the interpretability of the components, we use the word +intruder test, performed both by humans and by large language models. We +propose to use the automated version of the word intruder test as a fast and +inexpensive way of quantifying vector interpretability without the need for +human effort. + +
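+ A small sketch of the analysis described above: run FastICA over a
+word-embedding matrix and inspect, for each independent component, the words
+with the highest loadings (the kind of feature list a word-intruder test would
+then evaluate). The random embeddings and placeholder vocabulary stand in for
+real pretrained vectors.
+
+import numpy as np
+from sklearn.decomposition import FastICA
+
+vocab = [f"word_{i}" for i in range(1000)]                       # placeholder vocabulary
+embeddings = np.random.default_rng(0).normal(size=(1000, 300))   # stand-in for real vectors
+
+ica = FastICA(n_components=50, random_state=0, max_iter=1000)
+components = ica.fit_transform(embeddings)                       # (n_words, n_components)
+
+def top_words(component_idx, k=10):
+    # Words loading most strongly on one independent component.
+    order = np.argsort(-components[:, component_idx])[:k]
+    return [vocab[i] for i in order]
+
+print(top_words(0))
+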
+
+ comment: Presented at LREC-COLING 2024, cite this version please: + https://aclanthology.org/2024.lrec-main.605/ +
+
+
+
+
+ + ♻ ☆ Vision-Language and Large Language Model Performance in + Gastroenterology: GPT, Claude, Llama, Phi, Mistral, Gemma, and Quantized + Models + + +
+ Background and Aims: This study evaluates the medical reasoning performance +of large language models (LLMs) and vision language models (VLMs) in +gastroenterology. + Methods: We used 300 gastroenterology board exam-style multiple-choice +questions, 138 of which contain images to systematically assess the impact of +model configurations and parameters and prompt engineering strategies utilizing +GPT-3.5. Next, we assessed the performance of proprietary and open-source LLMs +(versions), including GPT (3.5, 4, 4o, 4omini), Claude (3, 3.5), Gemini (1.0), +Mistral, Llama (2, 3, 3.1), Mixtral, and Phi (3), across different interfaces +(web and API), computing environments (cloud and local), and model precisions +(with and without quantization). Finally, we assessed accuracy using a +semiautomated pipeline. + Results: Among the proprietary models, GPT-4o (73.7%) and Claude3.5-Sonnet +(74.0%) achieved the highest accuracy, outperforming the top open-source +models: Llama3.1-405b (64%), Llama3.1-70b (58.3%), and Mixtral-8x7b (54.3%). +Among the quantized open-source models, the 6-bit quantized Phi3-14b (48.7%) +performed best. The scores of the quantized models were comparable to those of +the full-precision models Llama2-7b, Llama2--13b, and Gemma2-9b. Notably, VLM +performance on image-containing questions did not improve when the images were +provided and worsened when LLM-generated captions were provided. In contrast, a +10% increase in accuracy was observed when images were accompanied by +human-crafted image descriptions. + Conclusion: In conclusion, while LLMs exhibit robust zero-shot performance in +medical reasoning, the integration of visual data remains a challenge for VLMs. +Effective deployment involves carefully determining optimal model +configurations, encouraging users to consider either the high performance of +proprietary models or the flexible adaptability of open-source models. + +
+
+ comment: Manuscript Pages: 34, Figures: 7, Tables: 2, Supplementary File + Pages: 35, Data Transparency Statement: Code is available at: + https://github.com/Sdamirsa/LLM-VLM-in-Gastroenterology . Study data from + American College of Gastroenterology (ACG) are restricted and available upon + request with ACG permission. Correction: updated abstract considering + Llama3.1 results +
+
+
+
+
+ + ♻ ☆ Towards Measuring and Modeling "Culture" in LLMs: A Survey + + +
+ We present a survey of more than 90 recent papers that aim to study cultural
+representation and inclusion in large language models (LLMs). We observe that
+none of the studies explicitly define "culture," which is a complex,
+multifaceted concept; instead, they probe the models on some specially designed
+datasets which represent certain aspects of "culture". We call these aspects
+the proxies of culture, and organize them across two dimensions of demographic
+and semantic proxies. We also categorize the probing methods employed. Our
+analysis indicates that only certain aspects of "culture," such as values and
+objectives, have been studied, leaving several other interesting and important
+facets, especially the multitude of semantic domains (Thompson et al., 2020)
+and aboutness (Hershcovich et al., 2022), unexplored. Two other crucial gaps
+are the lack of robustness of probing techniques and the lack of situated
+studies on the impact of cultural mis- and under-representation in LLM-based
+applications.
+
+
+
+
+
+ + ♻ ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever EMNLP + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in
+information retrieval. ColBERT's late interaction scoring approximates the
+joint query-document attention seen in cross-encoders while maintaining
+inference efficiency closer to traditional dense retrieval models, thanks to
+its bi-encoder architecture and recent optimizations in indexing and search. In
+this paper, we introduce a novel architecture and a training framework to
+support long context windows and multilingual retrieval. Our new model,
+Jina-ColBERT-v2, demonstrates strong performance across a range of English and
+multilingual retrieval tasks.
+
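+ For context, the late-interaction scoring used by ColBERT-style models such as
+Jina-ColBERT-v2 can be written in a few lines: every query token embedding is
+matched against its best-scoring document token embedding and the maxima are
+summed (MaxSim). The shapes and random inputs below are illustrative only.
+
+import numpy as np
+
+def late_interaction_score(query_emb, doc_emb):
+    # ColBERT-style MaxSim: sum over query tokens of the maximum similarity
+    # to any document token. Inputs are L2-normalized token embeddings.
+    sim = query_emb @ doc_emb.T          # (num_query_tokens, num_doc_tokens)
+    return float(sim.max(axis=1).sum())
+
+rng = np.random.default_rng(0)
+q = rng.normal(size=(8, 128));   q /= np.linalg.norm(q, axis=1, keepdims=True)
+d = rng.normal(size=(120, 128)); d /= np.linalg.norm(d, axis=1, keepdims=True)
+print(late_interaction_score(q, d))
+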
+
+ comment: 8 pages, references at pp7,8; EMNLP workshop submission +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Information Extraction using Large Language Models + + +
+ Human-like large language models (LLMs), especially the most powerful and +popular ones in OpenAI's GPT family, have proven to be very helpful for many +natural language processing (NLP) related tasks. Therefore, various attempts +have been made to apply LLMs to information extraction (IE), which is a +fundamental NLP task that involves extracting information from unstructured +plain text. To demonstrate the latest representative progress in LLMs' +information extraction ability, we assess the information extraction ability of +GPT-4 (the latest version of GPT at the time of writing this paper) from four +perspectives: Performance, Evaluation Criteria, Robustness, and Error Types. +Our results suggest a visible performance gap between GPT-4 and +state-of-the-art (SOTA) IE methods. To alleviate this problem, considering the +LLMs' human-like characteristics, we propose and analyze the effects of a +series of simple prompt-based methods, which can be generalized to other LLMs +and NLP tasks. Rich experiments show our methods' effectiveness and some of +their remaining issues in improving GPT-4's information extraction ability. + +
+
+ comment: This article has an original arxiv version entitled "Is Information + Extraction Solved by ChatGPT? An Analysis of Performance, Evaluation + Criteria, Robustness and Errors", whose url link is arXiv/2305.14450 +
+
+
+
+
+ + ♻ ☆ Can AI Replace Human Subjects? A Large-Scale Replication of + Psychological Experiments with LLMs + + +
+ Artificial Intelligence (AI) is increasingly being integrated into scientific +research, particularly in the social sciences, where understanding human +behavior is critical. Large Language Models (LLMs) like GPT-4 have shown +promise in replicating human-like responses in various psychological +experiments. However, the extent to which LLMs can effectively replace human +subjects across diverse experimental contexts remains unclear. Here, we conduct +a large-scale study replicating 154 psychological experiments from top social +science journals with 618 main effects and 138 interaction effects using GPT-4 +as a simulated participant. We find that GPT-4 successfully replicates 76.0 +percent of main effects and 47.0 percent of interaction effects observed in the +original studies, closely mirroring human responses in both direction and +significance. However, only 19.44 percent of GPT-4's replicated confidence +intervals contain the original effect sizes, with the majority of replicated +effect sizes exceeding the 95 percent confidence interval of the original +studies. Additionally, there is a 71.6 percent rate of unexpected significant +results where the original studies reported null findings, suggesting potential +overestimation or false positives. Our results demonstrate the potential of +LLMs as powerful tools in psychological research but also emphasize the need +for caution in interpreting AI-driven findings. While LLMs can complement human +studies, they cannot yet fully replace the nuanced insights provided by human +subjects. + +
+
+ comment: 5 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Enhancing Dialogue Generation in Werewolf Game Through Situation + Analysis and Persuasion Strategies + + +
+ Recent advancements in natural language processing, particularly with large +language models (LLMs) like GPT-4, have significantly enhanced dialogue +systems, enabling them to generate more natural and fluent conversations. +Despite these improvements, challenges persist, such as managing continuous +dialogues, memory retention, and minimizing hallucinations. The AIWolfDial2024 +addresses these challenges by employing the Werewolf Game, an incomplete +information game, to test the capabilities of LLMs in complex interactive +environments. This paper introduces a LLM-based Werewolf Game AI, where each +role is supported by situation analysis to aid response generation. +Additionally, for the werewolf role, various persuasion strategies, including +logical appeal, credibility appeal, and emotional appeal, are employed to +effectively persuade other players to align with its actions. + +
+
+ comment: Accepted to the AIWolfDial2024 workshop at INLG 2024 +
+
+
+
+
+ + ♻ ☆ SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated + Responses + + +
+ Can LLMs consistently improve their previous outputs for better results? For +this to be true, LLMs would need to be better at discriminating among +previously-generated alternatives, than generating initial responses. We +explore the validity of this hypothesis in practice. We first formulate a +unified framework that allows us to compare the generative and discriminative +capability of any model on any task. In our resulting experimental analysis of +several open-source and industrial LLMs, we observe that models are not +reliably better at discriminating among previously-generated alternatives than +generating initial responses. This finding challenges the notion that LLMs may +be able to enhance their performance only through their own judgment. + +
+
+
+
+
+ + ♻ ☆ LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet + + +
+ Recent large language model (LLM) defenses have greatly improved models' +ability to refuse harmful queries, even when adversarially attacked. However, +LLM defenses are primarily evaluated against automated adversarial attacks in a +single turn of conversation, an insufficient threat model for real-world +malicious use. We demonstrate that multi-turn human jailbreaks uncover +significant vulnerabilities, exceeding 70% attack success rate (ASR) on +HarmBench against defenses that report single-digit ASRs with automated +single-turn attacks. Human jailbreaks also reveal vulnerabilities in machine +unlearning defenses, successfully recovering dual-use biosecurity knowledge +from unlearned models. We compile these results into Multi-Turn Human +Jailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks. +We publicly release MHJ alongside a compendium of jailbreak tactics developed +across dozens of commercial red teaming engagements, supporting research +towards stronger LLM defenses. + +
+
+
+
+
+ + ♻ ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 119 + +
+
+
+ + ☆ HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical + MLLM Prompts + + +
+ The potential for higher-resolution image generation using pretrained
+diffusion models is immense, yet these models often struggle with issues of
+object repetition and structural artifacts, especially when scaling to 4K
+resolution and higher. We find that the problem arises because a single prompt
+is insufficient to guide generation at multiple scales. In response, we propose
+HiPrompt, a new tuning-free solution that tackles the above problems by
+introducing hierarchical prompts. The hierarchical prompts offer both global
+and local guidance. Specifically, the global guidance comes from the user input
+that describes the overall content, while the local guidance utilizes
+patch-wise descriptions from MLLMs to elaborately guide the regional structure
+and texture generation. Furthermore, during the inverse denoising process, the
+generated noise is decomposed into low- and high-frequency spatial components.
+These components are conditioned on multiple prompt levels, including detailed
+patch-wise descriptions and broader image-level prompts, facilitating
+prompt-guided denoising under hierarchical semantic guidance. This further
+allows the generation to focus more on local spatial regions and ensures the
+generated images maintain coherent local and global semantics, structures, and
+textures with high definition. Extensive experiments demonstrate that HiPrompt
+outperforms state-of-the-art works in higher-resolution image generation,
+significantly reducing object repetition and enhancing structural quality.
+
+
+
+
+
+ + ☆ UC-NeRF: Uncertainty-aware Conditional Neural Radiance Fields from + Endoscopic Sparse Views + + +
+ Visualizing surgical scenes is crucial for revealing internal anatomical +structures during minimally invasive procedures. Novel View Synthesis is a +vital technique that offers geometry and appearance reconstruction, enhancing +understanding, planning, and decision-making in surgical scenes. Despite the +impressive achievements of Neural Radiance Field (NeRF), its direct application +to surgical scenes produces unsatisfying results due to two challenges: +endoscopic sparse views and significant photometric inconsistencies. In this +paper, we propose uncertainty-aware conditional NeRF for novel view synthesis +to tackle the severe shape-radiance ambiguity from sparse surgical views. The +core of UC-NeRF is to incorporate the multi-view uncertainty estimation to +condition the neural radiance field for modeling the severe photometric +inconsistencies adaptively. Specifically, our UC-NeRF first builds a +consistency learner in the form of multi-view stereo network, to establish the +geometric correspondence from sparse views and generate uncertainty estimation +and feature priors. In neural rendering, we design a base-adaptive NeRF network +to exploit the uncertainty estimation for explicitly handling the photometric +inconsistencies. Furthermore, an uncertainty-guided geometry distillation is +employed to enhance geometry learning. Experiments on the SCARED and Hamlyn +datasets demonstrate our superior performance in rendering appearance and +geometry, consistently outperforming the current state-of-the-art approaches. +Our code will be released at \url{https://github.com/wrld/UC-NeRF}. + +
+
+
+
+
+ + ☆ Can LVLMs Obtain a Driver's License? A Benchmark Towards Reliable AGI + for Autonomous Driving + + +
+ Large Vision-Language Models (LVLMs) have recently garnered significant +attention, with many efforts aimed at harnessing their general knowledge to +enhance the interpretability and robustness of autonomous driving models. +However, LVLMs typically rely on large, general-purpose datasets and lack the +specialized expertise required for professional and safe driving. Existing +vision-language driving datasets focus primarily on scene understanding and +decision-making, without providing explicit guidance on traffic rules and +driving skills, which are critical aspects directly related to driving safety. +To bridge this gap, we propose IDKB, a large-scale dataset containing over one +million data items collected from various countries, including driving +handbooks, theory test data, and simulated road test data. Much like the +process of obtaining a driver's license, IDKB encompasses nearly all the +explicit knowledge needed for driving from theory to practice. In particular, +we conducted comprehensive tests on 15 LVLMs using IDKB to assess their +reliability in the context of autonomous driving and provided extensive +analysis. We also fine-tuned popular models, achieving notable performance +improvements, which further validate the significance of our dataset. The +project page can be found at: +\url{https://4dvlab.github.io/project_page/idkb.html} + +
+
+
+
+
+ + ☆ SITAR: Semi-supervised Image Transformer for Action Recognition ICPR 2024 + + +
+ Recognizing actions from a limited set of labeled videos remains a challenge +as annotating visual data is not only tedious but can also be expensive due to +its classified nature. Moreover, handling spatio-temporal data using deep 3D +transformers for this can introduce significant computational complexity. In +this paper, our objective is to address video action recognition in a +semi-supervised setting by leveraging only a handful of labeled videos along +with a collection of unlabeled videos in a compute-efficient manner. +Specifically, we rearrange multiple frames from the input videos in row-column +form to construct super images. Subsequently, we capitalize on the vast pool of +unlabeled samples and employ contrastive learning on the encoded super images. +Our proposed approach employs two pathways to generate representations for +temporally augmented super images originating from the same video. +Specifically, we utilize a 2D image-transformer to generate representations and +apply a contrastive loss function to minimize the similarity between +representations from different videos while maximizing the similarity between +representations of the same video. Our method demonstrates superior performance compared to +existing state-of-the-art approaches for semi-supervised action recognition +across various benchmark datasets, all while significantly reducing +computational costs. + 
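+ A minimal sketch of the super-image construction described above, assuming the clip length is a perfect square and that frames are simply tiled row by row; the exact arrangement and any padding used in SITAR are assumptions here.
import torch

def frames_to_super_image(frames: torch.Tensor, grid: int) -> torch.Tensor:
    # frames: (T, C, H, W) with T == grid * grid  ->  (C, grid*H, grid*W)
    t, c, h, w = frames.shape
    assert t == grid * grid, "expects a square grid of frames"
    x = frames.view(grid, grid, c, h, w)        # (rows, cols, C, H, W)
    x = x.permute(2, 0, 3, 1, 4).contiguous()   # (C, rows, H, cols, W)
    return x.view(c, grid * h, grid * w)

# e.g. a 9-frame clip becomes one 3x3 super image fed to a 2D image transformer
super_img = frames_to_super_image(torch.randn(9, 3, 224, 224), grid=3)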
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language +Models~(MLLMs) is crucial for video understanding, high-resolution image +understanding, and multi-modal agents. This involves a series of systematic +optimizations, including model architecture, data construction and training +strategy, particularly addressing challenges such as \textit{degraded +performance with more images} and \textit{high computational costs}. In this +paper, we adapt the model architecture to a hybrid of Mamba and Transformer +blocks, approach data construction with both temporal and spatial dependencies +among multiple images and employ a progressive training strategy. The released +model \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge +\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first +hybrid MLLM, which achieved a better balance between efficiency and +effectiveness. LongLLaVA not only achieves competitive results across various +benchmarks, but also maintains high throughput and low memory consumption. +Especially, it could process nearly a thousand images on a single A100 80GB +GPU, showing promising application prospects for a wide range of tasks. + +
+
+ comment: 19 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ CanvOI, an Oncology Intelligence Foundation Model: Scaling FLOPS + Differently + + +
+ The rapidly evolving field of digital oncopathology faces significant +challenges, including the need to address diverse and complex clinical +questions, often involving rare conditions, with limited availability of +labeled data. These limitations hinder the development of robust AI-driven +tools in the biomedical space, where accuracy in probabilistic determinations +is of utmost importance. To address this, digital pathology foundation models +have begun to emerge, typically developed with the size and diversity of the +pre-training dataset and model parameters in mind. Here, we present CanvOI, a +ViT-g/10-based foundation model designed to enhance the capabilities of digital +pathology by addressing these challenges through a different approach. +Considering the unique nature of oncologic histopathological images and the +requirements from the embeddings to provide meaningful representations for +Multiple Instance Learning (MIL) downstream models, we chose to modify the +input image characteristics. By introducing larger tile sizes (380 x 380 +pixels) and smaller patch sizes (10 x 10 pixels), we were able to optimize the +model's performance, pushing computational resources in a new direction and +achieving state-of-the-art performance on cancer-related benchmarks. CanvOI +demonstrated a 1.5-7.4% improvement in averaged AUC compared to other leading +foundation models built for digital pathology. Moreover, our results +demonstrate that CanvOI significantly outperformed the other models, with the +performance gap widening substantially when trained on just 10% of the initial +cohort. This work highlights an alternative approach that, if integrated with +traditional development approaches, has the potential to advance Oncology +Intelligence (OI), overcome some of the current barriers and ultimately improve +the clinical outcome of cancer patients. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Multi-stream deep learning framework to predict mild cognitive + impairment with Rey Complex Figure Test + + +
+ Drawing tests like the Rey Complex Figure Test (RCFT) are widely used to +assess cognitive functions such as visuospatial skills and memory, making them +valuable tools for detecting mild cognitive impairment (MCI). Despite their +utility, existing predictive models based on these tests often suffer from +limitations like small sample sizes and lack of external validation, which +undermine their reliability. We developed a multi-stream deep learning +framework that integrates two distinct processing streams: a multi-head +self-attention based spatial stream using raw RCFT images and a scoring stream +employing a previously developed automated scoring system. Our model was +trained on data from 1,740 subjects in the Korean cohort and validated on an +external hospital dataset of 222 subjects from Korea. The proposed multi-stream +model demonstrated superior performance over baseline models (AUC = 0.872, +Accuracy = 0.781) in external validation. The integration of both spatial and +scoring streams enables the model to capture intricate visual details from the +raw images while also incorporating structured scoring data, which together +enhance its ability to detect subtle cognitive impairments. This dual approach +not only improves predictive accuracy but also increases the robustness of the +model, making it more reliable in diverse clinical settings. Our model has +practical implications for clinical settings, where it could serve as a +cost-effective tool for early MCI screening. + +
+
+ comment: 20 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Benchmarking Spurious Bias in Few-Shot Image Classifiers ECCV 2024 + + +
+ Few-shot image classifiers are designed to recognize and classify new data +with minimal supervision and limited data but often show reliance on spurious +correlations between classes and spurious attributes, known as spurious bias. +Spurious correlations commonly hold in certain samples and few-shot classifiers +can suffer from spurious bias induced from them. There is an absence of an +automatic benchmarking system to assess the robustness of few-shot classifiers +against spurious bias. In this paper, we propose a systematic and rigorous +benchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied +degrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates +few-shot evaluation tasks with biased attributes so that using them for +predictions can demonstrate poor performance. To construct these tasks, we +propose attribute-based sample selection strategies based on a pre-trained +vision-language model, eliminating the need for manual dataset curation. This +allows FewSTAB to automatically benchmark spurious bias using any existing test +data. FewSTAB offers evaluation results in a new dimension along with a new +design guideline for building robust classifiers. Moreover, it can benchmark +spurious bias in varied degrees and enable designs for varied degrees of +robustness. Its effectiveness is demonstrated through experiments on ten +few-shot learning methods across three datasets. We hope our framework can +inspire new designs of robust few-shot classifiers. Our code is available at +https://github.com/gtzheng/FewSTAB. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ The Impact of Balancing Real and Synthetic Data on Accuracy and Fairness + in Face Recognition ECCV 2024 + + +
+ Over the recent years, the advancements in deep face recognition have fueled +an increasing demand for large and diverse datasets. Nevertheless, the +authentic data acquired to create those datasets is typically sourced from the +web, which, in many cases, can lead to significant privacy issues due to the +lack of explicit user consent. Furthermore, obtaining a demographically +balanced, large dataset is even more difficult because of the natural imbalance +in the distribution of images from different demographic groups. In this paper, +we investigate the impact of demographically balanced authentic and synthetic +data, both individually and in combination, on the accuracy and fairness of +face recognition models. Initially, several generative methods were used to +balance the demographic representations of the corresponding synthetic +datasets. Then a state-of-the-art face encoder was trained and evaluated using +(combinations of) synthetic and authentic images. Our findings emphasized two +main points: (i) the increased effectiveness of training data generated by +diffusion-based models in enhancing accuracy, whether used alone or combined +with subsets of authentic data, and (ii) the minimal impact of incorporating +balanced data from pre-trained generative methods on fairness (in nearly all +tested scenarios using combined datasets, fairness scores remained either +unchanged or worsened, even when compared to unbalanced authentic datasets). +Source code and data are available at \url{https://cutt.ly/AeQy1K5G} for +reproducibility. + +
+
+ comment: Accepted at Synthetic Data for Computer Vision Workshop - Side Event + at ECCV 2024 +
+
+
+
+
+ + ☆ Hybrid-Segmentor: A Hybrid Approach to Automated Fine-Grained Crack + Segmentation in Civil Infrastructure + + +
+ Detecting and segmenting cracks in infrastructure, such as roads and +buildings, is crucial for safety and cost-effective maintenance. In spite of +the potential of deep learning, there are challenges in achieving precise +results and handling diverse crack types. With the proposed dataset and model, +we aim to enhance crack detection and infrastructure maintenance. We introduce +Hybrid-Segmentor, an encoder-decoder based approach that is capable of +extracting both fine-grained local and global crack features. This allows the +model to improve its generalization capabilities in distinguishing cracks of +various shapes, surfaces, and sizes. To keep the computational cost +low for practical purposes while maintaining the high generalization +capability of the model, we incorporate a self-attention model at the encoder +level, while reducing the complexity of the decoder component. The proposed +model outperforms existing benchmark models across 5 quantitative metrics +(accuracy 0.971, precision 0.804, recall 0.744, F1-score 0.770, and IoU score +0.630), achieving state-of-the-art status. + 
+
+ comment: 25 pages, 6 figures +
+
+
+
+
+ + ☆ Human-VDM: Learning Single-Image 3D Human Gaussian Splatting from Video + Diffusion Models + + +
+ Generating lifelike 3D humans from a single RGB image remains a challenging +task in computer vision, as it requires accurate modeling of geometry, +high-quality texture, and plausible unseen parts. Existing methods typically +use multi-view diffusion models for 3D generation, but they often face +inconsistent view issues, which hinder high-quality 3D human generation. To +address this, we propose Human-VDM, a novel method for generating 3D human from +a single RGB image using Video Diffusion Models. Human-VDM provides temporally +consistent views for 3D human generation using Gaussian Splatting. It consists +of three modules: a view-consistent human video diffusion module, a video +augmentation module, and a Gaussian Splatting module. First, a single image is +fed into a human video diffusion module to generate a coherent human video. +Next, the video augmentation module applies super-resolution and video +interpolation to enhance the textures and geometric smoothness of the generated +video. Finally, the 3D Human Gaussian Splatting module learns lifelike humans +under the guidance of these high-resolution and view-consistent images. +Experiments demonstrate that Human-VDM achieves high-quality 3D human from a +single image, outperforming state-of-the-art methods in both generation quality +and quantity. Project page: https://human-vdm.github.io/Human-VDM/ + +
+
+ comment: 14 Pages, 8 figures, Project page: + https://human-vdm.github.io/Human-VDM/ +
+
+
+
+
+ + ☆ MaDis-Stereo: Enhanced Stereo Matching via Distilled Masked Image + Modeling + + +
+ In stereo matching, CNNs have traditionally served as the predominant +architectures. Although Transformer-based stereo models have been studied +recently, their performance still lags behind CNN-based stereo models due to +the inherent data scarcity issue in the stereo matching task. In this paper, we +propose the Masked Image Modeling Distilled Stereo matching model, termed +MaDis-Stereo, that enhances locality inductive bias by leveraging Masked Image +Modeling (MIM) in training a Transformer-based stereo model. Given randomly +masked stereo images as inputs, our method attempts to conduct both image +reconstruction and depth prediction tasks. While this strategy is beneficial to +resolving the data scarcity issue, the dual challenge of reconstructing masked +tokens and subsequently performing stereo matching poses significant +challenges, particularly in terms of training stability. To address this, we +propose to use an auxiliary network (teacher), updated via Exponential Moving +Average (EMA), along with the original stereo model (student), where teacher +predictions serve as pseudo supervisory signals to effectively distill +knowledge into the student model. State-of-the-art performance is achieved +with the proposed method on several stereo matching benchmarks such as ETH3D and KITTI +2015. Additionally, to demonstrate that our model effectively leverages +locality inductive bias, we provide the attention distance measurement. + 
+
+
+
+
+ + ☆ iConFormer: Dynamic Parameter-Efficient Tuning with Input-Conditioned + Adaptation + + +
+ Transfer learning based on full fine-tuning (FFT) of the pre-trained encoder +and task-specific decoder becomes increasingly complex as deep models grow +exponentially. Parameter-efficient fine-tuning (PEFT) approaches using adapters +consisting of small learnable layers have emerged as an alternative to FFT, +achieving comparable performance while maintaining high training efficiency. +However, the inflexibility of the adapter with respect to input instances +limits its capability of learning task-specific information in diverse +downstream tasks. In this paper, we propose a novel PEFT approach, +input-Conditioned transFormer, termed iConFormer, that leverages a dynamic +adapter conditioned on the input instances. To secure flexible learning ability +on input instances in various downstream tasks, we introduce an +input-Conditioned Network (iCoN) in the dynamic adapter that enables +instance-level feature transformation. To be specific, iCoN generates +channel-wise convolutional kernels for each feature and transforms it using an +adaptive convolution process to effectively capture task-specific and +fine-grained details tailored to downstream tasks. Experimental results +demonstrate that by tuning just 1.6% to 2.8% of the Transformer backbone +parameters, iConFormer achieves performance comparable to FFT in monocular +depth estimation and semantic segmentation, while outperforming it in image +classification and instance segmentation. Also, the proposed method +consistently outperforms recent PEFT methods for all the tasks mentioned above. + 
+
+
+
+
+ + ☆ ExpLLM: Towards Chain of Thought for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is a critical task in multimedia with +significant implications across various domains. However, analyzing the causes +of facial expressions is essential for accurately recognizing them. Current +approaches, such as those based on facial action units (AUs), typically provide +AU names and intensities but lack insight into the interactions and +relationships between AUs and the overall expression. In this paper, we propose +a novel method called ExpLLM, which leverages large language models to generate +an accurate chain of thought (CoT) for facial expression recognition. +Specifically, we have designed the CoT mechanism from three key perspectives: +key observations, overall emotional interpretation, and conclusion. The key +observations describe the AU's name, intensity, and associated emotions. The +overall emotional interpretation provides an analysis based on multiple AUs and +their interactions, identifying the dominant emotions and their relationships. +Finally, the conclusion presents the final expression label derived from the +preceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed +to construct this expression CoT and generate instruction-description data for +training our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets +demonstrate that ExpLLM outperforms current state-of-the-art FER methods. +ExpLLM also surpasses the latest GPT-4o in expression CoT generation, +particularly in recognizing micro-expressions where GPT-4o frequently fails. + +
+
+ comment: project page: https://starhiking.github.io/ExpLLM_Page/ +
+
+
+
+
+ + ☆ Automatic facial axes standardization of 3D fetal ultrasound images + + +
+ Craniofacial anomalies indicate early developmental disturbances and are +usually linked to many genetic syndromes. Early diagnosis is critical, yet +ultrasound (US) examinations often fail to identify these features. This study +presents an AI-driven tool to assist clinicians in standardizing fetal facial +axes/planes in 3D US, reducing sonographer workload and facilitating facial +evaluation. Our network, structured into three blocks (feature extractor, +rotation and translation regression, and spatial transformer), processes three +orthogonal 2D slices to estimate the necessary transformations for +standardizing the facial planes in the 3D US. These transformations are applied +to the original 3D US using a differentiable module (the spatial transformer +block), yielding a standardized 3D US and the corresponding 2D facial standard +planes. The dataset used consists of 1180 fetal facial 3D US images acquired +between weeks 20 and 35 of gestation. Results show that our network +considerably reduces inter-observer rotation variability in the test set, with +a mean geodesic angle difference of 14.12$^{\circ}$ $\pm$ 18.27$^{\circ}$ and +a Euclidean angle error of 7.45$^{\circ}$ $\pm$ 14.88$^{\circ}$. These +findings demonstrate the network's ability to effectively standardize facial +axes, crucial for consistent fetal facial assessments. In conclusion, the +proposed network demonstrates potential for improving the consistency and +accuracy of fetal facial assessments in clinical settings, facilitating early +evaluation of craniofacial anomalies. + 
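+ For reference, the geodesic angle difference reported above is the rotation-group distance between two orientations; a small sketch of how such an error can be computed from 3x3 rotation matrices (the paper's exact evaluation code may differ).
import numpy as np

def geodesic_angle_deg(R1: np.ndarray, R2: np.ndarray) -> float:
    # Angle of the relative rotation R1 @ R2.T, i.e. the geodesic distance on SO(3).
    cos = (np.trace(R1 @ R2.T) - 1.0) / 2.0
    return float(np.degrees(np.arccos(np.clip(cos, -1.0, 1.0))))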
+
+
+
+
+ + ☆ Deep Learning Meets Satellite Images -- An Evaluation on Handcrafted and + Learning-based Features for Multi-date Satellite Stereo Images ECCV2024 + + +
+ A critical step in digital surface model (DSM) generation is feature +matching. Off-track (or multi-date) satellite stereo images, in particular, can +challenge the performance of feature matching due to spectral distortions +between images, long baseline, and wide intersection angles. Feature matching +methods have evolved over the years from handcrafted methods (e.g., SIFT) to +learning-based methods (e.g., SuperPoint and SuperGlue). In this paper, we +compare the performance of different features, also known as feature extraction +and matching methods, applied to satellite imagery. A wide range of stereo +pairs (~500) covering two separate study sites is used. SIFT, as a widely used +classic feature extraction and matching algorithm, is compared with seven +deep-learning matching methods: SuperGlue, LightGlue, LoFTR, ASpanFormer, DKM, +GIM-LightGlue, and GIM-DKM. Results demonstrate that traditional matching +methods are still competitive in this age of deep learning, although for +particular scenarios learning-based methods are very promising. + 
+
+ comment: ECCV2024 Workshop - TradiCV +
+
+
+
+
+ + ☆ MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding + Benchmark + + +
+ This paper introduces MMMU-Pro, a robust version of the Massive +Multi-discipline Multimodal Understanding and Reasoning (MMMU) benchmark. +MMMU-Pro rigorously assesses multimodal models' true understanding and +reasoning capabilities through a three-step process based on MMMU: (1) +filtering out questions answerable by text-only models, (2) augmenting +candidate options, and (3) introducing a vision-only input setting where +questions are embedded within images. This setting challenges AI to truly "see" +and "read" simultaneously, testing a fundamental human cognitive skill of +seamlessly integrating visual and textual information. Results show that model +performance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8% +to 26.9% across models. We explore the impact of OCR prompts and Chain of +Thought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT +generally improves performance. MMMU-Pro provides a more rigorous evaluation +tool, closely mimicking real-world scenarios and offering valuable directions +for future research in multimodal AI. + +
+
+
+
+
+ + ☆ UnLearning from Experience to Avoid Spurious Correlations + + +
+ While deep neural networks can achieve state-of-the-art performance in many +tasks, these models are more fragile than they appear. They are prone to +learning spurious correlations in their training data, leading to surprising +failure cases. In this paper, we propose a new approach that addresses the +issue of spurious correlations: UnLearning from Experience (ULE). Our method is +based on using two classification models trained in parallel: student and +teacher models. Both models receive the same batches of training data. The +student model is trained with no constraints and pursues the spurious +correlations in the data. The teacher model is trained to solve the same +classification problem while avoiding the mistakes of the student model. As +training is done in parallel, the better the student model learns the spurious +correlations, the more robust the teacher model becomes. The teacher model uses +the gradient of the student's output with respect to its input to unlearn +mistakes made by the student. We show that our method is effective on the +Waterbirds, CelebA, Spawrious and UrbanCars datasets. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Validation of musculoskeletal segmentation model with uncertainty + estimation for bone and muscle assessment in hip-to-knee clinical CT images + + +
+ Deep learning-based image segmentation has allowed for the fully automated, +accurate, and rapid analysis of musculoskeletal (MSK) structures from medical +images. However, current approaches were either applied only to 2D +cross-sectional images, addressed few structures, or were validated on small +datasets, which limit the application in large-scale databases. This study +aimed to validate an improved deep learning model for volumetric MSK +segmentation of the hip and thigh with uncertainty estimation from clinical +computed tomography (CT) images. Databases of CT images from multiple +manufacturers/scanners, disease status, and patient positioning were used. The +segmentation accuracy, and accuracy in estimating the structures volume and +density, i.e., mean HU, were evaluated. An approach for segmentation failure +detection based on predictive uncertainty was also investigated. The model has +shown an overall improvement with respect to all segmentation accuracy and +structure volume/density evaluation metrics. The predictive uncertainty yielded +large areas under the receiver operating characteristic (AUROC) curves +(AUROCs>=.95) in detecting inaccurate and failed segmentations. The high +segmentation and muscle volume/density estimation accuracy, along with the high +accuracy in failure detection based on the predictive uncertainty, exhibited +the model's reliability for analyzing individual MSK structures in large-scale +CT databases. + +
+
+ comment: 29 pages, 7+10supp figures, 8 tables +
+
+
+
+
+ + ☆ CLDA: Collaborative Learning for Enhanced Unsupervised Domain Adaptation + + +
+ Unsupervised Domain Adaptation (UDA) endeavors to bridge the gap between a +model trained on a labeled source domain and its deployment in an unlabeled +target domain. However, current high-performance models demand significant +resources, resulting in prohibitive deployment costs and highlighting the need +for small yet effective models. For UDA of lightweight models, Knowledge +Distillation (KD) in a Teacher-Student framework can be a common approach, but +we find that domain shift in UDA leads to a significant increase in non-salient +parameters in the teacher model, degrading the model's generalization ability and +transferring misleading information to the student model. Interestingly, we +observed that this phenomenon occurs considerably less in the student model. +Driven by this insight, we introduce Collaborative Learning, a method that +updates the teacher's non-salient parameters using the student model and at the +same time enhances the student's performance using the updated teacher model. +Experiments across various tasks and datasets show consistent performance +improvements for both student and teacher models. For example, in semantic +segmentation, CLDA achieves an improvement of +0.7% mIoU for teacher and +1.4% +mIoU for student compared to the baseline model on the GTA to Cityscapes benchmark. On +Synthia to Cityscapes, it achieves an improvement of +0.8% mIoU for teacher +and +2.0% mIoU for student. + 
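+ A minimal sketch of the collaborative update described above, under the assumption that "non-salient" teacher parameters are identified by small weight magnitude and simply overwritten with the student's values; the actual saliency criterion and update rule used by CLDA are assumptions here.
import torch

@torch.no_grad()
def update_non_salient_teacher(teacher, student, fraction=0.1):
    # Replace the `fraction` of lowest-magnitude teacher weights per tensor
    # with the corresponding student weights (saliency proxy: |weight|).
    for p_t, p_s in zip(teacher.parameters(), student.parameters()):
        k = max(1, int(fraction * p_t.numel()))
        threshold = p_t.abs().flatten().kthvalue(k).values
        mask = p_t.abs() <= threshold
        p_t[mask] = p_s[mask]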
+
+
+
+
+ + ☆ Rethinking HTG Evaluation: Bridging Generation and Recognition + + +
+ The evaluation of generative models for natural image tasks has been +extensively studied. Similar protocols and metrics are used in cases with +unique particularities, such as Handwriting Generation, even if they might not +be completely appropriate. In this work, we introduce three measures tailored +for HTG evaluation, $ \text{HTG}_{\text{HTR}} $, $ \text{HTG}_{\text{style}} $, +and $ \text{HTG}_{\text{OOV}} $, and argue that they are more expedient to +evaluate the quality of generated handwritten images. The metrics rely on the +recognition error/accuracy of Handwriting Text Recognition and Writer +Identification models and emphasize writing style, textual content, and +diversity as the main aspects that adhere to the content of handwritten images. +We conduct comprehensive experiments on the IAM handwriting database, +showcasing that widely used metrics such as FID fail to properly quantify the +diversity and the practical utility of generated handwriting samples. Our +findings show that our metrics are richer in information and underscore the +necessity of standardized evaluation protocols in HTG. The proposed metrics +provide a more robust and informative protocol for assessing HTG quality, +contributing to improved performance in HTR. Code for the evaluation protocol +is available at: https://github.com/koninik/HTG_evaluation. + +
+
+
+
+
+ + ☆ Improved Single Camera BEV Perception Using Multi-Camera Training SC 2024 + + +
+ Bird's Eye View (BEV) map prediction is essential for downstream autonomous +driving tasks like trajectory prediction. In the past, this was accomplished +through the use of a sophisticated sensor configuration that captured a +surround view from multiple cameras. However, in large-scale production, cost +efficiency is an optimization goal, so that using fewer cameras becomes more +relevant. But using fewer input images comes at the cost of a +performance drop. This raises the problem of developing a BEV perception model +that provides sufficient performance on a low-cost sensor setup. Although +primarily relevant for inference on production cars, this cost restriction +is less problematic on a test vehicle during training. Therefore, the objective +of our approach is to reduce the aforementioned performance drop as much as +possible using a modern multi-camera surround view model reduced for +single-camera inference. The approach includes three features: a modern masking +technique, a cyclic Learning Rate (LR) schedule, and a feature reconstruction +loss for supervising the transition from six-camera inputs to one-camera input +during training. Our method outperforms versions trained strictly with one +camera or strictly with six-camera surround view for single-camera inference, +resulting in reduced hallucination and better quality of the BEV map. + 
+
+ comment: This Paper has been accepted to the 27th IEEE International + Conference on Intelligent Transportation Systems (ITSC 2024) +
+
+
+
+
+ + ☆ Multi-Head Attention Residual Unfolded Network for Model-Based + Pansharpening + + +
+ The objective of pansharpening and hypersharpening is to accurately combine a +high-resolution panchromatic (PAN) image with a low-resolution multispectral +(MS) or hyperspectral (HS) image, respectively. Unfolding fusion methods +integrate the powerful representation capabilities of deep learning with the +robustness of model-based approaches. These techniques involve unrolling the +steps of the optimization scheme derived from the minimization of an energy +into a deep learning framework, resulting in efficient and highly interpretable +architectures. In this paper, we propose a model-based deep unfolded method for +satellite image fusion. Our approach is based on a variational formulation that +incorporates the classic observation model for MS/HS data, a high-frequency +injection constraint based on the PAN image, and an arbitrary convex prior. For +the unfolding stage, we introduce upsampling and downsampling layers that use +geometric information encoded in the PAN image through residual networks. The +backbone of our method is a multi-head attention residual network (MARNet), +which replaces the proximity operator in the optimization scheme and combines +multiple head attentions with residual learning to exploit image +self-similarities via nonlocal operators defined in terms of patches. +Additionally, we incorporate a post-processing module based on the MARNet +architecture to further enhance the quality of the fused images. Experimental +results on PRISMA, Quickbird, and WorldView2 datasets demonstrate the superior +performance of our method and its ability to generalize across different sensor +configurations and varying spatial and spectral resolutions. The source code +will be available at https://github.com/TAMI-UIB/MARNet. + +
+
+
+
+
+ + ☆ Standing on the Shoulders of Giants: Reprogramming Visual-Language Model + for General Deepfake Detection + + +
+ The proliferation of deepfake faces poses huge potential negative impacts on +our daily lives. Despite substantial advancements in deepfake detection over +these years, the generalizability of existing methods against forgeries from +unseen datasets or created by emerging generative models remains constrained. +In this paper, inspired by the zero-shot advantages of Vision-Language Models +(VLMs), we propose a novel approach that repurposes a well-trained VLM for +general deepfake detection. Motivated by the model reprogramming paradigm that +manipulates the model prediction via data perturbations, our method can +reprogram a pretrained VLM (e.g., CLIP) solely based on manipulating its +input without tuning the inner parameters. Furthermore, we insert a pseudo-word +guided by facial identity into the text prompt. Extensive experiments on +several popular benchmarks demonstrate that (1) the cross-dataset and +cross-manipulation performances of deepfake detection can be significantly and +consistently improved (e.g., over 88% AUC in the cross-dataset setting from FF++ to +WildDeepfake) using a pre-trained CLIP model with our proposed reprogramming +method; (2) our superior performance comes at a lower cost in trainable parameters, +making it a promising approach for real-world applications. + 
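+ A minimal sketch of the input-level reprogramming idea described above: a single learnable, bounded perturbation is added to every input image while the pre-trained VLM stays frozen, and the prediction comes from comparing image embeddings with text-prompt embeddings. The perturbation form, bound, and prompt wording are assumptions, and the facial-identity pseudo-word is not shown.
import torch
import torch.nn as nn

class InputReprogrammer(nn.Module):
    # Learnable additive pattern shared across all inputs; the frozen VLM is untouched.
    def __init__(self, image_size: int = 224, eps: float = 0.05):
        super().__init__()
        self.delta = nn.Parameter(torch.zeros(1, 3, image_size, image_size))
        self.eps = eps

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        # Bounded perturbation keeps the reprogrammed image close to the original.
        return images + self.eps * torch.tanh(self.delta)

# Training would optimize only the reprogrammer's delta, e.g. with a cross-entropy
# loss over similarities between CLIP image embeddings of reprogrammed inputs and
# CLIP text embeddings of prompts such as "a photo of a real face" / "a fake face".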
+
+
+
+
+ + ☆ PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for + One-Shot Talking Head Generation + + +
+ While previous audio-driven talking head generation (THG) methods generate +head poses from driving audio, the generated poses or lips cannot match the +audio well or are not editable. In this study, we propose \textbf{PoseTalk}, a +THG system that can freely generate lip-synchronized talking head videos with +free head poses conditioned on text prompts and audio. The core insight of our +method is using head pose to connect visual, linguistic, and audio signals. +First, we propose to generate poses from both audio and text prompts, where the +audio offers short-term variations and rhythm correspondence of the head +movements and the text prompts describe the long-term semantics of head +motions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to +generate motion latent from text prompts and audio cues in a pose latent space. +Second, we observe a loss-imbalance problem: the loss for the lip region +contributes less than 4\% of the total reconstruction loss caused by both pose +and lip, making optimization lean towards head movements rather than lip +shapes. To address this issue, we propose a refinement-based learning strategy +to synthesize natural talking videos using two cascaded networks, i.e., +CoarseNet, and RefineNet. The CoarseNet estimates coarse motions to produce +animated images in novel poses and the RefineNet focuses on learning finer lip +motions by progressively estimating lip motions from low-to-high resolutions, +yielding improved lip-synchronization performance. Experiments demonstrate our +pose prediction strategy achieves better pose diversity and realness compared +to text-only or audio-only, and our video generator model outperforms +state-of-the-art methods in synthesizing talking videos with natural head +motions. Project: https://junleen.github.io/projects/posetalk. + +
+
+ comment: 7+5 pages, 15 figures +
+
+
+
+
+ + ☆ Skip-and-Play: Depth-Driven Pose-Preserved Image Generation for Any + Objects + + +
+ The emergence of diffusion models has enabled the generation of diverse +high-quality images solely from text, prompting subsequent efforts to enhance +the controllability of these models. Despite the improvement in +controllability, pose control remains limited to specific objects (e.g., +humans) or poses (e.g., frontal view) due to the fact that pose is generally +controlled via camera parameters (e.g., rotation angle) or keypoints (e.g., +eyes, nose). Specifically, camera parameters-conditional pose control models +generate unrealistic images depending on the object, owing to the small size of +3D datasets for training. Also, keypoint-based approaches encounter challenges +in acquiring reliable keypoints for various objects (e.g., church) or poses +(e.g., back view). To address these limitations, we propose depth-based pose +control, as depth maps are easily obtainable from a single depth estimation +model regardless of objects and poses, unlike camera parameters and keypoints. +However, depth-based pose control confronts issues of shape dependency, as +depth maps influence not only the pose but also the shape of the generated +images. To tackle this issue, we propose Skip-and-Play (SnP), designed via +analysis of the impact of three components of depth-conditional ControlNet on +the pose and the shape of the generated images. To be specific, based on the +analysis, we selectively skip parts of the components to mitigate shape +dependency on the depth map while preserving the pose. Through various +experiments, we demonstrate the superiority of SnP over baselines and showcase +the ability of SnP to generate images of diverse objects and poses. Remarkably, +SnP exhibits the ability to generate images even when the objects in the +condition (e.g., a horse) and the prompt (e.g., a hedgehog) differ from each +other. + +
+
+
+
+
+ + ☆ Creating a Microstructure Latent Space with Rich Material Information + for Multiphase Alloy Design + + +
+ The intricate microstructure serves as the cornerstone for the +composition/processing-structure-property (CPSP) connection in multiphase +alloys. Traditional alloy design methods often overlook microstructural +details, which diminishes the reliability and effectiveness of the outcomes. +This study introduces an improved alloy design algorithm that integrates +authentic microstructural information to establish precise CPSP relationships. +The approach utilizes a deep-learning framework based on a variational +autoencoder to map real microstructural data to a latent space, enabling the +prediction of composition, processing steps, and material properties from the +latent space vector. By integrating this deep learning model with a specific +sampling strategy in the latent space, a novel, microstructure-centered +algorithm for multiphase alloy design is developed. This algorithm is +demonstrated through the design of a unified dual-phase steel, and the results +are assessed at three performance levels. Moreover, an exploration into the +latent vector space of the model highlights its seamless interpolation ability +and its rich material information content. Notably, the current configuration +of the latent space is particularly advantageous for alloy design, offering an +exhaustive representation of microstructure, composition, processing, and +property variations essential for multiphase alloys. + +
+
+
+
+
+ + ☆ Learning-Based Error Detection System for Advanced Vehicle Instrument + Cluster Rendering + + +
+ The automotive industry is currently expanding digital display options with +every new model that comes onto the market. This entails not just an expansion +in dimensions, resolution, and customization choices, but also the capability +to employ novel display effects like overlays while assembling the content of +the display cluster. Unfortunately, this raises the need for appropriate +monitoring systems that can detect rendering errors and apply appropriate +countermeasures when required. Classical solutions such as Cyclic Redundancy +Checks (CRC) will soon no longer be viable, as any sort of alpha blending, +warping, or scaling of content can cause unwanted CRC violations. Therefore, we +propose a novel monitoring approach to verify correctness of displayed content +using telltales (e.g. warning signs) as an example. It uses a learning-based +approach to separate "good" telltales, i.e. those that a human driver will +understand correctly, and "corrupted" telltales, i.e. those that will not be +visible or perceived correctly. As a result, it possesses inherent resilience +against individual pixel errors and implicitly supports changing backgrounds, +overlay or scaling effects. This is underlined by our experimental study where +all "corrupted" test patterns were correctly classified, while no false alarms +were triggered. + 
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ MADiff: Motion-Aware Mamba Diffusion Models for Hand Trajectory + Prediction on Egocentric Videos + + +
+ Understanding human intentions and actions through egocentric videos is +important on the path to embodied artificial intelligence. As a branch of +egocentric vision techniques, hand trajectory prediction plays a vital role in +comprehending human motion patterns, benefiting downstream tasks in extended +reality and robot manipulation. However, capturing high-level human intentions +consistent with reasonable temporal causality is challenging when only +egocentric videos are available. This difficulty is exacerbated under camera +egomotion interference and the absence of affordance labels to explicitly guide +the optimization of hand waypoint distribution. In this work, we propose a +novel hand trajectory prediction method dubbed MADiff, which forecasts future +hand waypoints with diffusion models. The devised denoising operation in the +latent space is achieved by our proposed motion-aware Mamba, where the camera +wearer's egomotion is integrated to achieve motion-driven selective scan +(MDSS). To discern the relationship between hands and scenarios without +explicit affordance supervision, we leverage a foundation model that fuses +visual and language features to capture high-level semantics from video clips. +Comprehensive experiments conducted on five public datasets with the existing +and our proposed new evaluation metrics demonstrate that MADiff predicts +comparably reasonable hand trajectories compared to the state-of-the-art +baselines, and achieves real-time performance. We will release our code and +pretrained models of MADiff at the project page: +https://irmvlab.github.io/madiff.github.io. + +
+
+
+
+
+ + ☆ Loopy: Taming Audio-Driven Portrait Avatar with Long-Term Motion + Dependency + + +
+ With the introduction of diffusion-based video generation techniques, +audio-conditioned human video generation has recently achieved significant +breakthroughs in both the naturalness of motion and the synthesis of portrait +details. Due to the limited control of audio signals in driving human motion, +existing methods often add auxiliary spatial signals to stabilize movements, +which may compromise the naturalness and freedom of motion. In this paper, we +propose an end-to-end audio-only conditioned video diffusion model named Loopy. +Specifically, we designed an inter- and intra-clip temporal module and an +audio-to-latents module, enabling the model to leverage long-term motion +information from the data to learn natural motion patterns and improving +audio-portrait movement correlation. This method removes the need for manually +specified spatial motion templates used in existing methods to constrain motion +during inference. Extensive experiments show that Loopy outperforms recent +audio-driven portrait diffusion models, delivering more lifelike and +high-quality results across various scenarios. + +
+
+
+
+
+ + ☆ AdvSecureNet: A Python Toolkit for Adversarial Machine Learning + + +
+ Machine learning models are vulnerable to adversarial attacks. Several tools +have been developed to research these vulnerabilities, but they often lack +comprehensive features and flexibility. We introduce AdvSecureNet, a PyTorch-based +toolkit for adversarial machine learning that is the first to natively +support multi-GPU setups for attacks, defenses, and evaluation. It is the first +toolkit that supports both CLI and API interfaces and external YAML +configuration files to enhance versatility and reproducibility. The toolkit +includes multiple attacks, defenses and evaluation metrics. Rigorous software +engineering practices are followed to ensure high code quality and +maintainability. The project is available as an open-source project on GitHub +at https://github.com/melihcatal/advsecurenet and installable via PyPI. + 
+
+
+
+
+ + ☆ GoT-CQA: Graph-of-Thought Guided Compositional Reasoning for Chart + Question Answering + + +
+ Chart Question Answering (CQA) aims at answering questions based on the +visual chart content, which plays an important role in chart summarization, +business data analysis, and data report generation. CQA is a challenging +multi-modal task because of the strong context dependence and complex reasoning +requirement. The former refers to answering this question strictly based on the +analysis of the visual content or internal data of the given chart, while the +latter emphasizes the various logical and numerical reasoning involved in the +answer prediction process. In this paper, we focus on the complex +reasoning in the CQA task, and propose a novel Graph-of-Thought (GoT) guided +compositional reasoning model called GoT-CQA to overcome this problem. At +first, we transform the chart-oriented question into a directed acyclic GoT +composed of multiple operator nodes, including localization, numerical, and +logical operators. It intuitively reflects the human brain's solution process to +this question. After that, we design an efficient auto-compositional reasoning +framework guided by the GoT, to execute the multi-step reasoning operations in +various types of questions. Comprehensive experiments on ChartQA and PlotQA-D +datasets show that GoT-CQA achieves outstanding performance, especially in +complex human-written and reasoning questions, compared with the latest +popular baselines. + 
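+ To make the graph-of-thought idea above concrete, a small sketch of how such a DAG of operator nodes could be represented; the operator names, arguments, and example question are purely illustrative, not the paper's schema.
from dataclasses import dataclass, field

@dataclass
class GoTNode:
    op: str                                       # "localize", "numerical" or "logical" (illustrative names)
    args: dict = field(default_factory=dict)
    parents: list = field(default_factory=list)   # indices of predecessor nodes in the DAG

# Hypothetical GoT for "How much larger is the 2020 bar than the 2019 bar?"
got = [
    GoTNode(op="localize", args={"target": "bar labeled 2019"}),
    GoTNode(op="localize", args={"target": "bar labeled 2020"}),
    GoTNode(op="numerical", args={"fn": "subtract"}, parents=[1, 0]),
]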
+
+
+
+
+ + ☆ A Medical Multimodal Large Language Model for Pediatric Pneumonia + + +
+ Pediatric pneumonia is the leading cause of death among children under five +years worldwide, imposing a substantial burden on affected families. Currently, +there are three significant hurdles in diagnosing and treating pediatric +pneumonia. Firstly, pediatric pneumonia shares similar symptoms with other +respiratory diseases, making rapid and accurate differential diagnosis +challenging. Secondly, primary hospitals often lack sufficient medical +resources and experienced doctors. Lastly, providing personalized diagnostic +reports and treatment recommendations is labor-intensive and time-consuming. To +tackle these challenges, we proposed a Medical Multimodal Large Language Model +for Pediatric Pneumonia (P2Med-MLLM). It was capable of handling diverse +clinical tasks, such as generating free-text radiology reports and medical +records within a unified framework. Specifically, P2Med-MLLM can process both +pure text and image-text data, trained on an extensive and large-scale dataset +(P2Med-MD), including real clinical information from 163,999 outpatient and +8,684 inpatient cases. This dataset comprised 2D chest X-ray images, 3D chest +CT images, corresponding radiology reports, and outpatient and inpatient +records. We designed a three-stage training strategy to enable P2Med-MLLM to +comprehend medical knowledge and follow instructions for various clinical +tasks. To rigorously evaluate P2Med-MLLM's performance, we developed +P2Med-MBench, a benchmark consisting of 642 meticulously verified samples by +pediatric pulmonology specialists, covering six clinical decision-support tasks +and a balanced variety of diseases. The automated scoring results demonstrated +the superiority of P2Med-MLLM. This work plays a crucial role in assisting +primary care doctors with prompt disease diagnosis and treatment planning, +reducing severe symptom mortality rates, and optimizing the allocation of +medical resources. + +
+
+ comment: 18 pages, 10 figures +
+
+
+
+
+ + ☆ A Fashion Item Recommendation Model in Hyperbolic Space CVPR 2024 + + +
+ In this work, we propose a fashion item recommendation model that +incorporates hyperbolic geometry into user and item representations. Using +hyperbolic space, our model aims to capture implicit hierarchies among items +based on their visual data and users' purchase history. During training, we +apply a multi-task learning framework that considers both hyperbolic and +Euclidean distances in the loss function. Our experiments on three data sets +show that our model performs better than previous models trained in Euclidean +space only, confirming the effectiveness of our model. Our ablation studies +show that multi-task learning plays a key role, and removing the Euclidean loss +substantially deteriorates the model performance. + +
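+ For concreteness, a sketch of the distance in the Poincare-ball model of hyperbolic space, the kind of distance such a model can use to score user-item pairs; the abstract does not specify which hyperbolic model is used, and the multi-task weighting against the Euclidean term is an assumption.
import torch

def poincare_distance(u: torch.Tensor, v: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # u, v: (..., d) points inside the unit ball.
    sq_diff = torch.sum((u - v) ** 2, dim=-1)
    nu = torch.clamp(torch.sum(u ** 2, dim=-1), max=1 - eps)
    nv = torch.clamp(torch.sum(v ** 2, dim=-1), max=1 - eps)
    return torch.acosh(1 + 2 * sq_diff / ((1 - nu) * (1 - nv)))

# A multi-task objective could then combine hyperbolic and Euclidean terms, e.g.
# loss = loss_hyperbolic + lam * loss_euclidean   (the weight lam is hypothetical)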
+
+ comment: This work was presented at the CVFAD Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ SurgTrack: CAD-Free 3D Tracking of Real-world Surgical Instruments + + +
+ Vision-based surgical navigation has received increasing attention due to its +non-invasive, cost-effective, and flexible advantages. In particular, a +critical element of the vision-based navigation system is tracking surgical +instruments. Compared with 2D instrument tracking methods, 3D instrument +tracking has broader value in clinical practice, but is also more challenging +due to weak texture, occlusion, and lack of Computer-Aided Design (CAD) models +for 3D registration. To solve these challenges, we propose SurgTrack, a +two-stage 3D instrument tracking method for CAD-free and robust real-world +applications. In the first registration stage, we incorporate an Instrument +Signed Distance Field (SDF) modeling the 3D representation of instruments, +achieving CAD-free 3D registration. As a result, we can obtain the location +and orientation of instruments in the 3D space by matching the video stream +with the registered SDF model. In the second tracking stage, we devise a +posture graph optimization module, leveraging the historical tracking results +of the posture memory pool to optimize the tracking results and improve the +occlusion robustness. Furthermore, we collect the Instrument3D dataset to +comprehensively evaluate the 3D tracking of surgical instruments. The extensive +experiments validate the superiority and scalability of our SurgTrack, +outperforming the state of the art by a remarkable margin. The code and +dataset are available at https://github.com/wenwucode/SurgTrack. + 
+
+
+
+
+ + ☆ BMI Prediction from Handwritten English Characters Using a Convolutional + Neural Network + + +
+ A person's Body Mass Index, or BMI, is the most widely used parameter for +assessing their health. BMI is a crucial predictor of potential diseases that +may arise at higher body fat levels because it is correlated with body fat. +Conversely, a community's or an individual's nutritional status can be +determined using the BMI. Although deep learning models are used in several +studies to estimate BMI from face photos and other data, no previous research +established a clear connection between deep learning techniques for handwriting +analysis and BMI prediction. This article addresses this research gap with a +deep learning approach to estimating BMI from handwritten characters by +developing a convolutional neural network (CNN). A dataset containing samples +from 48 people in lowercase English scripts is successfully captured for the +BMI prediction task. The proposed CNN-based approach reports a commendable +accuracy of 99.92%. Performance comparison with other popular CNN architectures +reveals that AlexNet and InceptionV3 achieve the second and third-best +performance, with the accuracy of 99.69% and 99.53%, respectively. + +
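+ A minimal sketch of the kind of CNN classifier the abstract describes for handwritten-character images; the layer sizes, input resolution, and number of BMI categories are illustrative and not the architecture used in the paper.
import torch
import torch.nn as nn

class HandwritingBMICNN(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(), nn.LazyLinear(128), nn.ReLU(), nn.Linear(128, num_classes)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.classifier(self.features(x))

# e.g. grayscale 64x64 character crops classified into 4 hypothetical BMI bands
logits = HandwritingBMICNN(num_classes=4)(torch.randn(8, 1, 64, 64))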
+
+
+
+
+ + ☆ Object Gaussian for Monocular 6D Pose Estimation from Sparse Views + + +
+ Monocular object pose estimation, as a pivotal task in computer vision and +robotics, heavily depends on accurate 2D-3D correspondences, which often demand +costly CAD models that may not be readily available. Object 3D reconstruction +methods offer an alternative, among which recent advancements in 3D Gaussian +Splatting (3DGS) afford a compelling potential. Yet its performance still +suffers and tends to overfit with fewer input views. Embracing this challenge, +we introduce SGPose, a novel framework for sparse view object pose estimation +using Gaussian-based methods. Given as few as ten views, SGPose generates a +geometric-aware representation by starting with a random cuboid initialization, +eschewing reliance on Structure-from-Motion (SfM) pipeline-derived geometry as +required by traditional 3DGS methods. SGPose removes the dependence on CAD +models by regressing dense 2D-3D correspondences between images and the +reconstructed model from sparse input and random initialization, while +geometry-consistent depth supervision and online synthetic view warping are +key to its success. Experiments on typical benchmarks, especially on the +Occlusion LM-O dataset, demonstrate that SGPose outperforms existing methods +even under sparse view constraints, underscoring its potential in real-world +applications. + 
+
+
+
+
+ + ☆ Solving Video Inverse Problems Using Image Diffusion Models + + +
+ Recently, diffusion model-based inverse problem solvers (DIS) have emerged as +state-of-the-art approaches for addressing inverse problems, including image +super-resolution, deblurring, inpainting, etc. However, their application to +video inverse problems arising from spatio-temporal degradation remains largely +unexplored due to the challenges in training video diffusion models. To address +this issue, here we introduce an innovative video inverse solver that leverages +only image diffusion models. Specifically, by drawing inspiration from the +success of the recent decomposed diffusion sampler (DDS), our method treats the +time dimension of a video as the batch dimension of image diffusion models and +solves spatio-temporal optimization problems within denoised spatio-temporal +batches derived from each image diffusion model. Moreover, we introduce a +batch-consistent diffusion sampling strategy that encourages consistency across +batches by synchronizing the stochastic noise components in image diffusion +models. Our approach synergistically combines batch-consistent sampling with +simultaneous optimization of denoised spatio-temporal batches at each reverse +diffusion step, resulting in a novel and efficient diffusion sampling strategy +for video inverse problems. Experimental results demonstrate that our method +effectively addresses various spatio-temporal degradations in video inverse +problems, achieving state-of-the-art reconstructions. Project page: +https://solving-video-inverse.github.io/main/ + +
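+ A minimal sketch of the batch-consistent sampling idea described above, where the frames of a video are stacked along the batch dimension of an image diffusion model and the stochastic noise is shared across that batch; the alternation with the spatio-temporal optimization step is not shown, and the details are assumptions.
import torch

def batch_consistent_noise(num_frames: int, shape: tuple, device: str = "cpu") -> torch.Tensor:
    # One noise realization, replicated for every frame in the "batch" (= time) dimension,
    # so the stochastic component of each reverse-diffusion step is synchronized across frames.
    noise = torch.randn(1, *shape, device=device)
    return noise.expand(num_frames, *shape).clone()

eps = batch_consistent_noise(16, (3, 256, 256))  # 16 frames share the same noise sample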
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ☆ Evaluation Study on SAM 2 for Class-agnostic Instance-level Segmentation + + +
+ Segment Anything Model (SAM) has demonstrated powerful zero-shot segmentation +performance in natural scenes. The recently released Segment Anything Model 2 +(SAM2) has further heightened researchers' expectations towards image +segmentation capabilities. To evaluate the performance of SAM2 on +class-agnostic instance-level segmentation tasks, we adopt different prompt +strategies for SAM2 to cope with instance-level tasks for three relevant +scenarios: Salient Instance Segmentation (SIS), Camouflaged Instance +Segmentation (CIS), and Shadow Instance Detection (SID). In addition, to +further explore the effectiveness of SAM2 in segmenting granular object +structures, we also conduct detailed tests on the high-resolution Dichotomous +Image Segmentation (DIS) benchmark to assess the fine-grained segmentation +capability. Qualitative and quantitative experimental results indicate that the +performance of SAM2 varies significantly across different scenarios. Besides, +SAM2 is not particularly sensitive to segmenting high-resolution fine details. +We hope this technique report can drive the emergence of SAM2-based adapters, +aiming to enhance the performance ceiling of large vision models on +class-agnostic instance segmentation tasks. + +
+
+
+
+
+ + ☆ How Do You Perceive My Face? Recognizing Facial Expressions in + Multi-Modal Context by Modeling Mental Representations + + +
+ Facial expression perception in humans inherently relies on prior knowledge +and contextual cues, contributing to efficient and flexible processing. For +instance, multi-modal emotional context (such as voice color, affective text, +body pose, etc.) can prompt people to perceive emotional expressions in +objectively neutral faces. Drawing inspiration from this, we introduce a novel +approach for facial expression classification that goes beyond simple +classification tasks. Our model accurately classifies a perceived face and +synthesizes the corresponding mental representation perceived by a human when +observing a face in context. With this, our model offers visual insights into +its internal decision-making process. We achieve this by learning two +independent representations of content and context using a VAE-GAN +architecture. Subsequently, we propose a novel attention mechanism for +context-dependent feature adaptation. The adapted representation is used for +classification and to generate a context-augmented expression. We evaluate +synthesized expressions in a human study, showing that our model effectively +produces approximations of human mental representations. We achieve +State-of-the-Art classification accuracies of 81.01% on the RAVDESS dataset and +79.34% on the MEAD dataset. We make our code publicly available. + +
+
+ comment: GCPR 2024 +
+
+
+
+
+ + ☆ Interacting Multiple Model-based Joint Homography Matrix and Multiple + Object State Estimation + + +
+ A novel MOT algorithm, IMM Joint Homography State Estimation (IMM-JHSE), is +proposed. By jointly modelling the camera projection matrix as part of track +state vectors, IMM-JHSE removes the explicit influence of camera motion +compensation techniques on predicted track position states, which was prevalent +in previous approaches. Expanding upon this, static and dynamic camera motion +models are combined through the use of an IMM filter. A simple bounding box +motion model is used to predict bounding box positions to incorporate image +plane information. In addition to applying an IMM to camera motion, a +non-standard IMM approach is applied where bounding-box-based BIoU scores are +mixed with ground-plane-based Mahalanobis distances in an IMM-like fashion to +perform association only. Finally, IMM-JHSE makes use of dynamic process and +measurement noise estimation techniques. IMM-JHSE improves upon related +techniques on the DanceTrack and KITTI-car datasets, increasing HOTA by 2.64 +and 2.11, respectively, while offering competitive performance on the MOT17, +MOT20 and KITTI-pedestrian datasets. + +
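+
+ For readers unfamiliar with the IMM machinery the abstract builds on, the
+ sketch below shows the standard interaction (mixing) step of an Interacting
+ Multiple Model filter for a small set of motion models. The setup and all
+ names are assumptions for illustration; the paper's joint homography state
+ vectors and its BIoU/Mahalanobis association mixing are not reproduced.
+
+import numpy as np
+
+def imm_mix(mu, P_trans, states, covs):
+    # mu:      (M,) current model probabilities.
+    # P_trans: (M, M) Markov transition matrix, P_trans[i, j] = p(model j | model i).
+    # states/covs: per-model state vectors and covariance matrices.
+    M = len(states)
+    c_bar = P_trans.T @ mu                              # predicted model probabilities
+    mix_w = (P_trans * mu[:, None]) / c_bar[None, :]    # mixing weights w[i, j]
+    mixed_states, mixed_covs = [], []
+    for j in range(M):
+        x0 = sum(mix_w[i, j] * states[i] for i in range(M))
+        P0 = sum(mix_w[i, j] * (covs[i] + np.outer(states[i] - x0, states[i] - x0))
+                 for i in range(M))
+        mixed_states.append(x0)
+        mixed_covs.append(P0)
+    return c_bar, mixed_states, mixed_covs
+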
+
+ comment: Preprint submitted to Information Fusion +
+
+
+
+
+ + ☆ Low-Resolution Object Recognition with Cross-Resolution Relational + Contrastive Distillation + + +
+ Recognizing objects in low-resolution images is a challenging task due to the +lack of informative details. Recent studies have shown that knowledge +distillation approaches can effectively transfer knowledge from a +high-resolution teacher model to a low-resolution student model by aligning +cross-resolution representations. However, these approaches still face +limitations in adapting to the situation where the recognized objects exhibit +significant representation discrepancies between training and testing images. +In this study, we propose a cross-resolution relational contrastive +distillation approach to facilitate low-resolution object recognition. Our +approach enables the student model to mimic the behavior of a well-trained +teacher model which delivers high accuracy in identifying high-resolution +objects. To extract sufficient knowledge, the student learning is supervised +with contrastive relational distillation loss, which preserves the similarities +in various relational structures in contrastive representation space. In this +manner, the capability of recovering missing details of familiar low-resolution +objects can be effectively enhanced, leading to a better knowledge transfer. +Extensive experiments on low-resolution object classification and +low-resolution face recognition clearly demonstrate the effectiveness and +adaptability of our approach. + +
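+
+ A toy version of relational distillation may help clarify the idea of
+ preserving similarities between samples rather than matching features
+ directly. The loss below aligns the row-wise similarity distributions of
+ student (low-resolution) and teacher (high-resolution) embeddings; the
+ temperature, the KL formulation, and the function name are assumptions, not
+ the paper's exact objective.
+
+import torch
+import torch.nn.functional as F
+
+def relational_distillation_loss(student_feat, teacher_feat, tau=0.1):
+    # student_feat: (N, D) embeddings of low-resolution inputs.
+    # teacher_feat: (N, D) embeddings of the matching high-resolution inputs.
+    s = F.normalize(student_feat, dim=1)
+    t = F.normalize(teacher_feat, dim=1)
+    rel_s = (s @ s.t()) / tau          # pairwise relations in the student space
+    rel_t = (t @ t.t()) / tau          # pairwise relations in the teacher space
+    # Match the row-wise relation distributions (diagonal kept for simplicity).
+    return F.kl_div(F.log_softmax(rel_s, dim=1),
+                    F.softmax(rel_t, dim=1), reduction="batchmean")
+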
+
+ comment: This paper is accepted by IEEE Transactions on Circuits and Systems + for Video Technology (TCSVT) +
+
+
+
+
+ + ☆ Real-Time Dynamic Scale-Aware Fusion Detection Network: Take Road Damage + Detection as an example + + +
+ Unmanned Aerial Vehicle (UAV)-based Road Damage Detection (RDD) is important
+for daily maintenance and safety in cities, especially in terms of
+significantly reducing labor costs. However, current UAV-based RDD research
+still faces many challenges. For example, damage with irregular size and
+direction, the masking of damage by the background, and the difficulty of
+distinguishing damage from the background significantly affect the ability of
+UAVs to detect road damage during daily inspection. To solve these problems and
+improve the performance of UAVs in real-time road damage detection, we design
+and propose three corresponding modules: a feature extraction module that
+flexibly adapts to shape and background; a module that fuses multiscale
+perception and adapts to shape and background; and an efficient downsampling
+module. Based on these modules, we designed a multi-scale, adaptive road damage
+detection model with the ability to automatically remove background
+interference, called Dynamic Scale-Aware Fusion Detection Model (RT-DSAFDet).
+Experimental results on the UAV-PDD2023 public dataset show that our model
+RT-DSAFDet achieves a mAP50 of 54.2%, which is 11.1% higher than that of
+YOLOv10-m, an efficient variant of the latest real-time object detection model
+YOLOv10, while the number of parameters is reduced to 1.8M and FLOPs to 4.6G,
+decreases of 88% and 93%, respectively. Furthermore, results on the large
+general-purpose object detection dataset MS COCO2017 also show the superiority
+of our model: mAP50-95 matches YOLOv9-t, with 0.5% higher mAP50, 10% fewer
+parameters, and 40% fewer FLOPs.
+
+
+
+
+
+
+ + ☆ UniTT-Stereo: Unified Training of Transformer for Enhanced Stereo + Matching + + +
+ Unlike other vision tasks where Transformer-based approaches are becoming +increasingly common, stereo depth estimation is still dominated by +convolution-based approaches. This is mainly due to the limited availability of +real-world ground truth for stereo matching, which is a limiting factor in +improving the performance of Transformer-based stereo approaches. In this +paper, we propose UniTT-Stereo, a method to maximize the potential of +Transformer-based stereo architectures by unifying self-supervised learning +used for pre-training with stereo matching framework based on supervised +learning. To be specific, we explore the effectiveness of reconstructing +features of masked portions in an input image and at the same time predicting +corresponding points in another image from the perspective of locality +inductive bias, which is crucial in training models with limited training data. +Moreover, to address these challenging tasks of reconstruction-and-prediction, +we present a new strategy to vary a masking ratio when training the stereo +model with stereo-tailored losses. State-of-the-art performance of UniTT-Stereo +is validated on various benchmarks such as ETH3D, KITTI 2012, and KITTI 2015 +datasets. Lastly, to investigate the advantages of the proposed approach, we +provide a frequency analysis of feature maps and the analysis of locality +inductive bias based on attention maps. + +
+
+
+
+
+ + ☆ StyleTokenizer: Defining Image Style by a Single Instance for + Controlling Diffusion Models ECCV2024 + + +
+ Despite the burst of innovative methods for controlling the diffusion +process, effectively controlling image styles in text-to-image generation +remains a challenging task. Many adapter-based methods impose image +representation conditions on the denoising process to accomplish image control. +However these conditions are not aligned with the word embedding space, leading +to interference between image and text control conditions and the potential +loss of semantic information from the text prompt. Addressing this issue +involves two key challenges. Firstly, how to inject the style representation +without compromising the effectiveness of text representation in control. +Secondly, how to obtain the accurate style representation from a single +reference image. To tackle these challenges, we introduce StyleTokenizer, a +zero-shot style control image generation method that aligns style +representation with text representation using a style tokenizer. This alignment +effectively minimizes the impact on the effectiveness of text prompts. +Furthermore, we collect a well-labeled style dataset named Style30k to train a +style feature extractor capable of accurately representing style while +excluding other content information. Experimental results demonstrate that our +method fully grasps the style characteristics of the reference image, +generating appealing images that are consistent with both the target image +style and text prompt. The code and dataset are available at +https://github.com/alipay/style-tokenizer. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+
+ ☆ Sample what you can't compress
+
+
+ For learned image representations, basic autoencoders often produce blurry +results. Reconstruction quality can be improved by incorporating additional +penalties such as adversarial (GAN) and perceptual losses. Arguably, these +approaches lack a principled interpretation. Concurrently, in generative +settings diffusion has demonstrated a remarkable ability to create crisp, high +quality results and has solid theoretical underpinnings (from variational +inference to direct study as the Fisher Divergence). Our work combines +autoencoder representation learning with diffusion and is, to our knowledge, +the first to demonstrate the efficacy of jointly learning a continuous encoder +and decoder under a diffusion-based loss. We demonstrate that this approach +yields better reconstruction quality as compared to GAN-based autoencoders +while being easier to tune. We also show that the resulting representation is +easier to model with a latent diffusion model as compared to the representation +obtained from a state-of-the-art GAN-based loss. Since our decoder is +stochastic, it can generate details not encoded in the otherwise deterministic +latent representation; we therefore name our approach "Sample what you can't +compress", or SWYCC for short. + +
+
+
+
+
+ + ☆ SG-MIM: Structured Knowledge Guided Efficient Pre-training for Dense + Prediction + + +
+ Masked Image Modeling (MIM) techniques have redefined the landscape of +computer vision, enabling pre-trained models to achieve exceptional performance +across a broad spectrum of tasks. Despite their success, the full potential of +MIM-based methods in dense prediction tasks, particularly in depth estimation, +remains untapped. Existing MIM approaches primarily rely on single-image +inputs, which makes it challenging to capture the crucial structured +information, leading to suboptimal performance in tasks requiring fine-grained +feature representation. To address these limitations, we propose SG-MIM, a +novel Structured knowledge Guided Masked Image Modeling framework designed to +enhance dense prediction tasks by utilizing structured knowledge alongside +images. SG-MIM employs a lightweight relational guidance framework, allowing it +to guide structured knowledge individually at the feature level rather than +naively combining at the pixel level within the same architecture, as is common +in traditional multi-modal pre-training methods. This approach enables the +model to efficiently capture essential information while minimizing +discrepancies between pre-training and downstream tasks. Furthermore, SG-MIM +employs a selective masking strategy to incorporate structured knowledge, +maximizing the synergy between general representation learning and structured +knowledge-specific learning. Our method requires no additional annotations, +making it a versatile and efficient solution for a wide range of applications. +Our evaluations on the KITTI, NYU-v2, and ADE20k datasets demonstrate SG-MIM's +superiority in monocular depth estimation and semantic segmentation. + +
+
+
+
+
+ + ☆ TLD: A Vehicle Tail Light signal Dataset and Benchmark + + +
+ Understanding other drivers' intentions is crucial for safe driving. The role +of taillights in conveying these intentions is underemphasized in current +autonomous driving systems. Accurately identifying taillight signals is +essential for predicting vehicle behavior and preventing collisions. +Open-source taillight datasets are scarce, often small and inconsistently +annotated. To address this gap, we introduce a new large-scale taillight +dataset called TLD. Sourced globally, our dataset covers diverse traffic +scenarios. To our knowledge, TLD is the first dataset to separately annotate +brake lights and turn signals in real driving scenarios. We collected 17.78 +hours of driving videos from the internet. This dataset consists of 152k +labeled image frames sampled at a rate of 2 Hz, along with 1.5 million +unlabeled frames interspersed throughout. Additionally, we have developed a +two-stage vehicle light detection model consisting of two primary modules: a +vehicle detector and a taillight classifier. Initially, YOLOv10 and DeepSORT +captured consecutive vehicle images over time. Subsequently, the two +classifiers work simultaneously to determine the states of the brake lights and +turn signals. A post-processing procedure is then used to eliminate noise +caused by misidentifications and provide the taillight states of the vehicle +within a given time frame. Our method shows exceptional performance on our +dataset, establishing a benchmark for vehicle taillight detection. The dataset +is available at https://huggingface.co/datasets/ChaiJohn/TLD/tree/main + +
+
+
+
+
+ + ☆ A Learnable Color Correction Matrix for RAW Reconstruction BMVC2024 + + +
+ Autonomous driving algorithms usually employ sRGB images as model input due +to their compatibility with the human visual system. However, visually pleasing +sRGB images are possibly sub-optimal for downstream tasks when compared to RAW +images. The availability of RAW images is constrained by the difficulties in +collecting real-world driving data and the associated challenges of annotation. +To address this limitation and support research in RAW-domain driving +perception, we design a novel and ultra-lightweight RAW reconstruction method. +The proposed model introduces a learnable color correction matrix (CCM), which +uses only a single convolutional layer to approximate the complex inverse image +signal processor (ISP). Experimental results demonstrate that simulated RAW +(simRAW) images generated by our method provide performance improvements +equivalent to those produced by more complex inverse ISP methods when +pretraining RAW-domain object detectors, which highlights the effectiveness and +practicality of our approach. + +
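+
+ The core component described above is compact enough to sketch directly: a
+ learnable per-pixel 3x3 color matrix implemented as a single 1x1 convolution
+ that maps an sRGB image to a simulated RAW (simRAW) image. Whether the paper
+ uses a bias term or any extra nonlinearity is not stated in the abstract, so
+ the details below are assumptions.
+
+import torch
+import torch.nn as nn
+
+class LearnableCCM(nn.Module):
+    """A learnable color-correction matrix as a 1x1 convolution (sketch)."""
+    def __init__(self):
+        super().__init__()
+        self.ccm = nn.Conv2d(3, 3, kernel_size=1, bias=False)  # 3x3 matrix per pixel
+
+    def forward(self, srgb):            # srgb: (B, 3, H, W) in [0, 1]
+        return self.ccm(srgb)           # simRAW approximation
+
+# simraw = LearnableCCM()(torch.rand(1, 3, 64, 64))
+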
+
+ comment: Accepted by BMVC2024 +
+
+
+
+
+ + ☆ Plane2Depth: Hierarchical Adaptive Plane Guidance for Monocular Depth + Estimation + + +
+ Monocular depth estimation aims to infer a dense depth map from a single
+image, which is a fundamental and prevalent task in computer vision. Many
+previous works have shown impressive depth estimation results through carefully
+designed network structures, but they usually ignore the planar information and
+therefore perform poorly in low-texture areas of indoor scenes. In this paper,
+we propose Plane2Depth, which adaptively utilizes plane information to improve
+depth prediction within a hierarchical framework. Specifically, in the proposed
+plane guided depth generator (PGDG), we design a set of plane queries as
+prototypes to softly model planes in the scene and predict per-pixel plane
+coefficients. Then the predicted plane coefficients can be converted into
+metric depth values with the pinhole camera model. In the proposed adaptive
+plane query aggregation (APGA) module, we introduce a novel feature interaction
+approach to improve the aggregation of multi-scale plane features in a top-down
+manner. Extensive experiments show that our method can achieve outstanding
+performance, especially in low-texture or repetitive areas. Furthermore, under
+the same backbone network, our method outperforms the state-of-the-art methods
+on the NYU-Depth-v2 dataset, achieves competitive results with state-of-the-art
+methods on the KITTI dataset, and can be generalized to unseen scenes
+effectively.
+
+
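+
+ The conversion from predicted plane coefficients to metric depth with a
+ pinhole camera, mentioned above, follows from substituting the back-projected
+ ray into the plane equation; a minimal sketch is given below, with the plane
+ parameterization and variable names assumed rather than taken from the paper.
+
+import numpy as np
+
+def depth_from_plane(n, d, K, u, v):
+    # Plane: n . X = d.  A point on the ray of pixel (u, v) is X = Z * K^-1 [u, v, 1]^T,
+    # so substituting gives Z = d / (n . K^-1 [u, v, 1]^T).
+    ray = np.linalg.solve(K, np.array([u, v, 1.0]))
+    return d / np.dot(n, ray)
+
+K = np.array([[500.0, 0, 320], [0, 500.0, 240], [0, 0, 1]])
+# Fronto-parallel plane 3 m away, queried at the principal point -> depth 3.0.
+print(depth_from_plane(np.array([0.0, 0.0, 1.0]), 3.0, K, 320, 240))
+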
+
+ comment: 14 pages, 12 figures, 8 tables +
+
+
+
+
+ + ☆ Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of + Data-Driven Optimization Routine + + +
+ Diffusion tensor imaging (DTI) holds significant importance in clinical
+diagnosis and neuroscience research. However, conventional model-based fitting
+methods often suffer from sensitivity to noise, leading to decreased accuracy
+in estimating DTI parameters. While traditional data-driven deep learning
+methods have shown potential in terms of accuracy and efficiency, their limited
+generalization to out-of-training-distribution data impedes their broader
+application due to the diverse scan protocols used across centers, scanners,
+and studies. This work aims to tackle these challenges and promote the use of
+DTI by introducing a data-driven optimization-based method termed DoDTI. DoDTI
+combines the weighted linear least squares fitting algorithm with the
+regularization-by-denoising technique. The former fits DW images from diverse
+acquisition settings into a diffusion tensor field, while the latter applies a
+deep learning-based denoiser to regularize the diffusion tensor field instead
+of the DW images, which frees it from the fixed-channel assignment limitation
+of the network. The optimization objective is solved using the alternating
+direction method of multipliers and then unrolled to construct a deep neural
+network, leveraging a data-driven strategy to learn network parameters.
+Extensive validation experiments are conducted utilizing both internally
+simulated datasets and externally obtained in-vivo datasets. The results,
+encompassing both qualitative and quantitative analyses, showcase that the
+proposed method attains state-of-the-art performance in DTI parameter
+estimation. Notably, it demonstrates superior generalization, accuracy, and
+efficiency, rendering it highly reliable for widespread application in the
+field.
+
+
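+
+ The weighted linear least squares (WLLS) fitting step that DoDTI builds on is
+ a classical procedure and can be sketched as follows. The signal-squared
+ weights and the clipping are common conventions rather than details taken
+ from the paper, and the unrolled ADMM network and learned denoiser are not
+ shown.
+
+import numpy as np
+
+def wlls_tensor_fit(signals, bvals, bvecs):
+    # signals: (N,) diffusion-weighted signals, bvals: (N,), bvecs: (N, 3).
+    # Model: ln S = ln S0 - b * g^T D g, written as the linear system A @ x = y
+    # with x = [ln S0, Dxx, Dyy, Dzz, Dxy, Dxz, Dyz].
+    gx, gy, gz = bvecs[:, 0], bvecs[:, 1], bvecs[:, 2]
+    A = np.column_stack([np.ones_like(bvals),
+                         -bvals * gx * gx, -bvals * gy * gy, -bvals * gz * gz,
+                         -2 * bvals * gx * gy, -2 * bvals * gx * gz,
+                         -2 * bvals * gy * gz])
+    y = np.log(np.clip(signals, 1e-6, None))
+    w = signals ** 2                     # common WLLS weighting choice
+    Aw = A * w[:, None]
+    return np.linalg.solve(A.T @ Aw, Aw.T @ y)
+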
+
+
+
+
+ + ☆ TP-GMOT: Tracking Generic Multiple Object by Textual Prompt with + Motion-Appearance Cost (MAC) SORT + + +
+ While Multi-Object Tracking (MOT) has made substantial advancements, it is
+limited by heavy reliance on prior knowledge and limited to predefined
+categories. In contrast, Generic Multiple Object Tracking (GMOT), tracking
+multiple objects with similar appearance, requires less prior information about
+the targets but faces challenges with variants like viewpoint, lighting,
+occlusion, and resolution. Our contributions commence with the introduction of
+the Refer-GMOT dataset, a collection of videos, each accompanied by
+fine-grained textual descriptions of their attributes. Subsequently, we
+introduce a novel text prompt-based open-vocabulary GMOT framework, called
+TP-GMOT, which can track never-seen object categories with zero training
+examples. Within the TP-GMOT framework, we introduce two novel components:
+(i) TP-OD, object detection by a textual prompt, for accurately detecting
+unseen objects with specific characteristics; (ii) Motion-Appearance Cost SORT
+(MAC-SORT), a novel object association approach that adeptly integrates motion-
+and appearance-based matching strategies to tackle the complex task of tracking
+multiple generic objects with high similarity. Our contributions are
+benchmarked on the Refer-GMOT dataset for the GMOT task. Additionally, to
+assess the generalizability of the proposed TP-GMOT framework and the
+effectiveness of the MAC-SORT tracker, we conduct ablation studies on the
+DanceTrack and MOT20 datasets for the MOT task. Our dataset, code, and models
+will be publicly available at: https://fsoft-aic.github.io/TP-GMOT
+
+
+
+
+
+
+ + ☆ Boosting Generalizability towards Zero-Shot Cross-Dataset Single-Image + Indoor Depth by Meta-Initialization IROS 2024 + + +
+ Indoor robots rely on depth to perform tasks like navigation or obstacle
+detection, and single-image depth estimation is widely used to assist
+perception. Most indoor single-image depth prediction work focuses less on
+model generalizability to unseen datasets, which concerns in-the-wild
+robustness for system deployment. This work leverages gradient-based
+meta-learning to gain higher generalizability on zero-shot cross-dataset
+inference. Unlike the most-studied meta-learning of image classification
+associated with explicit class labels, no explicit task boundaries exist for
+continuous depth values tied to highly varying indoor environments regarding
+object arrangement and scene composition. We propose a fine-grained task
+formulation that treats each RGB-D mini-batch as a task in our meta-learning
+setup. We first show that our method on limited data induces a much better
+prior (up to 27.8% in RMSE). Then, fine-tuning on the meta-learned
+initialization consistently outperforms baselines without the meta approach.
+Aiming at generalization, we propose zero-shot cross-dataset protocols and
+validate the higher generalizability induced by our meta-initialization, as a
+simple and useful plugin to many existing depth estimation methods. This work
+at the intersection of depth estimation and meta-learning potentially drives
+both fields a step closer to practical robotic and machine perception usage.
+
+
+
+ comment: IROS 2024. The version supersedes 2305.07269. arXiv admin note: text + overlap with arXiv:2305.07269 +
+
+
+
+
+ + ☆ TASAR: Transferable Attack on Skeletal Action Recognition + + +
+ Skeletal sequences, as well-structured representations of human behaviors, +are crucial in Human Activity Recognition (HAR). The transferability of +adversarial skeletal sequences enables attacks in real-world HAR scenarios, +such as autonomous driving, intelligent surveillance, and human-computer +interactions. However, existing Skeleton-based HAR (S-HAR) attacks exhibit weak +adversarial transferability and, therefore, cannot be considered true +transfer-based S-HAR attacks. More importantly, the reason for this failure +remains unclear. In this paper, we study this phenomenon through the lens of +loss surface, and find that its sharpness contributes to the poor +transferability in S-HAR. Inspired by this observation, we assume and +empirically validate that smoothening the rugged loss landscape could +potentially improve adversarial transferability in S-HAR. To this end, we +propose the first Transfer-based Attack on Skeletal Action Recognition, TASAR. +TASAR explores the smoothed model posterior without re-training the pre-trained +surrogates, which is achieved by a new post-train Dual Bayesian optimization +strategy. Furthermore, unlike previous transfer-based attacks that treat each +frame independently and overlook temporal coherence within sequences, TASAR +incorporates motion dynamics into the Bayesian attack gradient, effectively +disrupting the spatial-temporal coherence of S-HARs. To exhaustively evaluate +the effectiveness of existing methods and our method, we build the first +large-scale robust S-HAR benchmark, comprising 7 S-HAR models, 10 attack +methods, 3 S-HAR datasets and 2 defense models. Extensive results demonstrate +the superiority of TASAR. Our benchmark enables easy comparisons for future +studies, with the code available in the supplementary material. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.08572 +
+
+
+
+
+ + ☆ Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes + + +
+ High-quality real-time view synthesis methods are based on volume rendering, +splatting, or surface rendering. While surface-based methods generally are the +fastest, they cannot faithfully model fuzzy geometry like hair. In turn, +alpha-blending techniques excel at representing fuzzy materials but require an +unbounded number of samples per ray (P1). Further overheads are induced by +empty space skipping in volume rendering (P2) and sorting input primitives in +splatting (P3). These problems are exacerbated on low-performance graphics +hardware, e.g. on mobile devices. We present a novel representation for +real-time view synthesis where the (P1) number of sampling locations is small +and bounded, (P2) sampling locations are efficiently found via rasterization, +and (P3) rendering is sorting-free. We achieve this by representing objects as +semi-transparent multi-layer meshes, rendered in fixed layer order from +outermost to innermost. We model mesh layers as SDF shells with optimal spacing +learned during training. After baking, we fit UV textures to the corresponding +meshes. We show that our method can represent challenging fuzzy objects while +achieving higher frame rates than volume-based and splatting-based methods on +low-end and mobile devices. + +
+
+
+
+
+ + ☆ FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video + Reconstruction in Resource and Timing Constrained Network Settings + + +
+ Despite the growing adoption of video processing via Internet of Things (IoT)
+devices due to their cost-effectiveness, transmitting captured data to nearby
+servers poses challenges due to varying timing constraints and scarcity of
+network bandwidth. Existing video compression methods face difficulties in
+recovering compressed data when incomplete data is provided. Here, we introduce
+FrameCorr, a deep-learning based solution that utilizes previously received
+data to predict the missing segments of a frame, enabling the reconstruction of
+a frame from partially received data.
+
+
+
+
+
+
+ + ☆ Detecting Korean Food Using Image using Hierarchical Model + + +
+ A solution is presented for Korean food lovers with dietary restrictions to
+identify a Korean dish before consuming it. Just by uploading a clear photo of
+the dish, people can find out what they are eating. The solution combines image
+processing techniques with machine learning.
+
+
+
+
+
+
+ + ☆ Non-target Divergence Hypothesis: Toward Understanding Domain Gaps in + Cross-Modal Knowledge Distillation + + +
+ Compared to single-modal knowledge distillation, cross-modal knowledge +distillation faces more severe challenges due to domain gaps between +modalities. Although various methods have proposed various solutions to +overcome these challenges, there is still limited research on how domain gaps +affect cross-modal knowledge distillation. This paper provides an in-depth +analysis and evaluation of this issue. We first introduce the Non-Target +Divergence Hypothesis (NTDH) to reveal the impact of domain gaps on cross-modal +knowledge distillation. Our key finding is that domain gaps between modalities +lead to distribution differences in non-target classes, and the smaller these +differences, the better the performance of cross-modal knowledge distillation. +Subsequently, based on Vapnik-Chervonenkis (VC) theory, we derive the upper and +lower bounds of the approximation error for cross-modal knowledge distillation, +thereby theoretically validating the NTDH. Finally, experiments on five +cross-modal datasets further confirm the validity, generalisability, and +applicability of the NTDH. + +
+
+
+
+
+ + ☆ Training-free Color-Style Disentanglement for Constrained Text-to-Image + Synthesis + + +
+ We consider the problem of independently, in a disentangled fashion, +controlling the outputs of text-to-image diffusion models with color and style +attributes of a user-supplied reference image. We present the first +training-free, test-time-only method to disentangle and condition text-to-image +models on color and style attributes from reference image. To realize this, we +propose two key innovations. Our first contribution is to transform the latent +codes at inference time using feature transformations that make the covariance +matrix of current generation follow that of the reference image, helping +meaningfully transfer color. Next, we observe that there exists a natural +disentanglement between color and style in the LAB image space, which we +exploit to transform the self-attention feature maps of the image being +generated with respect to those of the reference computed from its L channel. +Both these operations happen purely at test time and can be done independently +or merged. This results in a flexible method where color and style information +can come from the same reference image or two different sources, and a new +generation can seamlessly fuse them in either scenario. + +
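+
+ The color-transfer half of the method rests on matching second-order
+ statistics. The sketch below applies a standard whitening/coloring transform
+ so that a feature (or latent) matrix takes on the covariance of a reference;
+ applying it to diffusion latents at inference time, and all names here, are
+ assumptions for illustration rather than the paper's exact procedure.
+
+import torch
+
+def match_covariance(x, ref, eps=1e-5):
+    # x, ref: (C, N) matrices (channels x spatial positions).
+    def stats(f):
+        mu = f.mean(dim=1, keepdim=True)
+        fc = f - mu
+        cov = fc @ fc.t() / (f.shape[1] - 1) + eps * torch.eye(f.shape[0], device=f.device)
+        return mu, cov, fc
+
+    mu_x, cov_x, xc = stats(x)
+    mu_r, cov_r, _ = stats(ref)
+    ex, vx = torch.linalg.eigh(cov_x)
+    er, vr = torch.linalg.eigh(cov_r)
+    whiten = vx @ torch.diag(ex.clamp(min=eps).rsqrt()) @ vx.t()   # remove x's covariance
+    colour = vr @ torch.diag(er.clamp(min=eps).sqrt()) @ vr.t()    # impose ref's covariance
+    return colour @ (whiten @ xc) + mu_r
+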
+
+ comment: 16 pages, 17 figures +
+
+
+
+
+ + ☆ Diffusion Models Learn Low-Dimensional Distributions via Subspace + Clustering + + +
+ Recent empirical studies have demonstrated that diffusion models can +effectively learn the image distribution and generate new samples. Remarkably, +these models can achieve this even with a small number of training samples +despite a large image dimension, circumventing the curse of dimensionality. In +this work, we provide theoretical insights into this phenomenon by leveraging +key empirical observations: (i) the low intrinsic dimensionality of image data, +(ii) a union of manifold structure of image data, and (iii) the low-rank +property of the denoising autoencoder in trained diffusion models. These +observations motivate us to assume the underlying data distribution of image +data as a mixture of low-rank Gaussians and to parameterize the denoising +autoencoder as a low-rank model according to the score function of the assumed +distribution. With these setups, we rigorously show that optimizing the +training loss of diffusion models is equivalent to solving the canonical +subspace clustering problem over the training samples. Based on this +equivalence, we further show that the minimal number of samples required to +learn the underlying distribution scales linearly with the intrinsic dimensions +under the above data and model assumptions. This insight sheds light on why +diffusion models can break the curse of dimensionality and exhibit the phase +transition in learning distributions. Moreover, we empirically establish a +correspondence between the subspaces and the semantic representations of image +data, facilitating image editing. We validate these results with corroborated +experimental results on both simulated distributions and image datasets. + +
+
+ comment: 39 pages, 9 figures +
+
+
+
+
+ + ☆ MOSMOS: Multi-organ segmentation facilitated by medical report + supervision + + +
+ Owing to a large amount of multi-modal data in modern medical systems, such +as medical images and reports, Medical Vision-Language Pre-training (Med-VLP) +has demonstrated incredible achievements in coarse-grained downstream tasks +(i.e., medical classification, retrieval, and visual question answering). +However, the problem of transferring knowledge learned from Med-VLP to +fine-grained multi-organ segmentation tasks has barely been investigated. +Multi-organ segmentation is challenging mainly due to the lack of large-scale +fully annotated datasets and the wide variation in the shape and size of the +same organ between individuals with different diseases. In this paper, we +propose a novel pre-training & fine-tuning framework for Multi-Organ +Segmentation by harnessing Medical repOrt Supervision (MOSMOS). Specifically, +we first introduce global contrastive learning to maximally align the medical +image-report pairs in the pre-training stage. To remedy the granularity +discrepancy, we further leverage multi-label recognition to implicitly learn +the semantic correspondence between image pixels and organ tags. More +importantly, our pre-trained models can be transferred to any segmentation +model by introducing the pixel-tag attention maps. Different network settings, +i.e., 2D U-Net and 3D UNETR, are utilized to validate the generalization. We +have extensively evaluated our approach using different diseases and modalities +on BTCV, AMOS, MMWHS, and BRATS datasets. Experimental results in various +settings demonstrate the effectiveness of our framework. This framework can +serve as the foundation to facilitate future research on automatic annotation +tasks under the supervision of medical reports. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Local map Construction Methods with SD map: A Novel Survey + + +
+ In recent years, significant academic advancements have been made in the +field of autonomous vehicles, with Local maps emerging as a crucial component +of autonomous driving technology. Local maps not only provide intricate details +of road networks but also serve as fundamental inputs for critical tasks such +as vehicle localization, navigation, and decision-making. Given the +characteristics of SD map (Standard Definition Map), which include low cost, +ease of acquisition, and high versatility, perception methods that integrate SD +map as prior information have demonstrated significant potential in the field +of Local map perception. The purpose of this paper is to provide researchers +with a comprehensive overview and summary of the latest advancements in the +integration of SD map as prior information for Local map perception methods. +This review begins by introducing the task definition and general pipeline of +local map perception methods that incorporate SD maps as prior information, +along with relevant public datasets. And then it focuses on the representation +and encoding methods of multi-source information, as well as the methods for +fusing multi-source information. In response to this burgeoning trend, this +article presents a comprehensive and meticulous overview of the diverse +research efforts in this particular field. Finally, the article addresses +pertinent issues and future challenges with the aim of guiding researchers in +understanding the current trends and methodologies prevalent in the field. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ Hadamard Row-Wise Generation Algorithm + + +
+ In this paper, we introduce an efficient algorithm for generating specific +Hadamard rows, addressing the memory demands of pre-computing the entire +matrix. Leveraging Sylvester's recursive construction, our method generates the +required $i$-th row on demand, significantly reducing computational resources. +The algorithm uses the Kronecker product to construct the desired row from the +binary representation of the index, without creating the full matrix. This +approach is particularly useful for single-pixel imaging systems that need only +one row at a time. + +
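+
+ The abstract's idea, generating a single Sylvester-Hadamard row directly from
+ the binary representation of its index via Kronecker products, fits in a few
+ lines; the indexing convention below (most significant bit first) is an
+ assumption of this sketch.
+
+import numpy as np
+
+def hadamard_row(i, k):
+    # Row i of the 2^k x 2^k Sylvester Hadamard matrix, built without ever
+    # forming the full matrix: Kronecker product of [1, 1] or [1, -1] per bit of i.
+    row = np.array([1])
+    for bit in range(k - 1, -1, -1):
+        factor = np.array([1, -1]) if (i >> bit) & 1 else np.array([1, 1])
+        row = np.kron(row, factor)
+    return row
+
+print(hadamard_row(3, 2))   # -> [ 1 -1 -1  1 ], matching row 3 of the 4x4 matrix
+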
+
+
+
+
+ + ☆ Neural Dynamics Model of Visual Decision-Making: Learning from Human + Experts + + +
+ Uncovering the fundamental neural correlates of biological intelligence, +developing mathematical models, and conducting computational simulations are +critical for advancing new paradigms in artificial intelligence (AI). In this +study, we implemented a comprehensive visual decision-making model that spans +from visual input to behavioral output, using a neural dynamics modeling +approach. Drawing inspiration from the key components of the dorsal visual +pathway in primates, our model not only aligns closely with human behavior but +also reflects neural activities in primates, and achieving accuracy comparable +to convolutional neural networks (CNNs). Moreover, magnetic resonance imaging +(MRI) identified key neuroimaging features such as structural connections and +functional connectivity that are associated with performance in perceptual +decision-making tasks. A neuroimaging-informed fine-tuning approach was +introduced and applied to the model, leading to performance improvements that +paralleled the behavioral variations observed among subjects. Compared to +classical deep learning models, our model more accurately replicates the +behavioral performance of biological intelligence, relying on the structural +characteristics of biological neural networks rather than extensive training +data, and demonstrating enhanced resilience to perturbation. + +
+
+
+
+
+ + ☆ Multi-modal Situated Reasoning in 3D Scenes + + +
+ Situation awareness is essential for understanding and reasoning about 3D +scenes in embodied AI agents. However, existing datasets and benchmarks for +situated understanding are limited in data modality, diversity, scale, and task +scope. To address these limitations, we propose Multi-modal Situated Question +Answering (MSQA), a large-scale multi-modal situated reasoning dataset, +scalably collected leveraging 3D scene graphs and vision-language models (VLMs) +across a diverse range of real-world 3D scenes. MSQA includes 251K situated +question-answering pairs across 9 distinct question categories, covering +complex scenarios within 3D scenes. We introduce a novel interleaved +multi-modal input setting in our benchmark to provide text, image, and point +cloud for situation and question description, resolving ambiguity in previous +single-modality convention (e.g., text). Additionally, we devise the +Multi-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models' +situated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN +highlight the limitations of existing vision-language models and underscore the +importance of handling multi-modal interleaved inputs and situation modeling. +Experiments on data scaling and cross-domain transfer further demonstrate the +efficacy of leveraging MSQA as a pre-training dataset for developing more +powerful situated reasoning models. + +
+
+ comment: Project page: https://msr3d.github.io/ +
+
+
+
+
+ + ☆ Unified Framework with Consistency across Modalities for Human Activity + Recognition BMVC 2024 + + +
+ Recognizing human activities in videos is challenging due to the
+spatio-temporal complexity and context-dependence of human interactions. Prior
+studies often rely on single input modalities, such as RGB or skeletal data,
+limiting their ability to exploit the complementary advantages across
+modalities. Recent studies focus on combining these two modalities using simple
+feature fusion techniques. However, due to the inherent disparities in
+representation between these input modalities, designing a unified neural
+network architecture to effectively leverage their complementary information
+remains a significant challenge. To address this, we propose a comprehensive
+multimodal framework for robust video-based human activity recognition. Our key
+contribution is the introduction of a novel compositional query machine, called
+COMPUTER (COMPositional hUman-cenTric quERy machine), a generic neural
+architecture that models the interactions between a human of interest and its
+surroundings in both space and time. Thanks to its versatile design, COMPUTER
+can be leveraged to distill distinctive representations for various input
+modalities. Additionally, we introduce a consistency loss that enforces
+agreement in prediction between modalities, exploiting the complementary
+information from multimodal inputs for robust human movement recognition.
+Through extensive experiments on action localization and group activity
+recognition tasks, our approach demonstrates superior performance when compared
+with state-of-the-art methods. Our code is available at:
+https://github.com/tranxuantuyen/COMPUTER.
+
+
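+
+ The consistency loss mentioned above can be illustrated with a toy symmetric
+ term that penalizes disagreement between the class distributions predicted
+ from two modalities. The KL form and the names below are assumptions, since
+ the abstract does not specify the exact formulation.
+
+import torch.nn.functional as F
+
+def consistency_loss(logits_rgb, logits_pose):
+    # Symmetric KL between the per-modality prediction distributions.
+    p = F.log_softmax(logits_rgb, dim=1)
+    q = F.log_softmax(logits_pose, dim=1)
+    return 0.5 * (F.kl_div(p, q, log_target=True, reduction="batchmean")
+                  + F.kl_div(q, p, log_target=True, reduction="batchmean"))
+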
+
+ comment: Accepted to BMVC 2024 +
+
+
+
+
+ + ☆ GGS: Generalizable Gaussian Splatting for Lane Switching in Autonomous + Driving + + +
+ We propose GGS, a Generalizable Gaussian Splatting method for Autonomous
+Driving, which can achieve realistic rendering under large viewpoint changes.
+Previous generalizable 3D Gaussian splatting methods are limited to rendering
+novel views that are very close to the original pair of images, and cannot
+handle large differences in viewpoint. Especially in autonomous driving
+scenarios, images are typically collected from a single lane. The limited
+training perspective makes rendering images of a different lane very
+challenging. To further improve the rendering capability of GGS under large
+viewpoint changes, we introduce a novel virtual lane generation module into the
+GGS method to enable high-quality lane switching even without a multi-lane
+dataset. Besides, we design a diffusion loss to supervise the generation of
+virtual lane images to further address the problem of lack of data in the
+virtual lanes. Finally, we also propose a depth refinement module to optimize
+depth estimation in the GGS model. Extensive validation of our method, compared
+to existing approaches, demonstrates state-of-the-art performance.
+
+
+
+
+
+
+ + ☆ Coral Model Generation from Single Images for Virtual Reality + Applications + + +
+ With the rapid development of VR technology, the demand for high-quality 3D
+models is increasing. Traditional methods struggle with efficiency and quality
+in large-scale customization. This paper introduces a deep-learning framework
+that generates high-precision 3D coral models from a single image. Using the
+Coral dataset, the framework extracts geometric and texture features, performs
+3D reconstruction, and optimizes design and material blending. Advanced
+optimization and polygon count control ensure shape accuracy, detail retention,
+and flexible output for various complexities, catering to high-quality
+rendering and real-time interaction needs. The project incorporates Explainable
+AI (XAI) to transform AI-generated models into interactive "artworks," best
+viewed in VR and XR. This enhances model interpretability and human-machine
+collaboration. Real-time feedback in VR interactions displays information like
+coral species and habitat, enriching user experience. The generated models
+surpass traditional methods in detail, visual quality, and efficiency. This
+research offers an intelligent approach to 3D content creation for VR, lowering
+production barriers and promoting widespread VR applications. Additionally,
+integrating XAI provides new insights into AI-generated visual content and
+advances research in 3D vision interpretability.
+
+
+
+ comment: In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts + 2024) arXiv:2406.14485 +
+
+
+
+
+ + ☆ Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable + Image Editing + + +
+ Recently, diffusion models have emerged as a powerful class of generative +models. Despite their success, there is still limited understanding of their +semantic spaces. This makes it challenging to achieve precise and disentangled +image generation without additional training, especially in an unsupervised +way. In this work, we improve the understanding of their semantic spaces from +intriguing observations: among a certain range of noise levels, (1) the learned +posterior mean predictor (PMP) in the diffusion model is locally linear, and +(2) the singular vectors of its Jacobian lie in low-dimensional semantic +subspaces. We provide a solid theoretical basis to justify the linearity and +low-rankness in the PMP. These insights allow us to propose an unsupervised, +single-step, training-free LOw-rank COntrollable image editing (LOCO Edit) +method for precise local editing in diffusion models. LOCO Edit identified +editing directions with nice properties: homogeneity, transferability, +composability, and linearity. These properties of LOCO Edit benefit greatly +from the low-dimensional semantic subspace. Our method can further be extended +to unsupervised or text-supervised editing in various text-to-image diffusion +models (T-LOCO Edit). Finally, extensive empirical experiments demonstrate the +effectiveness and efficiency of LOCO Edit. The codes will be released at +https://github.com/ChicyChen/LOCO-Edit. + +
+
+
+
+
+ + ☆ Unfolding Videos Dynamics via Taylor Expansion + + +
+ Taking inspiration from physical motion, we present a new self-supervised +dynamics learning strategy for videos: Video Time-Differentiation for Instance +Discrimination (ViDiDi). ViDiDi is a simple and data-efficient strategy, +readily applicable to existing self-supervised video representation learning +frameworks based on instance discrimination. At its core, ViDiDi observes +different aspects of a video through various orders of temporal derivatives of +its frame sequence. These derivatives, along with the original frames, support +the Taylor series expansion of the underlying continuous dynamics at discrete +times, where higher-order derivatives emphasize higher-order motion features. +ViDiDi learns a single neural network that encodes a video and its temporal +derivatives into consistent embeddings following a balanced alternating +learning algorithm. By learning consistent representations for original frames +and derivatives, the encoder is steered to emphasize motion features over +static backgrounds and uncover the hidden dynamics in original frames. Hence, +video representations are better separated by dynamic features. We integrate +ViDiDi into existing instance discrimination frameworks (VICReg, BYOL, and +SimCLR) for pretraining on UCF101 or Kinetics and test on standard benchmarks +including video retrieval, action recognition, and action detection. The +performances are enhanced by a significant margin without the need for large +models or extensive datasets. + +
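+
+ The temporal-derivative "views" at the heart of the method can be
+ approximated with simple finite differences of the frame sequence, as in the
+ sketch below; the differencing scheme and the function name are assumptions
+ for illustration rather than the paper's exact implementation.
+
+import torch
+
+def temporal_derivatives(frames, order=2):
+    # frames: (T, C, H, W). Returns [frames, d1, d2, ...]; each forward
+    # difference is one frame shorter and emphasizes higher-order motion.
+    views = [frames]
+    cur = frames
+    for _ in range(order):
+        cur = cur[1:] - cur[:-1]
+        views.append(cur)
+    return views
+
+# views = temporal_derivatives(torch.rand(16, 3, 112, 112))  # clip, velocity, acceleration
+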
+
+
+
+
+ + ☆ Pluralistic Salient Object Detection + + +
+ We introduce pluralistic salient object detection (PSOD), a novel task aimed
+at generating multiple plausible salient segmentation results for a given input
+image. Unlike conventional SOD methods that produce a single segmentation mask
+for salient objects, this new setting recognizes the inherent complexity of
+real-world images, comprising multiple objects, and the ambiguity in defining
+salient objects due to different user intentions. To study this task, we
+present two new SOD datasets "DUTS-MM" and "DUTS-MQ", along with newly designed
+evaluation metrics. DUTS-MM builds upon the DUTS dataset but enriches the
+ground-truth mask annotations in three aspects: 1) improving the mask quality,
+especially for boundary and fine-grained structures; 2) alleviating the
+annotation inconsistency issue; and 3) providing multiple ground-truth masks
+for images with saliency ambiguity. DUTS-MQ consists of approximately 100K
+image-mask pairs with human-annotated preference scores, enabling the learning
+of real human preferences in measuring mask quality. Building upon these two
+datasets, we propose a simple yet effective pluralistic SOD baseline based on a
+Mixture-of-Experts (MOE) design. Equipped with two prediction heads, it
+simultaneously predicts multiple masks using different query prompts and
+predicts human preference scores for each mask candidate. Extensive experiments
+and analyses underscore the significance of our proposed datasets and affirm
+the effectiveness of our PSOD framework.
+
+
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (eg CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with \textbf{five} image classification datasets. The code is +available (https://github.com/batmanlab/Ladder). + +
+
+
+
+
+ + ♻ ☆ Quantifying uncertainty in lung cancer segmentation with foundation + models applied to mixed-domain datasets + + +
+ Medical image foundation models have shown the ability to segment organs and
+tumors with minimal fine-tuning. These models are typically evaluated on
+task-specific in-distribution (ID) datasets. However, reliable performance on
+ID datasets does not guarantee robust generalization on out-of-distribution
+(OOD) datasets. Importantly, once deployed for clinical use, it is impractical
+to have "ground truth" delineations to assess ongoing performance drifts,
+especially when images fall into the OOD category due to different imaging
+protocols. Hence, we introduced a comprehensive set of computationally fast
+metrics to evaluate the performance of multiple foundation models (Swin UNETR,
+SimMIM, iBOT, SMIT) trained with self-supervised learning (SSL). SSL
+pretraining was selected as this approach is applicable for large, diverse, and
+unlabeled image sets. All models were fine-tuned on identical datasets for lung
+tumor segmentation from computed tomography (CT) scans. SimMIM, iBOT, and SMIT
+used identical architecture, pretraining, and fine-tuning datasets to assess
+performance variations with the choice of pretext tasks used in SSL. Evaluation
+was performed on two public lung cancer datasets (LRAD: n = 140, 5Rater: n =
+21) with different image acquisitions and tumor stages compared to the training
+data (n = 317, a public resource with stage III-IV lung cancers) and a public
+non-cancer dataset containing volumetric CT scans of patients with pulmonary
+embolism (n = 120). All models produced similarly accurate tumor segmentation
+on the lung cancer testing datasets. SMIT produced the highest F1-score (LRAD:
+0.60, 5Rater: 0.64) and lowest entropy (LRAD: 0.06, 5Rater: 0.12), indicating a
+higher tumor detection rate and more confident segmentations. On the OOD
+dataset, SMIT misdetected the fewest tumors, indicated by a median volume
+occupancy of 5.67 cc compared to 9.97 cc for the second-best method, SimMIM.
+
+
+
+
+
+
+ + ♻ ☆ Multi-task Learning Approach for Intracranial Hemorrhage Prognosis MICCAI 2024 + + +
+ Prognosis after intracranial hemorrhage (ICH) is influenced by a complex +interplay between imaging and tabular data. Rapid and reliable prognosis are +crucial for effective patient stratification and informed treatment +decision-making. In this study, we aim to enhance image-based prognosis by +learning a robust feature representation shared between prognosis and the +clinical and demographic variables most highly correlated with it. Our approach +mimics clinical decision-making by reinforcing the model to learn valuable +prognostic data embedded in the image. We propose a 3D multi-task image model +to predict prognosis, Glasgow Coma Scale and age, improving accuracy and +interpretability. Our method outperforms current state-of-the-art baseline +image models, and demonstrates superior performance in ICH prognosis compared +to four board-certified neuroradiologists using only CT scans as input. We +further validate our model with interpretability saliency maps. Code is +available at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git. + +
+
+ comment: 16 pages. Accepted at Machine Learning in Medical Imaging Workshop @ + MICCAI 2024 (MLMI2024). This is the submitted manuscript with added link to + github repo, funding acknowledgements and authors' names and affiliations. No + further post submission improvements or corrections were integrated. Final + version not published yet +
+
+
+
+
+ + ♻ ☆ SDE-based Multiplicative Noise Removal + + +
+ Multiplicative noise, also known as speckle or pepper noise, commonly affects +images produced by synthetic aperture radar (SAR), lasers, or optical lenses. +Unlike additive noise, which typically arises from thermal processes or +external factors, multiplicative noise is inherent to the system, originating +from the fluctuation in diffuse reflections. These fluctuations result in +multiple copies of the same signal with varying magnitudes being combined. +Consequently, despeckling, or removing multiplicative noise, necessitates +different techniques compared to those used for additive noise removal. + In this paper, we propose a novel approach using Stochastic Differential +Equations based diffusion models to address multiplicative noise. We +demonstrate that multiplicative noise can be effectively modeled as a Geometric +Brownian Motion process in the logarithmic domain. Utilizing the Fokker-Planck +equation, we derive the corresponding reverse process for image denoising. To +validate our method, we conduct extensive experiments on two different +datasets, comparing our approach to both classical signal processing techniques +and contemporary CNN-based noise removal models. Our results indicate that the +proposed method significantly outperforms existing methods on perception-based +metrics such as FID and LPIPS, while maintaining competitive performance on +traditional metrics like PSNR and SSIM. + +
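+
+ The log-domain view that underlies the Geometric Brownian Motion modelling
+ can be checked in a few lines of NumPy: multiplying a clean image by speckle
+ becomes an additive perturbation after taking logarithms. The
+ gamma-distributed speckle below is only a toy stand-in for real SAR noise,
+ not the paper's noise model.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+clean = rng.uniform(0.2, 1.0, size=(64, 64))                 # toy clean image
+speckle = rng.gamma(shape=4.0, scale=1.0 / 4.0, size=clean.shape)
+noisy = clean * speckle                                      # multiplicative corruption
+
+# In the log domain the corruption is additive, which is what allows a
+# Brownian-motion-style diffusion process to model it.
+residual = np.log(noisy) - np.log(clean)                     # equals log(speckle)
+print(residual.mean(), residual.std())
+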
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Learning Local Pattern Modularization for Point Cloud Reconstruction + from Unseen Classes ECCV 2024 + + +
+ It is challenging to reconstruct 3D point clouds in unseen classes from +single 2D images. Instead of object-centered coordinate system, current methods +generalized global priors learned in seen classes to reconstruct 3D shapes from +unseen classes in viewer-centered coordinate system. However, the +reconstruction accuracy and interpretability are still eager to get improved. +To resolve this issue, we introduce to learn local pattern modularization for +reconstructing 3D shapes in unseen classes, which achieves both good +generalization ability and high reconstruction accuracy. Our insight is to +learn a local prior which is class-agnostic and easy to generalize in +object-centered coordinate system. Specifically, the local prior is learned via +a process of learning and customizing local pattern modularization in seen +classes. During this process, we first learn a set of patterns in local +regions, which is the basis in the object-centered coordinate system to +represent an arbitrary region on shapes across different classes. Then, we +modularize each region on an initially reconstructed shape using the learned +local patterns. Based on that, we customize the local pattern modularization +using the input image by refining the reconstruction with more details. Our +method enables to reconstruct high fidelity point clouds from unseen classes in +object-centered coordinate system without requiring a large number of patterns +or any additional information, such as segmentation supervision or camera +poses. Our experimental results under widely used benchmarks show that our +method achieves the state-of-the-art reconstruction accuracy for shapes from +unseen classes. The code is available at https://github.com/chenchao15/Unseen. + +
+
+ comment: 14pages, 11figures, accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ CONDA: Condensed Deep Association Learning for Co-Salient Object + Detection + + +
+ Inter-image association modeling is crucial for co-salient object detection. +Despite satisfactory performance, previous methods still have limitations on +sufficient inter-image association modeling. Because most of them focus on +image feature optimization under the guidance of heuristically calculated raw +inter-image associations. They directly rely on raw associations which are not +reliable in complex scenarios, and their image feature optimization approach is +not explicit for inter-image association modeling. To alleviate these +limitations, this paper proposes a deep association learning strategy that +deploys deep networks on raw associations to explicitly transform them into +deep association features. Specifically, we first create hyperassociations to +collect dense pixel-pair-wise raw associations and then deploys deep +aggregation networks on them. We design a progressive association generation +module for this purpose with additional enhancement of the hyperassociation +calculation. More importantly, we propose a correspondence-induced association +condensation module that introduces a pretext task, i.e. semantic +correspondence estimation, to condense the hyperassociations for computational +burden reduction and noise elimination. We also design an object-aware cycle +consistency loss for high-quality correspondence estimations. Experimental +results in three benchmark datasets demonstrate the remarkable effectiveness of +our proposed method with various training settings. + +
+
+ comment: There is an error. In Sec 4.1, the number of images in some dataset + is incorrect and needs to be revised +
+
+
+
+
+ + ♻ ☆ Open Gaze: Open Source eye tracker for smartphone devices using Deep + Learning + + +
+ Eye tracking has been a pivotal tool in diverse fields such as vision research, language analysis, and usability assessment. The majority of prior investigations, however, have concentrated on expansive desktop displays employing specialized, costly eye tracking hardware that lacks scalability. Remarkably little insight exists into ocular movement patterns on smartphones, despite their widespread adoption and significant usage. In this manuscript, we present an open-source implementation of a smartphone-based gaze tracker that emulates the methodology proposed by a Google paper (whose source code remains proprietary). Our focus is on attaining accuracy comparable to that attained through the Google paper's methodology, without the necessity for supplementary hardware. Through the integration of machine learning techniques, we unveil an accurate eye tracking solution that is native to smartphones. Our approach demonstrates precision akin to that of state-of-the-art mobile eye trackers, which are characterized by a cost that is two orders of magnitude higher. Leveraging the vast MIT GazeCapture dataset, which is available through registration on the dataset's website, we successfully replicate crucial findings from previous studies concerning ocular motion behavior in oculomotor tasks and saliency analyses during natural image observation. Furthermore, we emphasize the applicability of smartphone-based gaze tracking in discerning reading comprehension challenges. Our findings exhibit the inherent potential to scale up eye movement research substantially, accommodating participation from thousands of subjects with explicit consent. This scalability not only fosters advancements in vision research, but also extends its benefits to domains such as accessibility enhancement and healthcare applications.
+
+ comment: This paper results are incorrectly reported. The paper is not + authentic and conclusions are not correct +
+
+
+
+
+ + ♻ ☆ Q-Seg: Quantum Annealing-Based Unsupervised Image Segmentation + + +
+ We present Q-Seg, a novel unsupervised image segmentation method based on quantum annealing, tailored for existing quantum hardware. We formulate the pixel-wise segmentation problem, which assimilates spectral and spatial information of the image, as a graph-cut optimization task. Our method efficiently leverages the interconnected qubit topology of the D-Wave Advantage device, offering superior scalability over existing quantum approaches and outperforming several tested state-of-the-art classical methods. Empirical evaluations on synthetic datasets show that Q-Seg has better runtime performance than the state-of-the-art classical optimizer Gurobi. The method has also been tested on earth observation image segmentation, a critical area with noisy and unreliable annotations. In the era of noisy intermediate-scale quantum computing, Q-Seg emerges as a reliable contender for real-world applications in comparison to advanced techniques like Segment Anything. Consequently, Q-Seg offers a promising solution using available quantum hardware, especially in situations constrained by limited labeled data and the need for efficient computational runtime.
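As a rough illustration of how an unsupervised graph-cut segmentation can be posed as a QUBO (the problem form solvable by annealers such as the D-Wave Advantage), the sketch below builds a QUBO for a tiny grayscale image over a 4-neighbour grid graph and solves it by brute force. The unary/pairwise weighting scheme and all names are illustrative assumptions, not the exact formulation from the paper.

```python
import itertools
import numpy as np

def graph_cut_qubo(img, beta=2.0):
    """Build a QUBO whose binary variables label each pixel foreground/background.

    Pairwise terms penalize giving similar neighbouring pixels different labels;
    a unary term pulls bright pixels towards label 1.
    """
    h, w = img.shape
    n = h * w
    Q = np.zeros((n, n))
    idx = lambda r, c: r * w + c
    for r in range(h):
        for c in range(w):
            i = idx(r, c)
            # Unary term: reward label 1 for bright pixels, label 0 for dark ones.
            Q[i, i] += 0.5 - img[r, c]
            # Pairwise terms over the 4-neighbour grid (right and down only).
            for dr, dc in ((0, 1), (1, 0)):
                rr, cc = r + dr, c + dc
                if rr < h and cc < w:
                    j = idx(rr, cc)
                    w_ij = beta * np.exp(-abs(img[r, c] - img[rr, cc]))
                    # Encodes the cut cost w_ij * (x_i + x_j - 2 x_i x_j).
                    Q[i, i] += w_ij
                    Q[j, j] += w_ij
                    Q[i, j] += -2.0 * w_ij
    return Q

def brute_force_qubo(Q):
    """Exhaustive minimiser, feasible only for toy problems (here 9 variables)."""
    n = Q.shape[0]
    best_x, best_e = None, np.inf
    for bits in itertools.product((0, 1), repeat=n):
        x = np.array(bits)
        e = x @ Q @ x
        if e < best_e:
            best_x, best_e = x, e
    return best_x

img = np.array([[0.1, 0.2, 0.9],
                [0.1, 0.8, 0.9],
                [0.2, 0.9, 1.0]])
labels = brute_force_qubo(graph_cut_qubo(img)).reshape(img.shape)
print(labels)
```

On real hardware the same matrix Q would be handed to a quantum annealer instead of the brute-force loop; the toy solver is only there to keep the sketch self-contained.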
+
+ comment: 12 pages, 9 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Enhancing the vision-language foundation model with key semantic + knowledge-emphasized report refinement + + +
+ Recently, vision-language representation learning has made remarkable advancements in building up medical foundation models, holding immense potential for transforming the landscape of clinical research and medical care. The underlying hypothesis is that the rich knowledge embedded in radiology reports can effectively assist and guide the learning process, reducing the need for additional labels. However, these reports tend to be complex and sometimes even contain redundant descriptions that make it too challenging for representation learning to capture the key semantic information. This paper develops a novel iterative vision-language representation learning framework by proposing a key semantic knowledge-emphasized report refinement method. Particularly, raw radiology reports are refined to highlight the key information according to a constructed clinical dictionary and two model-optimized knowledge-enhancement metrics. The iterative framework is designed to learn progressively, starting from a general understanding of the patient's condition based on raw reports and gradually refining and extracting the critical information essential to fine-grained analysis tasks. The effectiveness of the proposed framework is validated on various downstream medical image analysis tasks, including disease classification, region-of-interest segmentation, and phrase grounding. Our framework surpasses seven state-of-the-art methods in both fine-tuning and zero-shot settings, demonstrating its encouraging potential for different clinical applications.
+
+
+
+
+ + ♻ ☆ Pre-processing and Compression: Understanding Hidden Representation + Refinement Across Imaging Domains via Intrinsic Dimension + + +
+ In recent years, there has been interest in how geometric properties such as +intrinsic dimension (ID) of a neural network's hidden representations change +through its layers, and how such properties are predictive of important model +behavior such as generalization ability. However, evidence has begun to emerge +that such behavior can change significantly depending on the domain of the +network's training data, such as natural versus medical images. Here, we +further this inquiry by exploring how the ID of a network's learned +representations changes through its layers, in essence, characterizing how the +network successively refines the information content of input data to be used +for predictions. Analyzing eleven natural and medical image datasets across six +network architectures, we find that how ID changes through the network differs +noticeably between natural and medical image models. Specifically, medical +image models peak in representation ID earlier in the network, implying a +difference in the image features and their abstractness that are typically used +for downstream tasks in these domains. Additionally, we discover a strong +correlation of this peak representation ID with the ID of the data in its input +space, implying that the intrinsic information content of a model's learned +representations is guided by that of the data it was trained on. Overall, our +findings emphasize notable discrepancies in network behavior between natural +and non-natural imaging domains regarding hidden representation information +content, and provide further insights into how a network's learned features are +shaped by its training data. + +
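The abstract does not state which intrinsic-dimension estimator is used; as a hedged illustration, the snippet below computes the widely used TwoNN estimate (based on the ratio of second to first nearest-neighbour distances) for a matrix of hidden activations, which one could apply layer by layer to trace the kind of ID profile discussed above.

```python
import numpy as np
from scipy.spatial.distance import cdist

def two_nn_intrinsic_dimension(x):
    """TwoNN ID estimate (Facco et al., 2017) from an (n_samples, n_features) array."""
    d = cdist(x, x)                      # pairwise Euclidean distances
    d.sort(axis=1)                       # column 0 is the zero self-distance
    r1, r2 = d[:, 1], d[:, 2]            # first and second nearest neighbours
    mu = r2 / np.maximum(r1, 1e-12)      # guard against duplicate points
    # Maximum-likelihood form of the TwoNN estimator.
    return len(mu) / np.sum(np.log(mu))

rng = np.random.default_rng(0)
# A 3-D manifold linearly embedded in 64 dimensions: the estimate should be near 3.
latent = rng.normal(size=(2000, 3))
activations = latent @ rng.normal(size=(3, 64))
print(two_nn_intrinsic_dimension(activations))
```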
+
+
+
+
+ + ♻ ☆ CHOTA: A Higher Order Accuracy Metric for Cell Tracking + + +
+ The evaluation of cell tracking results steers the development of tracking +methods, significantly impacting biomedical research. This is quantitatively +achieved by means of evaluation metrics. Unfortunately, current metrics favor +local correctness and weakly reward global coherence, impeding high-level +biological analysis. To also foster global coherence, we propose the CHOTA +metric (Cell-specific Higher Order Tracking Accuracy) which unifies the +evaluation of all relevant aspects of cell tracking: cell detections and local +associations, global coherence, and lineage tracking. We achieve this by +introducing a new definition of the term 'trajectory' that includes the entire +cell lineage and by including this into the well-established HOTA metric from +general multiple object tracking. Furthermore, we provide a detailed survey of +contemporary cell tracking metrics to compare our novel CHOTA metric and to +show its advantages. All metrics are extensively evaluated on state-of-the-art +real-data cell tracking results and synthetic results that simulate specific +tracking errors. We show that CHOTA is sensitive to all tracking errors and +gives a good indication of the biologically relevant capability of a method to +reconstruct the full lineage of cells. It introduces a robust and comprehensive +alternative to the currently used metrics in cell tracking. Python code is +available at https://github.com/CellTrackingChallenge/py-ctcmetrics . + +
+
+ comment: Accepted at BIC Workshop at European Conference on Computer Vision + 2024, 14 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Large Scale Unsupervised Brain MRI Image Registration Solution for + Learn2Reg 2024 MICCAI + + +
+ In this paper, we summarize the methods and experimental results we proposed for Task 2 of the Learn2Reg 2024 Challenge. This task focuses on unsupervised registration of anatomical structures in brain MRI images between different patients. Its difficulty is twofold: (1) no segmentation labels are available, and (2) the amount of data is large. To address these challenges, we built an efficient backbone network and explored several schemes to further enhance registration accuracy. Under the guidance of an NCC loss function and a smoothness regularization loss function, we obtained a smooth and reasonable deformation field. According to the leaderboard, our method achieved a Dice coefficient of 77.34%, which is 1.4% higher than TransMorph. Overall, we won second place on the Task 2 leaderboard.
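As a hedged sketch of the loss described above (an NCC similarity term plus a smoothness penalty on the deformation field), the snippet below shows a global normalized cross-correlation loss and a diffusion-style gradient regularizer in PyTorch. Registration methods typically use a local/windowed NCC, so treat this as a simplified stand-in rather than the challenge entry's exact objective.

```python
import torch

def ncc_loss(warped, fixed, eps=1e-5):
    """Negative global normalized cross-correlation between two volumes."""
    w = warped - warped.mean()
    f = fixed - fixed.mean()
    ncc = (w * f).sum() / (w.norm() * f.norm() + eps)
    return -ncc

def smoothness_loss(flow):
    """L2 penalty on spatial gradients of a displacement field (B, 3, D, H, W)."""
    dz = flow[:, :, 1:, :, :] - flow[:, :, :-1, :, :]
    dy = flow[:, :, :, 1:, :] - flow[:, :, :, :-1, :]
    dx = flow[:, :, :, :, 1:] - flow[:, :, :, :, :-1]
    return dz.pow(2).mean() + dy.pow(2).mean() + dx.pow(2).mean()

warped = torch.rand(1, 1, 32, 32, 32)
fixed = torch.rand(1, 1, 32, 32, 32)
flow = torch.rand(1, 3, 32, 32, 32, requires_grad=True)
loss = ncc_loss(warped, fixed) + 0.1 * smoothness_loss(flow)
print(loss.item())
```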
+
+ comment: MICCAI Learn2Reg 2024 Challenge & WBIR 2024 Workshop on Biomedical + Imaging Registration +
+
+
+
+
+ + ♻ ☆ Nickel and Diming Your GAN: A Dual-Method Approach to Enhancing GAN + Efficiency via Knowledge Distillation + + +
+ In this paper, we address the challenge of compressing generative adversarial +networks (GANs) for deployment in resource-constrained environments by +proposing two novel methodologies: Distribution Matching for Efficient +compression (DiME) and Network Interactive Compression via Knowledge Exchange +and Learning (NICKEL). DiME employs foundation models as embedding kernels for +efficient distribution matching, leveraging maximum mean discrepancy to +facilitate effective knowledge distillation. Simultaneously, NICKEL employs an +interactive compression method that enhances the communication between the +student generator and discriminator, achieving a balanced and stable +compression process. Our comprehensive evaluation on the StyleGAN2 architecture +with the FFHQ dataset shows the effectiveness of our approach, with NICKEL & +DiME achieving FID scores of 10.45 and 15.93 at compression rates of 95.73% and +98.92%, respectively. Remarkably, our methods sustain generative quality even +at an extreme compression rate of 99.69%, surpassing the previous +state-of-the-art performance by a large margin. These findings not only +demonstrate our methodologies' capacity to significantly lower GANs' +computational demands but also pave the way for deploying high-quality GAN +models in settings with limited resources. Our code will be released soon. + +
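DiME is described as matching distributions via maximum mean discrepancy on foundation-model embeddings. The snippet below is a minimal RBF-kernel MMD^2 estimator in PyTorch that could serve as such a distillation loss between teacher-generated and student-generated image embeddings; the kernel choice, bandwidth, and variable names are assumptions, not the paper's exact setup.

```python
import torch

def mmd2_rbf(x, y, sigma=1.0):
    """Biased estimate of squared MMD between samples x and y of shape (n, d), RBF kernel."""
    def kernel(a, b):
        d2 = torch.cdist(a, b).pow(2)
        return torch.exp(-d2 / (2 * sigma ** 2))
    return kernel(x, x).mean() + kernel(y, y).mean() - 2 * kernel(x, y).mean()

# Embeddings of teacher- and student-generated images from a frozen encoder.
teacher_emb = torch.randn(128, 512)
student_emb = torch.randn(128, 512, requires_grad=True)
loss = mmd2_rbf(teacher_emb, student_emb)
loss.backward()
print(loss.item())
```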
+
+
+
+
+ + ♻ ☆ When Does Visual Prompting Outperform Linear Probing for Vision-Language + Models? A Likelihood Perspective + + +
+ Adapting pre-trained models to new tasks can exhibit varying effectiveness +across datasets. Visual prompting, a state-of-the-art parameter-efficient +transfer learning method, can significantly improve the performance of +out-of-distribution tasks. On the other hand, linear probing, a standard +transfer learning method, can sometimes become the best approach. We propose a +log-likelihood ratio (LLR) approach to analyze the comparative benefits of +visual prompting and linear probing. By employing the LLR score alongside +resource-efficient visual prompts approximations, our cost-effective measure +attains up to a 100-fold reduction in run time compared to full training, while +achieving prediction accuracies up to 91%. The source code is available at +https://github.com/IBM/VP-LLR. + +
+
+
+
+
+ + ♻ ☆ MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN + for Precise Facial Expression Intensity Estimation + + +
+ This paper presents MMA-MRNNet, a novel deep learning architecture for +dynamic multi-output Facial Expression Intensity Estimation (FEIE) from video +data. Traditional approaches to this task often rely on complex 3-D CNNs, which +require extensive pre-training and assume that facial expressions are uniformly +distributed across all frames of a video. These methods struggle to handle +videos of varying lengths, often resorting to ad-hoc strategies that either +discard valuable information or introduce bias. MMA-MRNNet addresses these +challenges through a two-stage process. First, the Multiple Models of Affect +(MMA) extractor component is a Multi-Task Learning CNN that concurrently +estimates valence-arousal, recognizes basic facial expressions, and detects +action units in each frame. These representations are then processed by a +Masked RNN component, which captures temporal dependencies and dynamically +updates weights according to the true length of the input video, ensuring that +only the most relevant features are used for the final prediction. The proposed +unimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction +dataset and demonstrated significantly superior performance, surpassing +state-of-the-art methods by a wide margin, regardless of whether they were +unimodal, multimodal, or ensemble approaches. Finally, we demonstrated the +effectiveness of the MMA component of our proposed method across multiple +in-the-wild datasets, where it consistently outperformed all state-of-the-art +methods across various metrics. + +
+
+
+
+
+ + ♻ ☆ In the Search for Optimal Multi-view Learning Models for Crop + Classification with Global Remote Sensing Data + + +
+ Studying and analyzing cropland is a difficult task due to its dynamic and heterogeneous growth behavior. Usually, diverse data sources can be collected for its estimation. Although deep learning models have proven to excel in the crop classification task, they face substantial challenges when dealing with multiple inputs, a setting known as Multi-View Learning (MVL). The methods used in the MVL scenario can be structured based on the encoder architecture, the fusion strategy, and the optimization technique. The literature has primarily focused on using specific encoder architectures for local regions, lacking a deeper exploration of other components in the MVL methodology. In contrast, we investigate the simultaneous selection of the fusion strategy and encoder architecture, assessing global-scale cropland and crop-type classifications. We use five fusion strategies (Input, Feature, Decision, Ensemble, Hybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible configurations in the MVL method. We use the CropHarvest dataset for validation, which provides optical, radar, and weather time series, as well as topographic information, as input data. We found that in scenarios with a limited number of labeled samples, no single configuration is sufficient for all cases. Instead, a specialized combination of encoder and fusion strategy should be carefully sought. To streamline this search process, we suggest identifying the optimal encoder architecture tailored to a particular fusion strategy, and then determining the most suitable fusion strategy for the classification task. We provide a methodological framework for researchers exploring crop classification through an MVL methodology.
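To make the fusion terminology concrete, the sketch below contrasts three of the five strategies mentioned (input-, feature-, and decision-level fusion) for two views feeding a shared classifier; encoder choice, hidden sizes, and names are illustrative, not the paper's configuration.

```python
import torch
import torch.nn as nn

class TwoViewClassifier(nn.Module):
    """Toy two-view crop classifier supporting input-, feature-, and decision-level fusion."""

    def __init__(self, dim_optical=12, dim_radar=4, hidden=32, n_classes=10, fusion="feature"):
        super().__init__()
        self.fusion = fusion
        if fusion == "input":
            self.encoder = nn.GRU(dim_optical + dim_radar, hidden, batch_first=True)
            self.head = nn.Linear(hidden, n_classes)
        else:
            self.enc_opt = nn.GRU(dim_optical, hidden, batch_first=True)
            self.enc_rad = nn.GRU(dim_radar, hidden, batch_first=True)
            in_dim = 2 * hidden if fusion == "feature" else hidden
            self.head = nn.Linear(in_dim, n_classes)

    def forward(self, optical, radar):            # each view is a (B, T, C) time series
        if self.fusion == "input":
            _, h = self.encoder(torch.cat([optical, radar], dim=-1))
            return self.head(h[-1])
        _, h_o = self.enc_opt(optical)
        _, h_r = self.enc_rad(radar)
        if self.fusion == "feature":
            return self.head(torch.cat([h_o[-1], h_r[-1]], dim=-1))
        # Decision-level fusion: average per-view logits.
        return (self.head(h_o[-1]) + self.head(h_r[-1])) / 2

model = TwoViewClassifier(fusion="decision")
logits = model(torch.randn(8, 24, 12), torch.randn(8, 24, 4))
print(logits.shape)
```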
+
+ comment: submitted to journal +
+
+
+
+
+ + ♻ ☆ Increasing the Robustness of Model Predictions to Missing Sensors in + Earth Observation ACL + + +
+ Multi-sensor ML models for EO aim to enhance prediction accuracy by +integrating data from various sources. However, the presence of missing data +poses a significant challenge, particularly in non-persistent sensors that can +be affected by external factors. Existing literature has explored strategies +like temporal dropout and sensor-invariant models to address the generalization +to missing data issues. Inspired by these works, we study two novel methods +tailored for multi-sensor scenarios, namely Input Sensor Dropout (ISensD) and +Ensemble Sensor Invariant (ESensI). Through experimentation on three +multi-sensor temporal EO datasets, we demonstrate that these methods +effectively increase the robustness of model predictions to missing sensors. +Particularly, we focus on how the predictive performance of models drops when +sensors are missing at different levels. We observe that ensemble multi-sensor +models are the most robust to the lack of sensors. In addition, the sensor +dropout component in ISensD shows promising robustness results. + +
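The abstract gives no implementation details, but Input Sensor Dropout can plausibly be read as randomly zeroing out an entire sensor during training so the model learns to cope with missing sources at test time. A minimal sketch under that assumption:

```python
import torch

def input_sensor_dropout(views, p_drop=0.3, training=True):
    """Randomly drop entire sensors (each a (B, T, C) tensor) with probability p_drop.

    `views` maps sensor name -> tensor; dropped sensors are zeroed per sample,
    mimicking how a missing sensor would be presented to the model at test time.
    """
    if not training:
        return views
    out = {}
    for name, x in views.items():
        keep = (torch.rand(x.shape[0], 1, 1) > p_drop).to(x.dtype)  # per-sample mask
        out[name] = x * keep
    return out

batch = {"optical": torch.randn(8, 24, 12),
         "radar": torch.randn(8, 24, 2),
         "weather": torch.randn(8, 24, 5)}
augmented = input_sensor_dropout(batch)
# Fraction of samples whose sensor was zeroed, per sensor.
print({k: float(v.abs().sum(dim=(1, 2)).eq(0).float().mean()) for k, v in augmented.items()})
```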
+
+ comment: Accepted at the MACLEAN workshop in the ECML/PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Scalable Glacier Mapping using Deep Learning and Open Earth Observation + Data Matches the Accuracy of Manual Delineation + + +
+ Accurate global glacier mapping is critical for understanding climate change +impacts. Despite its importance, automated glacier mapping at a global scale +remains largely unexplored. Here we address this gap and propose +Glacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep +learning model, and five strategies for multitemporal global-scale glacier +mapping using open satellite imagery. Assessing the spatial, temporal and +cross-sensor generalisation shows that our best strategy achieves intersection +over union >0.85 on previously unobserved images in most cases, which drops to +>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90 +for regions dominated by clean ice. A comparative validation against human +expert uncertainties in terms of area and distance deviations underscores +GlaViTU performance, approaching or matching expert-level delineation. Adding +synthetic aperture radar data, namely, backscatter and interferometric +coherence, increases the accuracy in all regions where available. The +calibrated confidence for glacier extents is reported making the predictions +more reliable and interpretable. We also release a benchmark dataset that +covers 9% of glaciers worldwide. Our results support efforts towards automated +multitemporal and global glacier mapping. + +
+
+ comment: after major revision, expanded validation +
+
+
+
+
+ + ♻ ☆ CSGO: Content-Style Composition in Text-to-Image Generation + + +
+ The diffusion model has shown exceptional capabilities in controlled image generation, which has further fueled interest in image style transfer. Existing works mainly focus on training-free methods (e.g., image inversion) due to the scarcity of specific data. In this study, we present a data construction pipeline for content-style-stylized image triplets that generates and automatically cleanses stylized data triplets. Based on this pipeline, we construct the dataset IMAGStyle, the first large-scale style transfer dataset, containing 210k image triplets, available for the community to explore and research. Equipped with IMAGStyle, we propose CSGO, a style transfer model based on end-to-end training, which explicitly decouples content and style features by employing independent feature injection. The unified CSGO implements image-driven style transfer, text-driven stylized synthesis, and text editing-driven stylized synthesis. Extensive experiments demonstrate the effectiveness of our approach in enhancing style control capabilities in image generation. Additional visualization and access to the source code can be located on the project page: \url{https://csgo-gen.github.io/}.
+
+
+
+
+ + ♻ ☆ Object-Size-Driven Design of Convolutional Neural Networks: Virtual Axle + Detection based on Raw Data + + +
+ As infrastructure ages, the need for efficient monitoring methods becomes increasingly critical. Bridge Weigh-In-Motion (BWIM) systems are crucial for cost-efficient load and thus residual service life determination of road and railway infrastructure. However, conventional BWIM systems require additional sensors for axle detection, which have to be installed in potentially inaccessible locations or in locations that interfere with bridge operation. This study addresses this challenge by replacing dedicated axle detectors with a novel approach to real-time detection of train axles using sensors arbitrarily placed on bridges. The proposed Virtual Axle Detector with Enhanced Receptive Field (VADER) has been validated on a single-track railway bridge, demonstrating that it detects 99.9% of axles with a spatial error of 3.69 cm using only acceleration measurements. Using raw data as input outperforms the state-of-the-art spectrogram-based method in both speed and memory usage by 99%, making real-time application feasible for the first time. Additionally, we introduce the Maximum Receptive Field (MRF) rule, a novel approach to optimizing the hyperparameters of Convolutional Neural Networks (CNNs) based on the size of objects, which in this case relates to the fundamental frequency of a bridge. The MRF rule effectively narrows the hyperparameter search space, potentially replacing the need for extensive hyperparameter tuning. Since the MRF rule is theoretically applicable to all unstructured data, it could have implications for a wide range of deep learning problems, from earthquake prediction to object recognition.
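The MRF rule is only described at a high level above. The sketch below shows the standard receptive-field recursion for a stack of 1-D convolutions together with one hedged reading of the rule, namely checking that the receptive field covers at least one period of the bridge's fundamental frequency at the given sampling rate; the exact criterion and all names are assumptions for illustration.

```python
def receptive_field(layers):
    """Receptive field (in samples) of stacked 1-D convolutions.

    Each layer is (kernel_size, stride); the usual recursion is
    rf += (k - 1) * jump, followed by jump *= stride.
    """
    rf, jump = 1, 1
    for kernel_size, stride in layers:
        rf += (kernel_size - 1) * jump
        jump *= stride
    return rf

def satisfies_mrf_rule(layers, sampling_rate_hz, fundamental_freq_hz):
    """Hedged reading of the MRF rule: cover at least one fundamental period."""
    samples_per_period = sampling_rate_hz / fundamental_freq_hz
    return receptive_field(layers) >= samples_per_period

cnn = [(7, 2), (7, 2), (5, 2), (5, 2), (3, 1)]
print(receptive_field(cnn), satisfies_mrf_rule(cnn, sampling_rate_hz=600, fundamental_freq_hz=4.0))
```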
+
+
+
+
+ + ♻ ☆ Filter & Align: Leveraging Human Knowledge to Curate Image-Text Data + + +
+ The increasing availability of image-text pairs has largely fueled the rapid advancement of vision-language foundation models. However, the vast scale of these datasets inevitably introduces significant variability in data quality, which can adversely affect model performance. This highlights the critical role of data filtering, not only to enhance training efficiency but also to improve overall data quality. Existing methods typically rely on metrics such as CLIP Score and BLIP Score, which are derived from pre-trained models. However, these models are often trained on uncurated, noisy datasets, which can perpetuate errors and misalignments in the filtered dataset. We present a novel algorithm that incorporates human knowledge of image-text alignment to guide the filtering of vast corpora of web-crawled image-text data into a compact and high-quality form. To systematically capture human preferences on image-text alignment, we collect a diverse image-text dataset where each image is associated with multiple captions from various sources, and establish a comprehensive set of both subjective and objective criteria for critically guiding the alignment assessment by labelers. Additionally, we train a reward model on these human-preference annotations to internalize the nuanced human understanding of image-text alignment. The resulting reward model can thus act as a human-like referee to filter image-text pairs. Extensive experiments demonstrate that we can maintain, and sometimes even improve, model performance while compressing the image-text datasets by up to ~90%. An impressive example is that, by aggressively reducing the total number of training samples from 130M to only 15.5M, our BLIP-B/16 models consistently show an average improvement of 2.9% on retrieval tasks and 11.5% on captioning tasks compared to full-size-dataset counterparts.
+
+
+
+
+ + ♻ ☆ Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression + Recognition CVPR 2024 + + +
+ Human communication is multi-modal; e.g., face-to-face interaction involves +auditory signals (speech) and visual signals (face movements and hand +gestures). Hence, it is essential to exploit multiple modalities when designing +machine learning-based facial expression recognition systems. In addition, +given the ever-growing quantities of video data that capture human facial +expressions, such systems should utilize raw unlabeled videos without requiring +expensive annotations. Therefore, in this work, we employ a multitask +multi-modal self-supervised learning method for facial expression recognition +from in-the-wild video data. Our model combines three self-supervised objective +functions: First, a multi-modal contrastive loss, that pulls diverse data +modalities of the same video together in the representation space. Second, a +multi-modal clustering loss that preserves the semantic structure of input data +in the representation space. Finally, a multi-modal data reconstruction loss. +We conduct a comprehensive study on this multimodal multi-task self-supervised +learning method on three facial expression recognition benchmarks. To that end, +we examine the performance of learning through different combinations of +self-supervised tasks on the facial expression recognition downstream task. Our +model ConCluGen outperforms several multi-modal self-supervised and fully +supervised baselines on the CMU-MOSEI dataset. Our results generally show that +multi-modal self-supervision tasks offer large performance gains for +challenging tasks such as facial expression recognition, while also reducing +the amount of manual annotations required. We release our pre-trained models as +well as source code publicly + +
+
+ comment: The paper will appear in the CVPR 2024 workshops proceedings +
+
+
+
+
+ + ♻ ☆ UHD-IQA Benchmark Database: Pushing the Boundaries of Blind Photo + Quality Assessment + + +
+ We introduce a novel Image Quality Assessment (IQA) dataset comprising 6073 +UHD-1 (4K) images, annotated at a fixed width of 3840 pixels. Contrary to +existing No-Reference (NR) IQA datasets, ours focuses on highly aesthetic +photos of high technical quality, filling a gap in the literature. The images, +carefully curated to exclude synthetic content, are sufficiently diverse to +train general NR-IQA models. Importantly, the dataset is annotated with +perceptual quality ratings obtained through a crowdsourcing study. Ten expert +raters, comprising photographers and graphics artists, assessed each image at +least twice in multiple sessions spanning several days, resulting in 20 highly +reliable ratings per image. Annotators were rigorously selected based on +several metrics, including self-consistency, to ensure their reliability. The +dataset includes rich metadata with user and machine-generated tags from over +5,000 categories and popularity indicators such as favorites, likes, downloads, +and views. With its unique characteristics, such as its focus on high-quality +images, reliable crowdsourced annotations, and high annotation resolution, our +dataset opens up new opportunities for advancing perceptual image quality +assessment research and developing practical NR-IQA models that apply to modern +photos. Our dataset is available at +https://database.mmsp-kn.de/uhd-iqa-benchmark-database.html + +
+
+
+
+
+ + ♻ ☆ Model-agnostic explainable artificial intelligence for object detection + in image data + + +
+ In recent years, deep neural networks have been widely used for building high-performance Artificial Intelligence (AI) systems for computer vision applications. Object detection is a fundamental task in computer vision, which has been greatly advanced through the development of large and intricate AI models. However, the lack of transparency is a major challenge that may hinder the widespread adoption of these models. Explainable artificial intelligence is a field of research in which methods are developed to help users understand the behavior, decision logic, and vulnerabilities of AI systems. Previously, a few explanation methods based on random masking were developed for object detection. However, random masks may raise some issues regarding the actual importance of pixels within an image. In this paper, we design and implement a black-box explanation method named Black-box Object Detection Explanation by Masking (BODEM) by adopting a hierarchical random masking approach for object detection systems. We propose a hierarchical random masking framework in which coarse-grained masks are used at lower levels to find salient regions within an image, and fine-grained masks are used to refine the salient regions at higher levels. Experiments on various object detection datasets and models show that BODEM can effectively explain the behavior of object detectors. Moreover, our method outperforms Detector Randomized Input Sampling for Explanation (D-RISE) and Local Interpretable Model-agnostic Explanations (LIME) with respect to different quantitative measures of explanation effectiveness. The experimental results demonstrate that BODEM can be an effective method for explaining and validating object detection systems in black-box testing scenarios.
+
+
+
+
+ + ♻ ☆ Map-Free Visual Relocalization Enhanced by Instance Knowledge and Depth + Knowledge + + +
+ Map-free relocalization is crucial for autonomous navigation and augmented reality applications, where relying on pre-built maps is often impractical. It faces significant challenges due to limitations in matching methods and the inherent lack of scale in monocular images. These issues lead to substantial rotational and metric errors and even localization failures in real-world scenarios. Large matching errors significantly impact the overall relocalization process, affecting both rotational and translational accuracy. Because of the inherent limitations of the camera itself, recovering the metric scale from a single image is crucial, as it strongly affects the translation error. To address these challenges, we propose a map-free relocalization method enhanced by instance knowledge and depth knowledge. By leveraging instance-based matching information to improve global matching results, our method significantly reduces the possibility of mismatching across different objects. The robustness of instance knowledge across the scene helps the feature point matching model focus on relevant regions and enhances matching accuracy. Additionally, we use metric depth estimated from a single image to reduce metric errors and improve scale recovery accuracy. By integrating methods dedicated to mitigating large translational and rotational errors, our approach demonstrates superior performance in map-free relocalization.
+
+ comment: 17 pages,6 figures +
+
+
+
+
+ + ♻ ☆ CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT + Volumes + + +
+ The rapid increase of computed tomography (CT) scans and their time-consuming +manual analysis have created an urgent need for robust automated analysis +techniques in clinical settings. These aim to assist radiologists and help them +managing their growing workload. Existing methods typically generate entire +reports directly from 3D CT images, without explicitly focusing on observed +abnormalities. This unguided approach often results in repetitive content or +incomplete reports, failing to prioritize anomaly-specific descriptions. We +propose a new anomaly-guided report generation model, which first predicts +abnormalities and then generates targeted descriptions for each. Evaluation on +a public dataset demonstrates significant improvements in report quality and +clinical relevance. We extend our work by conducting an ablation study to +demonstrate its effectiveness. + +
+
+ comment: 15 pages, 9 figures, submitted to ISBI 2025 +
+
+
+
+
+ + ♻ ☆ Path-SAM2: Transfer SAM2 for digital pathology semantic segmentation + + +
+ The semantic segmentation task in pathology plays an indispensable role in assisting physicians in determining the condition of tissue lesions. With the proposal of the Segment Anything Model (SAM), more and more foundation models have seen rapid development in the field of image segmentation. Recently, SAM2 has garnered widespread attention in both natural image and medical image segmentation. Compared to SAM, it has significantly improved segmentation accuracy and generalization performance. We compared foundation models based on SAM and found that their performance in semantic segmentation of pathological images was hardly satisfactory. In this paper, we propose Path-SAM2, which for the first time adapts the SAM2 model to the task of pathological semantic segmentation. We integrate the largest pretrained vision encoder for histopathology (UNI) with the original SAM2 encoder, adding more pathology-based prior knowledge. Additionally, we introduce a learnable Kolmogorov-Arnold Networks (KAN) classification module to replace the manual prompt process. On three adenoma pathology datasets, Path-SAM2 has achieved state-of-the-art performance. This study demonstrates the great potential of adapting SAM2 to pathology image segmentation tasks. We plan to release the code and model weights for this paper at: https://github.com/simzhangbest/SAM2PATH
+
+ comment: 5 pages , 5 figures +
+
+
+
+
+ + ♻ ☆ Bayesian Evidential Learning for Few-Shot Classification + + +
+ Few-Shot Classification (FSC) aims to generalize from base classes to novel classes given very limited labeled samples, which is an important step on the path toward human-like machine learning. State-of-the-art solutions involve learning to find a good metric and representation space in which to compute the distance between samples. Despite the promising accuracy performance, how to effectively model uncertainty for metric-based FSC methods is still a challenge. To model uncertainty, we place a distribution over class probabilities based on the theory of evidence. As a result, uncertainty modeling and metric learning can be decoupled. To reduce the uncertainty of classification, we propose a Bayesian evidence fusion theorem. Given observed samples, the network learns to produce posterior distribution parameters from the prior parameters produced by the pre-trained network. Detailed gradient analysis shows that our method provides a smooth optimization target and can capture the uncertainty. The proposed method is agnostic to metric learning strategies and can be implemented as a plug-and-play module. We integrate our method into several of the newest FSC methods and demonstrate improved accuracy and uncertainty quantification on standard FSC benchmarks.
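For readers unfamiliar with evidence-based uncertainty, the snippet below shows the standard subjective-logic mapping from non-negative evidence to Dirichlet parameters, expected class probabilities, and a scalar uncertainty mass. It illustrates the kind of distribution over class probabilities that evidential methods build on, not the paper's fusion theorem itself; the use of ReLU to obtain evidence is an assumption.

```python
import torch
import torch.nn.functional as F

def dirichlet_from_logits(logits):
    """Map network outputs to Dirichlet parameters alpha = evidence + 1."""
    evidence = F.relu(logits)              # non-negative evidence per class
    alpha = evidence + 1.0
    strength = alpha.sum(dim=-1, keepdim=True)
    prob = alpha / strength                # expected class probabilities
    k = logits.shape[-1]
    uncertainty = k / strength.squeeze(-1) # uncertainty mass in (0, 1]
    return alpha, prob, uncertainty

logits = torch.tensor([[4.0, 0.1, -2.0],   # confident sample
                       [0.0, 0.0, 0.0]])   # zero evidence -> maximal uncertainty
alpha, prob, u = dirichlet_from_logits(logits)
print(prob, u)
```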
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ AI-Assisted Cervical Cancer Screening + + +
+ Visual Inspection with Acetic Acid (VIA) remains the most feasible cervical cancer screening test in resource-constrained settings of low- and middle-income countries (LMICs), where it is often performed in screening camps or primary/community health centers by nurses instead of the preferred but unavailable expert gynecologists. To address the highly subjective nature of the test, various handheld devices integrating cameras or smartphones have recently been explored to capture cervical images during VIA and aid decision-making via telemedicine or AI models. Most studies proposing AI models retrospectively use a relatively small number of already collected images from specific devices, digital cameras, or smartphones; the challenges and protocol for quality image acquisition during VIA in resource-constrained camp settings, challenges in obtaining a gold standard, data imbalance, etc., are often overlooked. We present a novel approach and describe the end-to-end design process to build a robust smartphone-based AI-assisted system that does not require buying a separate integrated device: the proposed protocol for quality image acquisition in resource-constrained settings, a dataset collected from 1,430 women during VIA performed by nurses in screening camps, the preprocessing pipeline, and the training and evaluation of a deep-learning-based classification model aimed at identifying (pre)cancerous lesions. Our work shows that readily available smartphones and a suitable protocol can capture cervix images with the details required for the VIA test; that the deep-learning-based classification model provides promising results to assist nurses in VIA screening; and that the study provides a direction for large-scale data collection and validation in resource-constrained settings.
+
+
+
+
+ + ♻ ☆ Style-NeRF2NeRF: 3D Style Transfer From Style-Aligned Multi-View Images + + +
+ We propose a simple yet effective pipeline for stylizing a 3D scene, +harnessing the power of 2D image diffusion models. Given a NeRF model +reconstructed from a set of multi-view images, we perform 3D style transfer by +refining the source NeRF model using stylized images generated by a +style-aligned image-to-image diffusion model. Given a target style prompt, we +first generate perceptually similar multi-view images by leveraging a +depth-conditioned diffusion model with an attention-sharing mechanism. Next, +based on the stylized multi-view images, we propose to guide the style transfer +process with the sliced Wasserstein loss based on the feature maps extracted +from a pre-trained CNN model. Our pipeline consists of decoupled steps, +allowing users to test various prompt ideas and preview the stylized 3D result +before proceeding to the NeRF fine-tuning stage. We demonstrate that our method +can transfer diverse artistic styles to real-world 3D scenes with competitive +quality. Result videos are also available on our project page: +https://haruolabs.github.io/style-n2n/ + +
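The style-guidance term mentioned above, a sliced Wasserstein loss between CNN feature maps, can be sketched in a few lines: project per-pixel features onto random unit directions, sort the 1-D projections, and compare. The number of projections and the way features are flattened are assumptions, not the paper's exact settings.

```python
import torch

def sliced_wasserstein_loss(feat_a, feat_b, n_proj=64):
    """Sliced Wasserstein distance between two feature maps of shape (B, C, H, W)."""
    b, c = feat_a.shape[:2]
    a = feat_a.reshape(b, c, -1)                     # treat each pixel as a C-dim sample
    bb = feat_b.reshape(b, c, -1)
    proj = torch.randn(c, n_proj, device=feat_a.device)
    proj = proj / proj.norm(dim=0, keepdim=True)     # unit-norm projection directions
    pa = torch.einsum("bcn,cp->bpn", a, proj)        # 1-D projections, shape (B, P, N)
    pb = torch.einsum("bcn,cp->bpn", bb, proj)
    # 1-D optimal transport reduces to comparing sorted projections.
    pa, _ = pa.sort(dim=-1)
    pb, _ = pb.sort(dim=-1)
    return (pa - pb).pow(2).mean()

loss = sliced_wasserstein_loss(torch.randn(2, 256, 16, 16), torch.randn(2, 256, 16, 16))
print(loss.item())
```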
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Rethinking Barely-Supervised Volumetric Medical Image Segmentation from + an Unsupervised Domain Adaptation Perspective + + +
+ This paper investigates an extremely challenging problem: barely-supervised +volumetric medical image segmentation (BSS). A BSS training dataset consists of +two parts: 1) a barely-annotated labeled set, where each labeled image contains +only a single-slice annotation, and 2) an unlabeled set comprising numerous +unlabeled volumetric images. State-of-the-art BSS methods employ a +registration-based paradigm, which uses inter-slice image registration to +propagate single-slice annotations into volumetric pseudo labels, constructing +a completely annotated labeled set, to which a semi-supervised segmentation +scheme can be applied. However, the paradigm has a critical limitation: the +pseudo-labels generated by image registration are unreliable and noisy. +Motivated by this, we propose a new perspective: instead of solving BSS within +a semi-supervised learning scheme, this work formulates BSS as an unsupervised +domain adaptation problem. To this end, we propose a novel BSS framework, +\textbf{B}arely-supervised learning \textbf{via} unsupervised domain +\textbf{A}daptation (BvA), as an alternative to the dominant registration +paradigm. Specifically, we first design a novel noise-free labeled data +construction algorithm (NFC) for slice-to-volume labeled data synthesis. Then, +we introduce a frequency and spatial Mix-Up strategy (FSX) to mitigate the +domain shifts. Extensive experiments demonstrate that our method provides a +promising alternative for BSS. Remarkably, the proposed method, trained on the +left atrial segmentation dataset with \textbf{only one} barely-labeled image, +achieves a Dice score of 81.20%, outperforming the state-of-the-art by 61.71%. +The code is available at https://github.com/Senyh/BvA. + +
+
+
+
+
+ + ♻ ☆ Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment + Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging + + +
+ Purpose: + To evaluate the zero-shot performance of Segment Anything Model 2 (SAM 2) in +3D segmentation of abdominal organs in CT scans, and to investigate the effects +of prompt settings on segmentation results. + Materials and Methods: + Using a subset of the TotalSegmentator CT dataset (n = 123) from eight +institutions, we assessed SAM 2's ability to segment eight abdominal organs. +Segmentation was initiated from three different z-coordinate levels (caudal, +mid, and cranial levels) of each organ. Performance was measured using the Dice +similarity coefficient (DSC). We also analyzed the impact of "negative +prompts," which explicitly exclude certain regions from the segmentation +process, on accuracy. Additionally, we analyzed organ volumes to contextualize +the segmentation performance. + Results: + As a zero-shot approach, larger organs with clear boundaries demonstrated +high segmentation performance, with mean(median) DSCs as follows: liver +0.821(0.898), left kidney 0.870(0.921), right kidney 0.862(0.935), and spleen +0.891(0.932). Smaller organs showed lower performance: gallbladder +0.531(0.590), pancreas 0.361(0.359), and adrenal glands, right 0.203(0.109), +left 0.308(0.231). The initial slice for segmentation and the use of negative +prompts significantly influenced the results. By removing negative prompts from +the input, the DSCs significantly decreased for six organs. Moderate positive +correlations were observed between volume sizes and DSCs. + Conclusion: + SAM 2 demonstrated promising zero-shot performance in segmenting certain +abdominal organs in CT scans, particularly larger organs with clear boundaries. +Performance was significantly influenced by input negative prompts and initial +slice selection, highlighting the importance of optimizing these factors for +effective segmentation. + +
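Since the results above are reported as Dice similarity coefficients, a small reference implementation of the metric (for binary 3-D masks) may be helpful; the toy masks are illustrative only.

```python
import numpy as np

def dice_similarity_coefficient(pred, gt, eps=1e-8):
    """DSC = 2 |P ∩ G| / (|P| + |G|) for binary masks of identical shape."""
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    intersection = np.logical_and(pred, gt).sum()
    return (2.0 * intersection + eps) / (pred.sum() + gt.sum() + eps)

pred = np.zeros((4, 64, 64), dtype=np.uint8)
gt = np.zeros_like(pred)
pred[:, 10:40, 10:40] = 1
gt[:, 15:45, 15:45] = 1
print(round(float(dice_similarity_coefficient(pred, gt)), 3))
```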
+
+ comment: 20 pages, 7 figures (including 2 supplemental figure), 4 tables +
+
+
+
+
+ + ♻ ☆ RMT-BVQA: Recurrent Memory Transformer-based Blind Video Quality + Assessment for Enhanced Video Content ECCV 2024 + + +
+ With recent advances in deep learning, numerous algorithms have been +developed to enhance video quality, reduce visual artifacts, and improve +perceptual quality. However, little research has been reported on the quality +assessment of enhanced content - the evaluation of enhancement methods is often +based on quality metrics that were designed for compression applications. In +this paper, we propose a novel blind deep video quality assessment (VQA) method +specifically for enhanced video content. It employs a new Recurrent Memory +Transformer (RMT) based network architecture to obtain video quality +representations, which is optimized through a novel content-quality-aware +contrastive learning strategy based on a new database containing 13K training +patches with enhanced content. The extracted quality representations are then +combined through linear regression to generate video-level quality indices. The +proposed method, RMT-BVQA, has been evaluated on the VDPVE (VQA Dataset for +Perceptual Video Enhancement) database through a five-fold cross validation. +The results show its superior correlation performance when compared to ten +existing no-reference quality metrics. + +
+
+ comment: This paper has been accepted by the ECCV 2024 AIM Advances in Image + Manipulation workshop +
+
+
+
+
+ + ♻ ☆ Group-aware Parameter-efficient Updating for Content-Adaptive Neural + Video Compression ACM MM 2024 + + +
+ Content-adaptive compression is crucial for enhancing the adaptability of the +pre-trained neural codec for various contents. Although these methods have been +very practical in neural image compression (NIC), their application in neural +video compression (NVC) is still limited due to two main aspects: 1), video +compression relies heavily on temporal redundancy, therefore updating just one +or a few frames can lead to significant errors accumulating over time; 2), NVC +frameworks are generally more complex, with many large components that are not +easy to update quickly during encoding. To address the previously mentioned +challenges, we have developed a content-adaptive NVC technique called +Group-aware Parameter-Efficient Updating (GPU). Initially, to minimize error +accumulation, we adopt a group-aware approach for updating encoder parameters. +This involves adopting a patch-based Group of Pictures (GoP) training strategy +to segment a video into patch-based GoPs, which will be updated to facilitate a +globally optimized domain-transferable solution. Subsequently, we introduce a +parameter-efficient delta-tuning strategy, which is achieved by integrating +several light-weight adapters into each coding component of the encoding +process by both serial and parallel configuration. Such architecture-agnostic +modules stimulate the components with large parameters, thereby reducing both +the update cost and the encoding time. We incorporate our GPU into the latest +NVC framework and conduct comprehensive experiments, whose results showcase +outstanding video compression efficiency across four video benchmarks and +adaptability of one medical image benchmark. + +
+
+ comment: Accepted by ACM MM 2024, Melbourne, Australia +
+
+
+
+
+ + ♻ ☆ CrossDF: Improving Cross-Domain Deepfake Detection with Deep Information + Decomposition + + +
+ Deepfake technology poses a significant threat to security and social trust. +Although existing detection methods have shown high performance in identifying +forgeries within datasets that use the same deepfake techniques for both +training and testing, they suffer from sharp performance degradation when faced +with cross-dataset scenarios where unseen deepfake techniques are tested. To +address this challenge, we propose a Deep Information Decomposition (DID) +framework to enhance the performance of Cross-dataset Deepfake Detection +(CrossDF). Unlike most existing deepfake detection methods, our framework +prioritizes high-level semantic features over specific visual artifacts. +Specifically, it adaptively decomposes facial features into deepfake-related +and irrelevant information, only using the intrinsic deepfake-related +information for real/fake discrimination. Moreover, it optimizes these two +kinds of information to be independent with a de-correlation learning module, +thereby enhancing the model's robustness against various irrelevant information +changes and generalization ability to unseen forgery methods. Our extensive +experimental evaluation and comparison with existing state-of-the-art detection +methods validate the effectiveness and superiority of the DID framework on +cross-dataset deepfake detection. + +
+
+
+
+
+ + ♻ ☆ Towards Extreme Image Compression with Latent Feature Guidance and + Diffusion Prior + + +
+ Image compression at extremely low bitrates (below 0.1 bits per pixel (bpp)) +is a significant challenge due to substantial information loss. In this work, +we propose a novel two-stage extreme image compression framework that exploits +the powerful generative capability of pre-trained diffusion models to achieve +realistic image reconstruction at extremely low bitrates. In the first stage, +we treat the latent representation of images in the diffusion space as +guidance, employing a VAE-based compression approach to compress images and +initially decode the compressed information into content variables. The second +stage leverages pre-trained stable diffusion to reconstruct images under the +guidance of content variables. Specifically, we introduce a small control +module to inject content information while keeping the stable diffusion model +fixed to maintain its generative capability. Furthermore, we design a space +alignment loss to force the content variables to align with the diffusion space +and provide the necessary constraints for optimization. Extensive experiments +demonstrate that our method significantly outperforms state-of-the-art +approaches in terms of visual performance at extremely low bitrates. The source +code and trained models are available at https://github.com/huai-chang/DiffEIC. + +
+
+ comment: Accepted by IEEE TCSVT +
+
+
+
+
+ + ♻ ☆ Robust Semi-supervised Multimodal Medical Image Segmentation via Cross + Modality Collaboration + + +
+ Multimodal learning leverages complementary information derived from +different modalities, thereby enhancing performance in medical image +segmentation. However, prevailing multimodal learning methods heavily rely on +extensive well-annotated data from various modalities to achieve accurate +segmentation performance. This dependence often poses a challenge in clinical +settings due to limited availability of such data. Moreover, the inherent +anatomical misalignment between different imaging modalities further +complicates the endeavor to enhance segmentation performance. To address this +problem, we propose a novel semi-supervised multimodal segmentation framework +that is robust to scarce labeled data and misaligned modalities. Our framework +employs a novel cross modality collaboration strategy to distill +modality-independent knowledge, which is inherently associated with each +modality, and integrates this information into a unified fusion layer for +feature amalgamation. With a channel-wise semantic consistency loss, our +framework ensures alignment of modality-independent information from a +feature-wise perspective across modalities, thereby fortifying it against +misalignments in multimodal scenarios. Furthermore, our framework effectively +integrates contrastive consistent learning to regulate anatomical structures, +facilitating anatomical-wise prediction alignment on unlabeled data in +semi-supervised segmentation tasks. Our method achieves competitive performance +compared to other multimodal methods across three tasks: cardiac, abdominal +multi-organ, and thyroid-associated orbitopathy segmentations. It also +demonstrates outstanding robustness in scenarios involving scarce labeled data +and misaligned modalities. + +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Intracranial Hemorrhage Segmentation with YOLO and an + Uncertainty Rectified Segment Anything Model + + +
+ Intracranial hemorrhage (ICH) is a life-threatening condition that requires +rapid and accurate diagnosis to improve treatment outcomes and patient survival +rates. Recent advancements in supervised deep learning have greatly improved +the analysis of medical images, but often rely on extensive datasets with +high-quality annotations, which are costly, time-consuming, and require medical +expertise to prepare. To mitigate the need for large amounts of expert-prepared +segmentation data, we have developed a novel weakly supervised ICH segmentation +method that utilizes the YOLO object detection model and an +uncertainty-rectified Segment Anything Model (SAM). In addition, we have +proposed a novel point prompt generator for this model to further improve +segmentation results with YOLO-predicted bounding box prompts. Our approach +achieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along +with a mean Dice score of 0.629 for ICH segmentation, outperforming existing +weakly supervised and popular supervised (UNet and Swin-UNETR) approaches. +Overall, the proposed method provides a robust and accurate alternative to the +more commonly used supervised techniques for ICH quantification without +requiring refined segmentation ground truths during model training. + +
+
+ comment: Manuscript was accepted at SWITCH2024. 10 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Learned Image Transmission with Hierarchical Variational Autoencoder + + +
+ In this paper, we introduce an innovative hierarchical joint source-channel +coding (HJSCC) framework for image transmission, utilizing a hierarchical +variational autoencoder (VAE). Our approach leverages a combination of +bottom-up and top-down paths at the transmitter to autoregressively generate +multiple hierarchical representations of the original image. These +representations are then directly mapped to channel symbols for transmission by +the JSCC encoder. We extend this framework to scenarios with a feedback link, +modeling transmission over a noisy channel as a probabilistic sampling process +and deriving a novel generative formulation for JSCC with feedback. Compared +with existing approaches, our proposed HJSCC provides enhanced adaptability by +dynamically adjusting transmission bandwidth, encoding these representations +into varying amounts of channel symbols. Additionally, we introduce a rate +attention module to guide the JSCC encoder in optimizing its encoding strategy +based on prior information. Extensive experiments on images of varying +resolutions demonstrate that our proposed model outperforms existing baselines +in rate-distortion performance and maintains robustness against channel noise. + +
+
+
+
+
+ + ♻ ☆ A Novel Approach to Classify Power Quality Signals Using Vision + Transformers + + +
+ With the rapid integration of electronically interfaced renewable energy +resources and loads into smart grids, there is increasing interest in power +quality disturbances (PQD) classification to enhance the security and +efficiency of these grids. This paper introduces a new approach to PQD +classification based on the Vision Transformer (ViT) model. When a PQD occurs, +the proposed approach first converts the power quality signal into an image and +then utilizes a pre-trained ViT to accurately determine the class of the PQD. +Unlike most previous works, which were limited to a few disturbance classes or +small datasets, the proposed method is trained and tested on a large dataset +with 17 disturbance classes. Our experimental results show that the proposed +ViT-based approach achieves PQD classification precision and recall of 98.28% +and 97.98%, respectively, outperforming recently proposed techniques applied to +the same dataset. + +
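The abstract does not specify how the 1-D power quality signal is converted into an image; a simple, common option is to reshape the normalized waveform into a square grayscale matrix and feed it to a ViT. The sketch below does exactly that with torchvision's ViT-B/16; the reshape-based conversion, the signal length, and the 17-class head are illustrative assumptions rather than the paper's pipeline.

```python
import torch
import torch.nn.functional as F
from torchvision.models import vit_b_16

def signal_to_image(signal, side=64):
    """Reshape a normalized 1-D waveform into a 3x224x224 tensor for a ViT."""
    signal = (signal - signal.min()) / (signal.max() - signal.min() + 1e-8)
    img = signal[: side * side].reshape(1, 1, side, side)
    img = F.interpolate(img, size=(224, 224), mode="bilinear", align_corners=False)
    return img.repeat(1, 3, 1, 1)            # grayscale -> 3 channels

# In practice one would pass weights=ViT_B_16_Weights.DEFAULT to start from
# ImageNet pre-training, as the paper relies on a pre-trained ViT.
model = vit_b_16(weights=None)
model.heads = torch.nn.Linear(768, 17)        # 17 disturbance classes

t = torch.linspace(0, 10 * 2 * torch.pi, 64 * 64)
signal = torch.sin(t) + 0.3 * torch.sin(7 * t)    # toy distorted waveform
logits = model(signal_to_image(signal))
print(logits.shape)                               # torch.Size([1, 17])
```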
+
+ comment: IECON 2024-50th Annual Conference of the IEEE Industrial Electronics + Society, Chicago, U.S.A, 2024, pp. 1-6 +
+
+
+
+
+ + ♻ ☆ Hand1000: Generating Realistic Hands from Text with Only 1,000 Images + + +
+ Text-to-image generation models have achieved remarkable advancements in +recent years, aiming to produce realistic images from textual descriptions. +However, these models often struggle with generating anatomically accurate +representations of human hands. The resulting images frequently exhibit issues +such as incorrect numbers of fingers, unnatural twisting or interlacing of +fingers, or blurred and indistinct hands. These issues stem from the inherent +complexity of hand structures and the difficulty in aligning textual +descriptions with precise visual depictions of hands. To address these +challenges, we propose a novel approach named Hand1000 that enables the +generation of realistic hand images with target gesture using only 1,000 +training samples. The training of Hand1000 is divided into three stages with +the first stage aiming to enhance the model's understanding of hand anatomy by +using a pre-trained hand gesture recognition model to extract gesture +representation. The second stage further optimizes text embedding by +incorporating the extracted hand gesture representation, to improve alignment +between the textual descriptions and the generated hand images. The third stage +utilizes the optimized embedding to fine-tune the Stable Diffusion model to +generate realistic hand images. In addition, we construct the first publicly +available dataset specifically designed for text-to-hand image generation. +Based on the existing hand gesture recognition dataset, we adopt advanced image +captioning models and LLaMA3 to generate high-quality textual descriptions +enriched with detailed gesture information. Extensive experiments demonstrate +that Hand1000 significantly outperforms existing models in producing +anatomically correct hand images while faithfully representing other details in +the text, such as faces, clothing, and colors. + +
+
+ comment: Project page https://haozhuo-zhang.github.io/Hand1000-project-page/ +
+
+
+
+
+ + ♻ ☆ MV-VTON: Multi-View Virtual Try-On with Diffusion Models + + +
+ The goal of image-based virtual try-on is to generate an image of the target +person naturally wearing the given clothing. However, existing methods solely +focus on the frontal try-on using the frontal clothing. When the views of the +clothing and person are significantly inconsistent, particularly when the +person's view is non-frontal, the results are unsatisfactory. To address this +challenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to +reconstruct the dressing results from multiple views using the given clothes. +Given that single-view clothes provide insufficient information for MV-VTON, we +instead employ two images, i.e., the frontal and back views of the clothing, to +encompass the complete view as much as possible. Moreover, we adopt diffusion +models that have demonstrated superior abilities to perform our MV-VTON. In +particular, we propose a view-adaptive selection method where hard-selection +and soft-selection are applied to the global and local clothing feature +extraction, respectively. This ensures that the clothing features are roughly +fit to the person's view. Subsequently, we suggest joint attention blocks to +align and fuse clothing features with person features. Additionally, we collect +a MV-VTON dataset MVG, in which each person has multiple photos with diverse +views and poses. Experiments show that the proposed method not only achieves +state-of-the-art results on MV-VTON task using our MVG dataset, but also has +superiority on frontal-view virtual try-on task using VITON-HD and DressCode +datasets. Codes and datasets are publicly released at +https://github.com/hywang2002/MV-VTON . + +
+
+ comment: Project url: https://hywang2002.github.io/MV-VTON/ +
+
+
+
+
+ + ♻ ☆ Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in + Federated Class Continual Learning ECCV 2024 + + +
+ Federated Class Continual Learning (FCCL) merges the challenges of +distributed client learning with the need for seamless adaptation to new +classes without forgetting old ones. The key challenge in FCCL is catastrophic +forgetting, an issue that has been explored to some extent in Continual +Learning (CL). However, due to privacy preservation requirements, some +conventional methods, such as experience replay, are not directly applicable to +FCCL. Existing FCCL methods mitigate forgetting by generating historical data +through federated training of GANs or data-free knowledge distillation. +However, these approaches often suffer from unstable training of generators or +low-quality generated data, limiting their guidance for the model. To address +this challenge, we propose a novel method of data replay based on diffusion +models. Instead of training a diffusion model, we employ a pre-trained +conditional diffusion model to reverse-engineer each class, searching the +corresponding input conditions for each class within the model's input space, +significantly reducing computational resources and time consumption while +ensuring effective generation. Furthermore, we enhance the classifier's domain +generalization ability on generated and real data through contrastive learning, +indirectly improving the representational capability of generated data for real +data. Comprehensive experiments demonstrate that our method significantly +outperforms existing baselines. Code is available at +https://github.com/jinglin-liang/DDDR. + +
+
+ comment: Accepted by ECCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ ORMNet: Object-centric Relationship Modeling for Egocentric Hand-object + Segmentation + + +
+ Egocentric hand-object segmentation (EgoHOS) is a promising new task aiming +at segmenting hands and interacting objects in egocentric images. Although +EgoHOS has the potential to enable various applications, current methods +struggle to achieve both high performance and end-to-end optimization +simultaneously. Moreover, existing approaches fail to fully leverage hand cues +to assist the interacting-object segmentation and overlook the coupled +relationships between diverse interacting-object categories, resulting in +performance deficiencies. To address these limitations, this paper proposes a +novel Object-centric Relationship Modeling Network (ORMNet) to fulfill +end-to-end and effective EgoHOS by modeling relationships between hands and +objects as well as objects and objects. Specifically, a Hand-Object Relation +(HOR) module is introduced to capture the correlation between hands and +objects, which uses hand features to guide the network to extract more +distinguishing interacting-object features. Besides, we find the coupling +relations between diverse interacting-object categories and design the Object +Relation Decoupling (ORD) strategy to disentangle them, emphasizing learning of +the interaction between hands and objects and reducing the confusion of +interacting-object classification. In-domain experiments show that ORMNet has +notably exceptional segmentation performance compared with state-of-the-art +methods, while out-of-domain experiments further exhibit its robust +generalization capability. The project is available at +https://github.com/yuggiehk/ORMNet/ + +
+
+
+
+
+ + ♻ ☆ MADE-for-ASD: A Multi-Atlas Deep Ensemble Network for Diagnosing Autism + Spectrum Disorder + + +
+ In response to the global need for efficient early diagnosis of Autism +Spectrum Disorder (ASD), this paper bridges the gap between traditional, +time-consuming diagnostic methods and potential automated solutions. We propose +a multi-atlas deep ensemble network, MADE-for-ASD, that integrates multiple +atlases of the brain's functional magnetic resonance imaging (fMRI) data +through a weighted deep ensemble network. Our approach integrates demographic +information into the prediction workflow, which enhances ASD diagnosis +performance and offers a more holistic perspective on patient profiling. We +experiment with the well-known publicly available ABIDE (Autism Brain Imaging +Data Exchange) I dataset, consisting of resting state fMRI data from 17 +different laboratories around the globe. Our proposed system achieves 75.20% +accuracy on the entire dataset and 96.40% on a specific subset $-$ both +surpassing reported ASD diagnosis accuracy in ABIDE I fMRI studies. +Specifically, our model improves by 4.4 percentage points over prior works on +the same amount of data. The model exhibits a sensitivity of 82.90% and a +specificity of 69.70% on the entire dataset, and 91.00% and 99.50%, +respectively, on the specific subset. We leverage the F-score to pinpoint the +top 10 ROI in ASD diagnosis, such as precuneus and anterior +cingulate/ventromedial. The proposed system can potentially pave the way for +more cost-effective, efficient and scalable strategies in ASD diagnosis. Codes +and evaluations are publicly available at +https://github.com/hasan-rakibul/MADE-for-ASD. + +
+
+ comment: Xuehan Liu and Md Rakibul Hasan contributed equally to this work +
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+ + ♻ ☆ Asynchronous Blob Tracker for Event Cameras + + +
+ Event-based cameras are popular for tracking fast-moving objects due to their
+high temporal resolution, low latency, and high dynamic range. In this paper,
+we propose a novel algorithm for tracking event blobs using raw events
+asynchronously in real time. We introduce the concept of an event blob as a
+spatio-temporal likelihood of event occurrence where the conditional spatial
+likelihood is blob-like. Many real-world objects, such as car headlights or
+other quickly moving foreground objects, generate event blob data. The
+proposed algorithm uses a nearest neighbour classifier with a dynamic
+threshold criterion for data association, coupled with an extended Kalman
+filter to track the event blob state. Our algorithm achieves highly accurate
+blob tracking, velocity estimation, and shape estimation even under
+challenging lighting conditions and high-speed motions (> 11000 pixels/s). The
+microsecond time resolution achieved means that the filter output can be used
+to derive secondary information such as time-to-contact or range estimates,
+enabling applications to real-world problems such as collision avoidance in
+autonomous driving.
+
+
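The association-plus-filtering loop described above can be illustrated with a small, hypothetical sketch (not the authors' implementation): each incoming event is matched to the nearest tracked blob if it falls within a gate, and the blob's constant-velocity Kalman state is then corrected with the event position. The gate size, noise levels, and state layout are all assumptions.

```python
import numpy as np

class SimpleBlobTracker:
    """Toy asynchronous tracker: one Kalman filter per blob, updated per event."""

    def __init__(self, gate=5.0, q=1e-3, r=0.5):
        self.tracks = []                 # each track: state x=[px, py, vx, vy], cov P, last time t
        self.gate, self.q, self.r = gate, q, r

    def _predict(self, trk, t):
        dt = t - trk["t"]
        F = np.array([[1, 0, dt, 0], [0, 1, 0, dt], [0, 0, 1, 0], [0, 0, 0, 1]])
        trk["x"] = F @ trk["x"]
        trk["P"] = F @ trk["P"] @ F.T + self.q * np.eye(4)
        trk["t"] = t

    def update(self, t, px, py):
        z = np.array([px, py], dtype=float)
        # nearest-neighbour association against the predicted blob positions
        best, best_d = None, self.gate
        for trk in self.tracks:
            self._predict(trk, t)
            d = np.linalg.norm(trk["x"][:2] - z)
            if d < best_d:
                best, best_d = trk, d
        if best is None:                 # no blob close enough: start a new one
            self.tracks.append({"x": np.array([px, py, 0.0, 0.0]),
                                "P": np.eye(4), "t": t})
            return
        # Kalman measurement update with the event position
        H = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], dtype=float)
        S = H @ best["P"] @ H.T + self.r * np.eye(2)
        K = best["P"] @ H.T @ np.linalg.inv(S)
        best["x"] = best["x"] + K @ (z - H @ best["x"])
        best["P"] = (np.eye(4) - K @ H) @ best["P"]

tracker = SimpleBlobTracker()
for t, x, y in [(0.000, 10.0, 10.0), (0.001, 10.2, 10.1), (0.002, 10.4, 10.2)]:
    tracker.update(t, x, y)
print(tracker.tracks[0]["x"])            # estimated [px, py, vx, vy]
```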
+
+ comment: 18 pages, 16 figures. The manuscript was accepted on August 7, 2024, + by IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video + Solution to Enhance Community Safety + + +
+ This article adopts and evaluates an AI-enabled Smart Video Solution (SVS) +designed to enhance safety in the real world. The system integrates with +existing infrastructure camera networks, leveraging recent advancements in AI +for easy adoption. Prioritizing privacy and ethical standards, pose based data +is used for downstream AI tasks such as anomaly detection. Cloud-based +infrastructure and mobile app are deployed, enabling real-time alerts within +communities. The SVS employs innovative data representation and visualization +techniques, such as the Occupancy Indicator, Statistical Anomaly Detection, +Bird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance +public safety. Evaluation of the SVS demonstrates its capacity to convert +complex computer vision outputs into actionable insights for stakeholders, +community partners, law enforcement, urban planners, and social scientists. +This article presents a comprehensive real-world deployment and evaluation of +the SVS, implemented in a community college environment across 16 cameras. The +system integrates AI-driven visual processing, supported by statistical +analysis, database management, cloud communication, and user notifications. +Additionally, the article evaluates the end-to-end latency from the moment an +AI algorithm detects anomalous behavior in real-time at the camera level to the +time stakeholders receive a notification. The results demonstrate the system's +robustness, effectively managing 16 CCTV cameras with a consistent throughput +of 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end +latency of 26.76 seconds between anomaly detection and alert issuance. + +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Bioinformatics Retrieval Augmentation Data (BRAD) Digital Assistant + + +
+ We present a prototype for a Bioinformatics Retrieval Augmentation Data
+(BRAD) digital assistant. BRAD integrates a suite of tools to handle a wide
+range of bioinformatics tasks, from code execution to online search. We
+demonstrate BRAD's capabilities through (1) improved question answering with
+retrieval augmented generation (RAG), (2) BRAD's ability to run and write
+complex software pipelines, and (3) BRAD's ability to organize and distribute
+tasks across individual agents and teams of agents. We use BRAD to automate
+bioinformatics workflows, performing tasks ranging from gene enrichment and
+archive search to automatic code generation and running biomarker
+identification pipelines. BRAD is a step toward the ultimate goal of
+developing a digital twin of laboratories driven by self-contained loops for
+hypothesis generation and testing of digital biology experiments.
+
+
+
+
+
+
+ + ☆ Building a Scalable, Effective, and Steerable Search and Ranking + Platform + + +
+ Modern e-commerce platforms offer vast product selections, making it +difficult for customers to find items that they like and that are relevant to +their current session intent. This is why it is key for e-commerce platforms to +have near real-time scalable and adaptable personalized ranking and search +systems. While numerous methods exist in the scientific literature for building +such systems, many are unsuitable for large-scale industrial use due to +complexity and performance limitations. Consequently, industrial ranking +systems often resort to computationally efficient yet simplistic retrieval or +candidate generation approaches, which overlook near real-time and +heterogeneous customer signals, which results in a less personalized and +relevant experience. Moreover, related customer experiences are served by +completely different systems, which increases complexity, maintenance, and +inconsistent experiences. + In this paper, we present a personalized, adaptable near real-time ranking +platform that is reusable across various use cases, such as browsing and +search, and that is able to cater to millions of items and customers under +heavy load (thousands of requests per second). We employ transformer-based +models through different ranking layers which can learn complex behavior +patterns directly from customer action sequences while being able to +incorporate temporal (e.g. in-session) and contextual information. We validate +our system through a series of comprehensive offline and online real-world +experiments at a large online e-commerce platform, and we demonstrate its +superiority when compared to existing systems, both in terms of customer +experience as well as in net revenue. Finally, we share the lessons learned +from building a comprehensive, modern ranking platform for use in a large-scale +e-commerce environment. + +
+
+
+
+
+
+ ☆ Pooling And Attention: What Are Effective Designs For LLM-Based
+ Embedding Models?
+
+
+
+ The significant advancements of Large Language Models (LLMs) in generative +tasks have led to a growing body of work exploring LLM-based embedding models. +While these models, employing different pooling and attention strategies, have +achieved state-of-the-art performance on public embedding benchmarks, questions +still arise about what constitutes an effective design for LLM-based embedding +models. However, these models are often trained on different datasets, using +different LLM base models or training settings. Moreover, evaluations on public +embedding benchmarks often fail to report statistical significance, making it +difficult to determine which designs truly contribute to final performance. +This complicates the process for practitioners seeking optimal training recipes +for LLM-based embedding models. In this study, we conduct a large-scale +experiment by training a series of LLM-based embedding models using the same +training data and base model but differing in their pooling and attention +strategies. The results show that there is no one-size-fits-all solution: while +bidirectional attention and an additional trainable pooling layer outperform in +text similarity and information retrieval tasks, they do not significantly +surpass simpler designs like EOS-last token pooling and default causal +attention in clustering and classification tasks. Furthermore, we propose a new +pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs +of all hidden layers, rather than just the last layer, using a cross-attention +network. This method proves to be statistically superior in text similarity and +retrieval tasks compared to existing pooling methods. Overall, this paper sheds +light on effective training strategies for LLM-based embedding models. + +
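As a reference point for the pooling strategies compared above, the sketch below shows two of the simpler designs, mean pooling and last-token (EOS-style) pooling, over a batch of decoder hidden states. Shapes and mask handling are assumptions; the paper's Multi-Layers Trainable Pooling additionally learns a cross-attention over all hidden layers and is not reproduced here.

```python
import torch

def mean_pool(hidden, mask):
    """hidden: (batch, seq, dim); mask: (batch, seq) with 1 for real tokens."""
    mask = mask.unsqueeze(-1).float()
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)

def last_token_pool(hidden, mask):
    """Take the hidden state of the last non-padding token (EOS-style pooling)."""
    lengths = mask.sum(dim=1) - 1                      # index of last real token
    idx = lengths.long().view(-1, 1, 1).expand(-1, 1, hidden.size(-1))
    return hidden.gather(1, idx).squeeze(1)

hidden = torch.randn(2, 5, 8)                          # fake decoder hidden states
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
print(mean_pool(hidden, mask).shape, last_token_pool(hidden, mask).shape)
```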
+
+ comment: https://github.com/yixuantt/PoolingAndAttn +
+
+
+
+
+ + ☆ RouterRetriever: Exploring the Benefits of Routing over Multiple Expert + Embedding Models + + +
+ Information retrieval methods often rely on a single embedding model trained +on large, general-domain datasets like MSMARCO. While this approach can produce +a retriever with reasonable overall performance, models trained on +domain-specific data often yield better results within their respective +domains. While prior work in information retrieval has tackled this through +multi-task training, the topic of combining multiple domain-specific expert +retrievers remains unexplored, despite its popularity in language model +generation. In this work, we introduce RouterRetriever, a retrieval model that +leverages multiple domain-specific experts along with a routing mechanism to +select the most appropriate expert for each query. It is lightweight and allows +easy addition or removal of experts without additional training. Evaluation on +the BEIR benchmark demonstrates that RouterRetriever outperforms both +MSMARCO-trained (+2.1 absolute nDCG@10) and multi-task trained (+3.2) models. +This is achieved by employing our routing mechanism, which surpasses other +routing techniques (+1.8 on average) commonly used in language modeling. +Furthermore, the benefit generalizes well to other datasets, even in the +absence of a specific expert on the dataset. To our knowledge, RouterRetriever +is the first work to demonstrate the advantages of using multiple +domain-specific expert embedding models with effective routing over a single, +general-purpose embedding model in retrieval tasks. + +
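The abstract does not spell out the routing mechanism, so the following is only a hedged sketch of the general idea: each domain expert is paired with a "pilot" embedding (here just a random centroid), the query is routed to the expert whose pilot is most similar, and that expert produces the final embedding. The expert models are stand-in random projections, not trained retrievers.

```python
import numpy as np

rng = np.random.default_rng(0)
DIM = 16

# Hypothetical domain experts: random projection matrices standing in for
# domain-specific embedding models, plus one centroid ("pilot") per domain.
experts = {d: rng.normal(size=(DIM, DIM)) for d in ["bio", "finance", "law"]}
pilots  = {d: rng.normal(size=DIM) for d in experts}

def route(query_vec):
    """Pick the expert whose pilot embedding is most similar to the query."""
    def cos(a, b):
        return a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)
    return max(pilots, key=lambda d: cos(query_vec, pilots[d]))

def embed(query_vec):
    domain = route(query_vec)
    return domain, experts[domain] @ query_vec

q = rng.normal(size=DIM)
domain, vec = embed(q)
print("routed to:", domain, "embedding norm:", round(float(np.linalg.norm(vec)), 3))
```

Adding or removing an expert only requires adding or removing an entry in the two dictionaries, which mirrors the claim that no additional training is needed to extend the pool.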
+
+
+
+
+ + ☆ A Fashion Item Recommendation Model in Hyperbolic Space CVPR 2024 + + +
+ In this work, we propose a fashion item recommendation model that +incorporates hyperbolic geometry into user and item representations. Using +hyperbolic space, our model aims to capture implicit hierarchies among items +based on their visual data and users' purchase history. During training, we +apply a multi-task learning framework that considers both hyperbolic and +Euclidean distances in the loss function. Our experiments on three data sets +show that our model performs better than previous models trained in Euclidean +space only, confirming the effectiveness of our model. Our ablation studies +show that multi-task learning plays a key role, and removing the Euclidean loss +substantially deteriorates the model performance. + +
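A minimal sketch of the kind of multi-task loss described above, assuming a Poincaré-ball hyperbolic distance combined with a Euclidean distance inside a margin ranking loss; the mixing weight, margin, and clamping are assumptions rather than the paper's exact formulation.

```python
import torch

def poincare_dist(u, v, eps=1e-5):
    """Poincare-ball distance: arcosh(1 + 2|u-v|^2 / ((1-|u|^2)(1-|v|^2)))."""
    u2 = u.pow(2).sum(-1).clamp(max=1 - eps)
    v2 = v.pow(2).sum(-1).clamp(max=1 - eps)
    diff2 = (u - v).pow(2).sum(-1)
    x = 1 + 2 * diff2 / ((1 - u2) * (1 - v2))
    return torch.acosh(x.clamp(min=1 + eps))

def pair_loss(user, pos_item, neg_item, alpha=0.5, margin=1.0):
    """Margin loss mixing hyperbolic and Euclidean distances (alpha is assumed)."""
    def score(a, b):
        return alpha * poincare_dist(a, b) + (1 - alpha) * (a - b).norm(dim=-1)
    return torch.relu(margin + score(user, pos_item) - score(user, neg_item)).mean()

user = torch.rand(4, 8) * 0.1            # keep points well inside the unit ball
pos, neg = torch.rand(4, 8) * 0.1, torch.rand(4, 8) * 0.1
print(pair_loss(user, pos, neg))
```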
+
+ comment: This work was presented at the CVFAD Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ AlignGroup: Learning and Aligning Group Consensus with Member + Preferences for Group Recommendation CIKM 2024 + + +
+ Group activities are important behaviors in human society; providing
+personalized recommendations for groups is referred to as the group
+recommendation task. Existing methods can usually be categorized into two
+strategies to infer group preferences: 1) determining group preferences by
+aggregating members' personalized preferences, and 2) inferring group
+consensus by capturing group members' coherent decisions after common
+compromises. However, the former suffers from a lack of group-level
+considerations, and the latter overlooks the fine-grained preferences of
+individual users. To this end, we propose a novel group recommendation method,
+AlignGroup, which focuses on both group consensus and the individual
+preferences of group members to infer group decision-making. Specifically,
+AlignGroup explores group consensus through a well-designed hypergraph neural
+network that efficiently learns intra- and inter-group relationships.
+Moreover, AlignGroup innovatively utilizes a self-supervised alignment task to
+capture fine-grained group decision-making by aligning the group consensus
+with members' common preferences. Extensive experiments on two real-world
+datasets validate that AlignGroup outperforms the state-of-the-art on both the
+group recommendation task and the user recommendation task, while also being
+more efficient than most baselines.
+
+
+
+ comment: 10 pages, accepted by CIKM 2024 +
+
+
+
+
+ + ☆ iRangeGraph: Improvising Range-dedicated Graphs for Range-filtering + Nearest Neighbor Search SIGMOD 2025 + + +
+ Range-filtering approximate nearest neighbor (RFANN) search is attracting +increasing attention in academia and industry. Given a set of data objects, +each being a pair of a high-dimensional vector and a numeric value, an RFANN +query with a vector and a numeric range as parameters returns the data object +whose numeric value is in the query range and whose vector is nearest to the +query vector. To process this query, a recent study proposes to build $O(n^2)$ +dedicated graph-based indexes for all possible query ranges to enable efficient +processing on a database of $n$ objects. As storing all these indexes is +prohibitively expensive, the study constructs compressed indexes instead, which +reduces the memory consumption considerably. However, this incurs suboptimal +performance because the compression is lossy. In this study, instead of +materializing a compressed index for every possible query range in preparation +for querying, we materialize graph-based indexes, called elemental graphs, for +a moderate number of ranges. We then provide an effective and efficient +algorithm that during querying can construct an index for any query range using +the elemental graphs. We prove that the time needed to construct such an index +is low. We also cover an experimental study on real-world datasets that +provides evidence that the materialized elemental graphs only consume moderate +space and that the proposed method is capable of superior and stable query +performance across different query workloads. + +
+
+ comment: The paper has been accepted by SIGMOD 2025 +
+
+
+
+
+ + ☆ An Effective Tag Assignment Approach for Billboard Advertisement + + +
+ Billboard Advertisement has gained popularity due to its significant return
+on investment. To make this advertising approach more effective, the relevant
+information about the product needs to reach the relevant set of people. This
+can be achieved if the relevant set of tags can be mapped to the correct
+slots. Formally, we call this problem the Tag Assignment Problem in Billboard
+Advertisement. Given trajectory and billboard databases and a set of selected
+billboard slots and tags, this problem asks for a mapping of the selected tags
+to the selected slots such that the influence is maximized. We model this as a
+variant of traditional bipartite matching called One-To-Many Bipartite
+Matching (OMBM). Unlike traditional bipartite matching, where a tag can be
+assigned to only one slot, in OMBM a tag can be assigned to multiple slots,
+while the converse cannot happen. We propose an iterative solution approach
+that incrementally allocates the tags to the slots, and we explain the
+methodology with an illustrative example. A complexity analysis of the
+proposed solution approach is also provided. Experimental results on
+real-world trajectory and billboard datasets support our claims about the
+effectiveness and efficiency of the proposed solution.
+
+
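The paper's iterative allocation is not specified here, so the sketch below is only a greedy illustration of one-to-many bipartite matching: each slot receives exactly one tag, a tag may cover several slots, and (tag, slot) pairs are chosen in decreasing order of made-up influence values.

```python
# Greedy one-to-many assignment: each slot gets exactly one tag, a tag may
# cover several slots. The influence values below are illustrative only.
influence = {                      # influence[(tag, slot)]
    ("shoes", "s1"): 9, ("shoes", "s2"): 4, ("shoes", "s3"): 2,
    ("phone", "s1"): 6, ("phone", "s2"): 8, ("phone", "s3"): 3,
    ("pizza", "s1"): 1, ("pizza", "s2"): 2, ("pizza", "s3"): 7,
}

def greedy_ombm(influence):
    slots = {s for _, s in influence}
    assignment = {}
    while len(assignment) < len(slots):
        # pick the best remaining (tag, slot) pair for a still-unassigned slot
        (tag, slot), _ = max(
            ((k, v) for k, v in influence.items() if k[1] not in assignment),
            key=lambda kv: kv[1],
        )
        assignment[slot] = tag
    return assignment

print(greedy_ombm(influence))      # {'s1': 'shoes', 's2': 'phone', 's3': 'pizza'}
```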
+
+ comment: This Paper has been accepted at The 25th International Web + Information Systems Engineering Conference (WISE-2024) +
+
+
+
+
+ + ☆ Deep Adaptive Interest Network: Personalized Recommendation with + Context-Aware Learning + + +
+ In personalized recommendation systems, accurately capturing users' evolving +interests and combining them with contextual information is a critical research +area. This paper proposes a novel model called the Deep Adaptive Interest +Network (DAIN), which dynamically models users' interests while incorporating +context-aware learning mechanisms to achieve precise and adaptive personalized +recommendations. DAIN leverages deep learning techniques to build an adaptive +interest network structure that can capture users' interest changes in +real-time while further optimizing recommendation results by integrating +contextual information. Experiments conducted on several public datasets +demonstrate that DAIN excels in both recommendation performance and +computational efficiency. This research not only provides a new solution for +personalized recommendation systems but also offers fresh insights into the +application of context-aware learning in recommendation systems. + +
+
+
+
+
+ + ☆ NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for + Retrieval + + +
+ $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval) +from pre-trained embedding models is the predominant retrieval method for text +and images, as well as Retrieval-Augmented Generation (RAG) pipelines. In +practice, application developers often fine-tune the embeddings to improve +their accuracy on the dataset and query workload in hand. Existing approaches +either fine-tune the pre-trained model itself or, more efficiently, but at the +cost of accuracy, train adaptor models to transform the output of the +pre-trained model. We present NUDGE, a family of novel non-parametric embedding +fine-tuning approaches that are significantly more accurate and efficient than +both sets of existing approaches. NUDGE directly modifies the embeddings of +data records to maximize the accuracy of $k$-NN retrieval. We present a +thorough theoretical and experimental study of NUDGE's non-parametric approach. +We show that even though the underlying problem is NP-Hard, constrained +variations can be solved efficiently. These constraints additionally ensure +that the changes to the embeddings are modest, avoiding large distortions to +the semantics learned during pre-training. In experiments across five +pre-trained models and nine standard text and image retrieval datasets, NUDGE +runs in minutes and often improves NDCG@10 by more than 10% over existing +fine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase +in accuracy and runs 200x and 3x faster, respectively, over fine-tuning the +pre-trained model and training adaptors. + +
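A toy, non-parametric variant of the idea described above: each record embedding is moved a bounded step toward the mean of the training queries that should retrieve it, then re-normalized. The step-size constraint stands in for NUDGE's constrained optimization and is an assumption, not the paper's solver.

```python
import numpy as np

def nudge_embeddings(data_emb, query_emb, positives, max_shift=0.1):
    """Move each data embedding toward the queries that should retrieve it.

    data_emb:  (n, d) record embeddings, query_emb: (m, d) training query embeddings,
    positives: list of (query_index, record_index) relevance pairs.
    """
    data_emb = data_emb.copy()
    for rec in range(data_emb.shape[0]):
        q_idx = [q for q, r in positives if r == rec]
        if not q_idx:
            continue
        target = query_emb[q_idx].mean(axis=0)
        delta = target - data_emb[rec]
        norm = np.linalg.norm(delta)
        if norm > max_shift:                       # keep the change modest
            delta *= max_shift / norm
        data_emb[rec] += delta
    # re-normalize so cosine / inner-product retrieval stays well behaved
    return data_emb / np.linalg.norm(data_emb, axis=1, keepdims=True)

rng = np.random.default_rng(0)
data, queries = rng.normal(size=(5, 8)), rng.normal(size=(3, 8))
print(nudge_embeddings(data, queries, [(0, 2), (1, 2), (2, 4)]).shape)
```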
+
+
+
+
+ + ♻ ☆ The Design of an LLM-powered Unstructured Analytics System + + +
+ LLMs demonstrate an uncanny ability to process unstructured data, and as +such, have the potential to go beyond search and run complex, semantic analyses +at scale. We describe the design of an unstructured analytics system, Aryn, and +the tenets and use cases that motivate its design. With Aryn, users can specify +queries in natural language and the system automatically determines a semantic +plan and executes it to compute an answer from a large collection of +unstructured documents using LLMs. At the core of Aryn is Sycamore, a +declarative document processing engine, built using Ray, that provides a +reliable distributed abstraction called DocSets. Sycamore allows users to +analyze, enrich, and transform complex documents at scale. Aryn also comprises +Luna, a query planner that translates natural language queries to Sycamore +scripts, and the Aryn Partitioner, which takes raw PDFs and document images, +and converts them to DocSets for downstream processing. Using Aryn, we +demonstrate a real world use case for analyzing accident reports from the +National Transportation Safety Board (NTSB), and discuss some of the major +challenges we encountered in deploying Aryn in the wild. + +
+
+ comment: 6 pages, 3 figures, fixed typos +
+
+
+
+
+ + ♻ ☆ Multimodal Recommender Systems: A Survey + + +
+ Recommender systems (RS) have become an integral part of online services.
+They are equipped with various deep learning techniques to model user
+preference based on identifier and attribute information. With the emergence
+of multimedia services, such as short videos and news, understanding such
+content while recommending becomes critical. Moreover, multimodal features
+also help alleviate the problem of data sparsity in RS. Thus, the Multimodal
+Recommender System (MRS) has attracted much attention from both academia and
+industry recently. In this paper, we give a comprehensive survey of MRS
+models, mainly from a technical perspective. First, we summarize the general
+procedures and major challenges of MRS. Then, we introduce existing MRS models
+according to four categories, i.e., Modality Encoder, Feature Interaction,
+Feature Enhancement and Model Optimization. In addition, to make it convenient
+for those who want to research this field, we also summarize dataset and code
+resources. Finally, we discuss some promising future directions of MRS and
+conclude this paper. To provide access to more details of the surveyed papers,
+such as implementation code, we open-source a repository.
+
+
+
+ comment: accepted by CSUR +
+
+
+
+
+ + ♻ ☆ MARS: Matching Attribute-aware Representations for Text-based Sequential + Recommendation CIKM 2024 + + +
+ Sequential recommendation aims to predict the next item a user is likely to +prefer based on their sequential interaction history. Recently, text-based +sequential recommendation has emerged as a promising paradigm that uses +pre-trained language models to exploit textual item features to enhance +performance and facilitate knowledge transfer to unseen datasets. However, +existing text-based recommender models still struggle with two key challenges: +(i) representing users and items with multiple attributes, and (ii) matching +items with complex user interests. To address these challenges, we propose a +novel model, Matching Attribute-aware Representations for Text-based Sequential +Recommendation (MARS). MARS extracts detailed user and item representations +through attribute-aware text encoding, capturing diverse user intents with +multiple attribute-aware representations. It then computes user-item scores via +attribute-wise interaction matching, effectively capturing attribute-level user +preferences. Our extensive experiments demonstrate that MARS significantly +outperforms existing sequential models, achieving improvements of up to 24.43% +and 29.26% in Recall@10 and NDCG@10 across five benchmark datasets. Code is +available at https://github.com/junieberry/MARS + +
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ♻ ☆ HIRO: Hierarchical Information Retrieval Optimization + + +
+ Retrieval-Augmented Generation (RAG) has revolutionized natural language +processing by dynamically integrating external knowledge into Large Language +Models (LLMs), addressing their limitation of static training datasets. Recent +implementations of RAG leverage hierarchical data structures, which organize +documents at various levels of summarization and information density. This +complexity, however, can cause LLMs to "choke" on information overload, +necessitating more sophisticated querying mechanisms. In this context, we +introduce Hierarchical Information Retrieval Optimization (HIRO), a novel +querying approach that employs a Depth-First Search (DFS)-based recursive +similarity score calculation and branch pruning. This method uniquely minimizes +the context delivered to the LLM without informational loss, effectively +managing the challenge of excessive data. HIRO's refined approach is validated +by a 10.85% improvement in performance on the NarrativeQA dataset. + +
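A hedged sketch of the DFS-with-pruning idea on a toy document hierarchy: each node is scored against the query, branches whose similarity falls below a threshold are pruned, and surviving leaf chunks are returned as context for the LLM. The scoring function, threshold, and tree construction are assumptions, not HIRO's exact procedure.

```python
import numpy as np

rng = np.random.default_rng(1)

def make_node(depth=0):
    """Tiny synthetic hierarchy: each node holds an embedding, leaves hold chunks."""
    node = {"emb": rng.normal(size=8), "children": []}
    if depth < 2:
        node["children"] = [make_node(depth + 1) for _ in range(2)]
    return node

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

def hiro_search(node, query_emb, threshold=0.0, results=None):
    """DFS with branch pruning: stop descending when similarity drops below threshold."""
    if results is None:
        results = []
    score = cosine(node["emb"], query_emb)
    if score < threshold:
        return results                       # prune this whole branch
    if not node["children"]:
        results.append((score, node))        # surviving leaf: keep as LLM context
    for child in node["children"]:
        hiro_search(child, query_emb, threshold, results)
    return results

root, query = make_node(), rng.normal(size=8)
print(len(hiro_search(root, query)), "leaf chunks kept")
```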
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions, such as +search agents, within this expanding field. + +
+
+ comment: updated to version 3 +
+
+
+
+
+ + ♻ ☆ Smart E-commerce Recommendations with Semantic AI + + +
+ In e-commerce, web mining for page recommendations is widely used but often +fails to meet user needs. To address this, we propose a novel solution +combining semantic web mining with BP neural networks. We process user search +logs to extract five key features: content priority, time spent, user feedback, +recommendation semantics, and input deviation. These features are then fed into +a BP neural network to classify and prioritize web pages. The prioritized pages +are recommended to users. Using book sales pages for testing, our results +demonstrate that this solution can quickly and accurately identify the pages +users need. Our approach ensures that recommendations are more relevant and +tailored to individual preferences, enhancing the online shopping experience. +By leveraging advanced semantic analysis and neural network techniques, we +bridge the gap between user expectations and actual recommendations. This +innovative method not only improves accuracy but also speeds up the +recommendation process, making it a valuable tool for e-commerce platforms +aiming to boost user satisfaction and engagement. Additionally, our system +ability to handle large datasets and provide real-time recommendations makes it +a scalable and efficient solution for modern e-commerce challenges. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ NFARec: A Negative Feedback-Aware Recommender Model SIGIR 2024 + + +
+ Graph neural network (GNN)-based models have been extensively studied for +recommendations, as they can extract high-order collaborative signals +accurately which is required for high-quality recommender systems. However, +they neglect the valuable information gained through negative feedback in two +aspects: (1) different users might hold opposite feedback on the same item, +which hampers optimal information propagation in GNNs, and (2) even when an +item vastly deviates from users' preferences, they might still choose it and +provide a negative rating. In this paper, we propose a negative feedback-aware +recommender model (NFARec) that maximizes the leverage of negative feedback. To +transfer information to multi-hop neighbors along an optimal path effectively, +NFARec adopts a feedback-aware correlation that guides hypergraph convolutions +(HGCs) to learn users' structural representations. Moreover, NFARec +incorporates an auxiliary task - predicting the feedback sentiment polarity +(i.e., positive or negative) of the next interaction - based on the Transformer +Hawkes Process. The task is beneficial for understanding users by learning the +sentiment expressed in their previous sequential feedback patterns and +predicting future interactions. Extensive experiments demonstrate that NFARec +outperforms competitive baselines. Our source code and data are released at +https://github.com/WangXFng/NFARec. + +
+
+ comment: Accepted to SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ CaDRec: Contextualized and Debiased Recommender Model SIGIR 2024 + + +
+ Recommender models aimed at mining users' behavioral patterns have raised +great attention as one of the essential applications in daily life. Recent work +on graph neural networks (GNNs) or debiasing methods has attained remarkable +gains. However, they still suffer from (1) over-smoothing node embeddings +caused by recursive convolutions with GNNs, and (2) the skewed distribution of +interactions due to popularity and user-individual biases. This paper proposes +a contextualized and debiased recommender model (CaDRec). To overcome the +over-smoothing issue, we explore a novel hypergraph convolution operator that +can select effective neighbors during convolution by introducing both +structural context and sequential context. To tackle the skewed distribution, +we propose two strategies for disentangling interactions: (1) modeling +individual biases to learn unbiased item embeddings, and (2) incorporating item +popularity with positional encoding. Moreover, we mathematically show that the +imbalance of the gradients to update item embeddings exacerbates the popularity +bias, thus adopting regularization and weighting schemes as solutions. +Extensive experiments on four datasets demonstrate the superiority of the +CaDRec against state-of-the-art (SOTA) methods. Our source code and data are +released at https://github.com/WangXFng/CaDRec. + +
+
+ comment: Accepted to SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating Named Entity Recognition Using Few-Shot Prompting with Large + Language Models + + +
+ This paper evaluates Few-Shot Prompting with Large Language Models for Named +Entity Recognition (NER). Traditional NER systems rely on extensive labeled +datasets, which are costly and time-consuming to obtain. Few-Shot Prompting or +in-context learning enables models to recognize entities with minimal examples. +We assess state-of-the-art models like GPT-4 in NER tasks, comparing their +few-shot performance to fully supervised benchmarks. Results show that while +there is a performance gap, large models excel in adapting to new entity types +and domains with very limited data. We also explore the effects of prompt +engineering, guided output format and context length on performance. This study +underscores Few-Shot Learning's potential to reduce the need for large labeled +datasets, enhancing NER scalability and accessibility. + +
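A small illustration of how a few-shot NER prompt with a guided output format can be assembled; the entity types, demonstrations, and JSON convention are invented for illustration, and the actual LLM call is omitted.

```python
import json

EXAMPLES = [
    ("Marie Curie was born in Warsaw.",
     [{"text": "Marie Curie", "type": "PERSON"}, {"text": "Warsaw", "type": "LOCATION"}]),
    ("Apple opened a new office in Berlin.",
     [{"text": "Apple", "type": "ORGANIZATION"}, {"text": "Berlin", "type": "LOCATION"}]),
]

def build_ner_prompt(sentence, examples=EXAMPLES):
    """Few-shot NER prompt with a guided (JSON) output format."""
    lines = ["Extract PERSON, ORGANIZATION and LOCATION entities.",
             'Answer with a JSON list of {"text": ..., "type": ...} objects.', ""]
    for text, ents in examples:
        lines += [f"Sentence: {text}", f"Entities: {json.dumps(ents)}", ""]
    lines += [f"Sentence: {sentence}", "Entities:"]
    return "\n".join(lines)

print(build_ner_prompt("Ada Lovelace worked with Charles Babbage in London."))
```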
+
+ comment: Github repo: https://github.com/GEODE-project/ner-llm +
+
+
+
+
+ + ♻ ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever EMNLP + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in +information retrieval. ColBERT's late interaction scoring approximates the +joint query-document attention seen in cross-encoders while maintaining +inference efficiency closer to traditional dense retrieval models, thanks to +its bi-encoder architecture and recent optimizations in indexing and search. In +this paper, we introduce a novel architecture and a training framework to +support long context window and multilingual retrieval. Our new model, +Jina-ColBERT-v2, demonstrates strong performance across a range of English and +multilingual retrieval tasks, + +
+
+ comment: 8 pages, references at pp7,8; EMNLP workshop submission +
+
+
+
+
+
+
+
+ + Machine Learning 134 + +
+
+
+ + ☆ Masked Diffusion Models are Secretly Time-Agnostic Masked Models and + Exploit Inaccurate Categorical Sampling + + +
+ Masked diffusion models (MDMs) have emerged as a popular research topic for +generative modeling of discrete data, thanks to their superior performance over +other discrete diffusion models, and are rivaling the auto-regressive models +(ARMs) for language modeling tasks. The recent effort in simplifying the masked +diffusion framework further leads to alignment with continuous-space diffusion +models and more principled training and sampling recipes. In this paper, +however, we reveal that both training and sampling of MDMs are theoretically +free from the time variable, arguably the key signature of diffusion models, +and are instead equivalent to masked models. The connection on the sampling +aspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we +show that the FHS is theoretically equivalent to MDMs' original generation +process while significantly alleviating the time-consuming categorical sampling +and achieving a 20$\times$ speedup. In addition, our investigation challenges +previous claims that MDMs can surpass ARMs in generative perplexity. We +identify, for the first time, an underlying numerical issue, even with the +32-bit floating-point precision, which results in inaccurate categorical +sampling. We show that the numerical issue lowers the effective temperature +both theoretically and empirically, leading to unfair assessments of MDMs' +generation results in the previous literature. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ☆ Topological Methods in Machine Learning: A Tutorial for Practitioners + + +
+ Topological Machine Learning (TML) is an emerging field that leverages +techniques from algebraic topology to analyze complex data structures in ways +that traditional machine learning methods may not capture. This tutorial +provides a comprehensive introduction to two key TML techniques, persistent +homology and the Mapper algorithm, with an emphasis on practical applications. +Persistent homology captures multi-scale topological features such as clusters, +loops, and voids, while the Mapper algorithm creates an interpretable graph +summarizing high-dimensional data. To enhance accessibility, we adopt a +data-centric approach, enabling readers to gain hands-on experience applying +these techniques to relevant tasks. We provide step-by-step explanations, +implementations, hands-on examples, and case studies to demonstrate how these +tools can be applied to real-world problems. The goal is to equip researchers +and practitioners with the knowledge and resources to incorporate TML into +their work, revealing insights often hidden from conventional machine learning +methods. The tutorial code is available at +https://github.com/cakcora/TopologyForML + +
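In practice one would use a library such as GUDHI or Ripser; the dependency-free sketch below computes only the 0-dimensional persistence of a point cloud under the Vietoris-Rips filtration, using a union-find over edges sorted by length (every component is born at 0 and dies when it merges into another).

```python
import numpy as np
from itertools import combinations

def zero_dim_persistence(points):
    """0-dimensional persistence: each component is born at 0 and dies when it merges."""
    n = len(points)
    parent = list(range(n))

    def find(i):
        while parent[i] != i:
            parent[i] = parent[parent[i]]    # path compression
            i = parent[i]
        return i

    edges = sorted(
        (np.linalg.norm(points[i] - points[j]), i, j) for i, j in combinations(range(n), 2)
    )
    deaths = []
    for dist, i, j in edges:
        ri, rj = find(i), find(j)
        if ri != rj:                         # two components merge: one of them "dies"
            parent[ri] = rj
            deaths.append(float(dist))
    return [(0.0, d) for d in deaths]        # (birth, death) pairs; one component lives forever

pts = np.array([[0, 0], [0.1, 0], [5, 5], [5.1, 5.0]])
print(zero_dim_persistence(pts))             # two short bars (0.1) and one long bar (~7.0)
```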
+
+ comment: 54 pages, 35 figures +
+
+
+
+
+ + ☆ Regional data-driven weather modeling with a global stretched-grid + + +
+ A data-driven model (DDM) suitable for regional weather forecasting +applications is presented. The model extends the Artificial Intelligence +Forecasting System by introducing a stretched-grid architecture that dedicates +higher resolution over a regional area of interest and maintains a lower +resolution elsewhere on the globe. The model is based on graph neural networks, +which naturally affords arbitrary multi-resolution grid configurations. + The model is applied to short-range weather prediction for the Nordics, +producing forecasts at 2.5 km spatial and 6 h temporal resolution. The model is +pre-trained on 43 years of global ERA5 data at 31 km resolution and is further +refined using 3.3 years of 2.5 km resolution operational analyses from the +MetCoOp Ensemble Prediction System (MEPS). The performance of the model is +evaluated using surface observations from measurement stations across Norway +and is compared to short-range weather forecasts from MEPS. The DDM outperforms +both the control run and the ensemble mean of MEPS for 2 m temperature. The +model also produces competitive precipitation and wind speed forecasts, but is +shown to underestimate extreme events. + +
+
+
+
+
+ + ☆ Benchmarking Spurious Bias in Few-Shot Image Classifiers ECCV 2024 + + +
+ Few-shot image classifiers are designed to recognize and classify new data +with minimal supervision and limited data but often show reliance on spurious +correlations between classes and spurious attributes, known as spurious bias. +Spurious correlations commonly hold in certain samples and few-shot classifiers +can suffer from spurious bias induced from them. There is an absence of an +automatic benchmarking system to assess the robustness of few-shot classifiers +against spurious bias. In this paper, we propose a systematic and rigorous +benchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied +degrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates +few-shot evaluation tasks with biased attributes so that using them for +predictions can demonstrate poor performance. To construct these tasks, we +propose attribute-based sample selection strategies based on a pre-trained +vision-language model, eliminating the need for manual dataset curation. This +allows FewSTAB to automatically benchmark spurious bias using any existing test +data. FewSTAB offers evaluation results in a new dimension along with a new +design guideline for building robust classifiers. Moreover, it can benchmark +spurious bias in varied degrees and enable designs for varied degrees of +robustness. Its effectiveness is demonstrated through experiments on ten +few-shot learning methods across three datasets. We hope our framework can +inspire new designs of robust few-shot classifiers. Our code is available at +https://github.com/gtzheng/FewSTAB. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Configurable Foundation Models: Building LLMs from a Modular Perspective + + +
+ Advancements in LLMs have recently unveiled challenges tied to computational +efficiency and continual scalability due to their requirements of huge +parameters, making the applications and evolution of these models on devices +with limited computation resources and scenarios requiring various abilities +increasingly cumbersome. Inspired by modularity within the human brain, there +is a growing tendency to decompose LLMs into numerous functional modules, +allowing for inference with part of modules and dynamic assembly of modules to +tackle complex tasks, such as mixture-of-experts. To highlight the inherent +efficiency and composability of the modular approach, we coin the term brick to +represent each functional module, designating the modularized structure as +configurable foundation models. In this paper, we offer a comprehensive +overview and investigation of the construction, utilization, and limitation of +configurable foundation models. We first formalize modules into emergent bricks +- functional neuron partitions that emerge during the pre-training phase, and +customized bricks - bricks constructed via additional post-training to improve +the capabilities and knowledge of LLMs. Based on diverse functional bricks, we +further present four brick-oriented operations: retrieval and routing, merging, +updating, and growing. These operations allow for dynamic configuration of LLMs +based on instructions to handle complex tasks. To verify our perspective, we +conduct an empirical analysis on widely-used LLMs. We find that the FFN layers +follow modular patterns with functional specialization of neurons and +functional neuron partitions. Finally, we highlight several open issues and +directions for future research. Overall, this paper aims to offer a fresh +modular perspective on existing LLM research and inspire the future creation of +more efficient and scalable foundational models. + +
+
+
+
+
+ + ☆ Hybrid Imitation-Learning Motion Planner for Urban Driving + + +
+ With the release of open source datasets such as nuPlan and Argoverse, the +research around learning-based planners has spread a lot in the last years. +Existing systems have shown excellent capabilities in imitating the human +driver behaviour, but they struggle to guarantee safe closed-loop driving. +Conversely, optimization-based planners offer greater security in short-term +planning scenarios. To confront this challenge, in this paper we propose a +novel hybrid motion planner that integrates both learning-based and +optimization-based techniques. Initially, a multilayer perceptron (MLP) +generates a human-like trajectory, which is then refined by an +optimization-based component. This component not only minimizes tracking errors +but also computes a trajectory that is both kinematically feasible and +collision-free with obstacles and road boundaries. Our model effectively +balances safety and human-likeness, mitigating the trade-off inherent in these +objectives. We validate our approach through simulation experiments and further +demonstrate its efficacy by deploying it in real-world self-driving vehicles. + +
+
+
+
+
+ + ☆ Look Into the LITE in Deep Learning for Time Series Classification + + +
+ Deep learning models have been shown to be a powerful solution for Time
+Series Classification (TSC). State-of-the-art architectures, while producing
+promising results on the UCR and UEA archives, present a high number of
+trainable parameters. This can lead to long training times with high CO2
+emissions, high power consumption and a possible increase in the number of
+FLoating-point Operations Per Second (FLOPS). In this paper, we present a new
+architecture for TSC, the Light Inception with boosTing tEchnique (LITE), with
+only 2.34% of the number of parameters of the state-of-the-art InceptionTime
+model, while preserving performance. This architecture, with only 9,814
+trainable parameters due to the usage of DepthWise Separable Convolutions
+(DWSC), is boosted by three techniques: multiplexing, custom filters, and
+dilated convolution. The LITE architecture, trained on the UCR, is 2.78 times
+faster than InceptionTime and consumes 2.79 times less CO2 and power. To
+evaluate the performance of the proposed architecture on multivariate time
+series data, we adapt LITE to handle multivariate time series; we call this
+version LITEMV. To bring theory into application, we also conducted
+experiments using LITEMV on multivariate time series representing human
+rehabilitation movements, showing that LITEMV is not only the most efficient
+model but also the best performing for this application on the Kimore dataset,
+a skeleton-based human rehabilitation exercises dataset. Moreover, to address
+the interpretability of LITEMV, we present a study using Class Activation Maps
+to understand the classification decisions taken by the model during
+evaluation.
+
+
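A minimal PyTorch sketch of the depthwise separable convolution block that architectures like LITE build on; the channel counts and kernel size are arbitrary, and the boosting techniques (multiplexing, custom filters, dilated convolution) are not reproduced.

```python
import torch
import torch.nn as nn

class DepthwiseSeparableConv1d(nn.Module):
    """Depthwise conv (one filter per channel) followed by a 1x1 pointwise conv."""
    def __init__(self, in_ch, out_ch, kernel_size=7):
        super().__init__()
        self.depthwise = nn.Conv1d(in_ch, in_ch, kernel_size,
                                   padding=kernel_size // 2, groups=in_ch, bias=False)
        self.pointwise = nn.Conv1d(in_ch, out_ch, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm1d(out_ch)

    def forward(self, x):                       # x: (batch, channels, time)
        return torch.relu(self.bn(self.pointwise(self.depthwise(x))))

block = DepthwiseSeparableConv1d(in_ch=1, out_ch=32)
x = torch.randn(8, 1, 128)                      # a batch of univariate series
print(block(x).shape)                           # torch.Size([8, 32, 128])
n_params = sum(p.numel() for p in block.parameters())
print(n_params, "trainable parameters")         # far fewer than a standard Conv1d
```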
+
+
+
+
+ + ☆ Building a Scalable, Effective, and Steerable Search and Ranking + Platform + + +
+ Modern e-commerce platforms offer vast product selections, making it +difficult for customers to find items that they like and that are relevant to +their current session intent. This is why it is key for e-commerce platforms to +have near real-time scalable and adaptable personalized ranking and search +systems. While numerous methods exist in the scientific literature for building +such systems, many are unsuitable for large-scale industrial use due to +complexity and performance limitations. Consequently, industrial ranking +systems often resort to computationally efficient yet simplistic retrieval or +candidate generation approaches, which overlook near real-time and +heterogeneous customer signals, which results in a less personalized and +relevant experience. Moreover, related customer experiences are served by +completely different systems, which increases complexity, maintenance, and +inconsistent experiences. + In this paper, we present a personalized, adaptable near real-time ranking +platform that is reusable across various use cases, such as browsing and +search, and that is able to cater to millions of items and customers under +heavy load (thousands of requests per second). We employ transformer-based +models through different ranking layers which can learn complex behavior +patterns directly from customer action sequences while being able to +incorporate temporal (e.g. in-session) and contextual information. We validate +our system through a series of comprehensive offline and online real-world +experiments at a large online e-commerce platform, and we demonstrate its +superiority when compared to existing systems, both in terms of customer +experience as well as in net revenue. Finally, we share the lessons learned +from building a comprehensive, modern ranking platform for use in a large-scale +e-commerce environment. + +
+
+
+
+
+ + ☆ Oops, I Sampled it Again: Reinterpreting Confidence Intervals in + Few-Shot Learning + + +
+ The predominant method for computing confidence intervals (CI) in few-shot +learning (FSL) is based on sampling the tasks with replacement, i.e.\ allowing +the same samples to appear in multiple tasks. This makes the CI misleading in +that it takes into account the randomness of the sampler but not the data +itself. To quantify the extent of this problem, we conduct a comparative +analysis between CIs computed with and without replacement. These reveal a +notable underestimation by the predominant method. This observation calls for a +reevaluation of how we interpret confidence intervals and the resulting +conclusions in FSL comparative studies. Our research demonstrates that the use +of paired tests can partially address this issue. Additionally, we explore +methods to further reduce the (size of the) CI by strategically sampling tasks +of a specific size. We also introduce a new optimized benchmark, which can be +accessed at https://github.com/RafLaf/FSL-benchmark-again + +
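A synthetic illustration of the two sampling schemes discussed above: tasks drawn with replacement from a fixed pool of test samples versus disjoint tasks drawn without replacement, each with a naive normal-approximation confidence interval. The per-sample correctness is simulated, so this shows the mechanics rather than reproducing the paper's findings.

```python
import numpy as np

rng = np.random.default_rng(0)
correct = rng.random(1000) < 0.8           # synthetic per-sample correctness of a classifier
TASK_SIZE, N_TASKS = 75, 100

def ci_halfwidth(task_accs):
    return 1.96 * np.std(task_accs, ddof=1) / np.sqrt(len(task_accs))

# Tasks sampled with replacement: the same samples can appear in many tasks.
with_repl = [correct[rng.choice(1000, TASK_SIZE, replace=True)].mean()
             for _ in range(N_TASKS)]

# Tasks sampled without replacement from a shuffled pool (disjoint tasks).
perm = rng.permutation(1000)
without_repl = [correct[perm[i:i + TASK_SIZE]].mean()
                for i in range(0, TASK_SIZE * (1000 // TASK_SIZE), TASK_SIZE)]

print("with replacement:    mean=%.3f  CI ±%.3f" % (np.mean(with_repl), ci_halfwidth(with_repl)))
print("without replacement: mean=%.3f  CI ±%.3f" % (np.mean(without_repl), ci_halfwidth(without_repl)))
```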
+
+
+
+
+ + ☆ SNNAX -- Spiking Neural Networks in JAX + + +
+ Spiking Neural Networks (SNNs) simulators are essential tools to prototype +biologically inspired models and neuromorphic hardware architectures and +predict their performance. For such a tool, ease of use and flexibility are +critical, but so is simulation speed especially given the complexity inherent +to simulating SNN. Here, we present SNNAX, a JAX-based framework for simulating +and training such models with PyTorch-like intuitiveness and JAX-like execution +speed. SNNAX models are easily extended and customized to fit the desired model +specifications and target neuromorphic hardware. Additionally, SNNAX offers key +features for optimizing the training and deployment of SNNs such as flexible +automatic differentiation and just-in-time compilation. We evaluate and compare +SNNAX to other commonly used machine learning (ML) frameworks used for +programming SNNs. We provide key performance metrics, best practices, +documented examples for simulating SNNs in SNNAX, and implement several +benchmarks used in the literature. + +
+
+
+
+
+ + ☆ Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency + Discussions by Few-Shot Learning with Large Language Models + + +
+ This study performs analysis of Predictive statements, Hope speech, and +Regret Detection behaviors within cryptocurrency-related discussions, +leveraging advanced natural language processing techniques. We introduce a +novel classification scheme named "Prediction statements," categorizing +comments into Predictive Incremental, Predictive Decremental, Predictive +Neutral, or Non-Predictive categories. Employing GPT-4o, a cutting-edge large +language model, we explore sentiment dynamics across five prominent +cryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis +reveals distinct patterns in predictive sentiments, with Matic demonstrating a +notably higher propensity for optimistic predictions. Additionally, we +investigate hope and regret sentiments, uncovering nuanced interplay between +these emotions and predictive behaviors. Despite encountering limitations +related to data volume and resource availability, our study reports valuable +discoveries concerning investor behavior and sentiment trends within the +cryptocurrency market, informing strategic decision-making and future research +endeavors. + +
+
+
+
+
+ + ☆ Obsidian: Cooperative State-Space Exploration for Performant Inference + on Secure ML Accelerators + + +
+ Trusted execution environments (TEEs) for machine learning accelerators are
+indispensable in secure and efficient ML inference. Optimizing workloads
+through state-space exploration for the accelerator architectures improves
+performance and energy consumption. However, such explorations are expensive
+and slow due to the large search space. Current research has to use fast
+analytical models that forego critical hardware details and cross-layer
+opportunities unique to the hardware security primitives. While cycle-accurate
+models can theoretically reach better designs, their high runtime cost
+restricts them to a smaller state space.
+ We present Obsidian, an optimization framework for finding the optimal
+mapping from ML kernels to a secure ML accelerator. Obsidian addresses the
+above challenge by exploring the state space using analytical and
+cycle-accurate models cooperatively. The two main exploration components are:
+(1) a secure accelerator analytical model that includes the effect of secure
+hardware while traversing the large mapping state space and produces the best
+m model mappings; (2) a compiler profiling step on a cycle-accurate model that
+captures runtime bottlenecks to further improve execution runtime, energy and
+resource utilization and finds the optimal model mapping.
+ We compare our results to a baseline secure accelerator, comprising the
+state-of-the-art security schemes obtained from guardnn [33] and sesame [11].
+The analytical model reduces the inference latency by 20.5% for a cloud
+deployment and 8.4% for an edge deployment, with energy improvements of 24%
+and 19%, respectively. The cycle-accurate model further reduces the latency by
+9.1% for the cloud and 12.2% for the edge, with energy improvements of 13.8%
+and 13.1%.
+
+
+
+
+
+
+ + ☆ Boosting Certificate Robustness for Time Series Classification with + Efficient Self-Ensemble + + +
+ Recently, the issue of adversarial robustness in the time series domain has
+garnered significant attention. However, the available defense mechanisms
+remain limited, with adversarial training being the predominant approach,
+though it does not provide theoretical guarantees. Randomized Smoothing has
+emerged as a standout method due to its ability to certify a provable lower
+bound on the robustness radius under $\ell_p$-ball attacks. Recognizing its
+success, research in the time series domain has started focusing on these
+aspects. However, existing research predominantly focuses on time series
+forecasting, or on non-$\ell_p$ robustness via statistical feature
+augmentation for time series classification (TSC). Our review found that
+Randomized Smoothing performs modestly in TSC, struggling to provide effective
+assurances on datasets with poor robustness. Therefore, we propose a
+self-ensemble method to enhance the lower bound of the probability confidence
+of predicted labels by reducing the variance of classification margins,
+thereby certifying a larger radius. This approach also addresses the
+computational overhead issue of Deep Ensemble (DE) while remaining competitive
+and, in some cases, outperforming it in terms of robustness. Both theoretical
+analysis and experimental results validate the effectiveness of our method,
+demonstrating superior performance in robustness testing compared to baseline
+approaches.
+
+
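+
+ For readers unfamiliar with the certification step, the sketch below shows
+Monte-Carlo randomized smoothing combined with a simple prediction-averaging
+ensemble. It is a generic Cohen-style certificate under stated assumptions,
+not the paper's algorithm, and it omits the confidence correction a rigorous
+certificate requires; predict_fns is a hypothetical list of functions returning
+class probabilities.
+ <pre>
+import numpy as np
+from scipy.stats import norm
+
+def smoothed_certify(predict_fns, x, sigma=0.25, n=1000, seed=0):
+    # Average the members' class probabilities on each noisy copy, vote with
+    # the argmax, then turn the top-class frequency into an l2 radius.
+    rng = np.random.default_rng(seed)
+    counts = np.zeros(predict_fns[0](x).shape[-1])
+    for _ in range(n):
+        noisy = x + rng.normal(0.0, sigma, size=x.shape)
+        probs = np.mean([f(noisy) for f in predict_fns], axis=0)
+        counts[int(np.argmax(probs))] += 1
+    top = int(np.argmax(counts))
+    p_a = counts[top] / n
+    return top, (sigma * norm.ppf(p_a) if p_a > 0.5 else 0.0)
+ </pre>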
+
+ comment: 6 figures, 4 tables, 10 pages +
+
+
+
+
+ + ☆ UnLearning from Experience to Avoid Spurious Correlations + + +
+ While deep neural networks can achieve state-of-the-art performance in many +tasks, these models are more fragile than they appear. They are prone to +learning spurious correlations in their training data, leading to surprising +failure cases. In this paper, we propose a new approach that addresses the +issue of spurious correlations: UnLearning from Experience (ULE). Our method is +based on using two classification models trained in parallel: student and +teacher models. Both models receive the same batches of training data. The +student model is trained with no constraints and pursues the spurious +correlations in the data. The teacher model is trained to solve the same +classification problem while avoiding the mistakes of the student model. As +training is done in parallel, the better the student model learns the spurious +correlations, the more robust the teacher model becomes. The teacher model uses +the gradient of the student's output with respect to its input to unlearn +mistakes made by the student. We show that our method is effective on the +Waterbirds, CelebA, Spawrious and UrbanCars datasets. + +
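+
+ A rough PyTorch sketch of the parallel training loop described above. The
+masking rule is one plausible way to use the gradient of the student's loss
+with respect to the input and is not taken from the paper; ule_like_step and
+the damping scheme are illustrative.
+ <pre>
+import torch
+import torch.nn.functional as F
+
+def ule_like_step(student, teacher, x, y, opt_s, opt_t):
+    # Student: trained without constraints, free to latch onto spurious cues.
+    x_s = x.clone().requires_grad_(True)
+    loss_s = F.cross_entropy(student(x_s), y)
+    opt_s.zero_grad(); loss_s.backward(); opt_s.step()
+
+    # The input-gradient of the student's loss marks the features it relies on.
+    with torch.no_grad():
+        sal = x_s.grad.abs()
+        sal = sal / (sal.amax(dim=tuple(range(1, x.dim())), keepdim=True) + 1e-8)
+
+    # Teacher: trained on inputs with the student's most salient features damped.
+    loss_t = F.cross_entropy(teacher(x * (1.0 - sal)), y)
+    opt_t.zero_grad(); loss_t.backward(); opt_t.step()
+    return loss_s.item(), loss_t.item()
+ </pre>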
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Regularized Multi-output Gaussian Convolution Process with Domain + Adaptation + + +
+ Multi-output Gaussian process (MGP) has been attracting increasing attention
+as a transfer learning method to model multiple outputs. Despite its high
+flexibility and generality, MGP still faces two critical challenges when
+applied to transfer learning. The first one is negative transfer, which occurs
+when there exists no shared information among the outputs. The second
+challenge is the input domain inconsistency, which is commonly studied in
+transfer learning yet not explored in MGP. In this paper, we propose a
+regularized MGP modeling framework with domain adaptation to overcome these
+challenges. More specifically, a sparse covariance matrix of MGP is proposed
+by using a convolution process, where penalization terms are added to
+adaptively select the most informative outputs for knowledge transfer. To deal
+with the domain inconsistency, a domain adaptation method is proposed by
+marginalizing inconsistent features and expanding missing features to align
+the input domains among different outputs. Statistical properties of the
+proposed method are provided to guarantee the performance practically and
+asymptotically. The proposed framework outperforms state-of-the-art benchmarks
+in comprehensive simulation studies and one real case study of a ceramic
+manufacturing process. The results demonstrate the effectiveness of our method
+in dealing with both the negative transfer and the domain inconsistency.
+
+
+
+
+
+
+ + ☆ Unifying Causal Representation Learning with the Invariance Principle + + +
+ Causal representation learning aims at recovering latent causal variables +from high-dimensional observations to solve causal downstream tasks, such as +predicting the effect of new interventions or more robust classification. A +plethora of methods have been developed, each tackling carefully crafted +problem settings that lead to different types of identifiability. The folklore +is that these different settings are important, as they are often linked to +different rungs of Pearl's causal hierarchy, although not all neatly fit. Our +main contribution is to show that many existing causal representation learning +approaches methodologically align the representation to known data symmetries. +Identification of the variables is guided by equivalence classes across +different data pockets that are not necessarily causal. This result suggests +important implications, allowing us to unify many existing approaches in a +single method that can mix and match different assumptions, including +non-causal ones, based on the invariances relevant to our application. It also +significantly benefits applicability, which we demonstrate by improving +treatment effect estimation on real-world high-dimensional ecological data. +Overall, this paper clarifies the role of causality assumptions in the +discovery of causal variables and shifts the focus to preserving data +symmetries. + +
+
+ comment: 36 pages +
+
+
+
+
+ + ☆ Tractable Offline Learning of Regular Decision Processes + + +
+ This work studies offline Reinforcement Learning (RL) in a class of +non-Markovian environments called Regular Decision Processes (RDPs). In RDPs, +the unknown dependency of future observations and rewards from the past +interactions can be captured by some hidden finite-state automaton. For this +reason, many RDP algorithms first reconstruct this unknown dependency using +automata learning techniques. In this paper, we show that it is possible to +overcome two strong limitations of previous offline RL algorithms for RDPs, +notably RegORL. This can be accomplished via the introduction of two original +techniques: the development of a new pseudometric based on formal languages, +which removes a problematic dependency on +$L_\infty^\mathsf{p}$-distinguishability parameters, and the adoption of +Count-Min-Sketch (CMS), instead of naive counting. The former reduces the +number of samples required in environments that are characterized by a low +complexity in language-theoretic terms. The latter alleviates the memory +requirements for long planning horizons. We derive the PAC sample complexity +bounds associated to each of these techniques, and we validate the approach +experimentally. + +
+
+ comment: To appear in EWRL 2024 +
+
+
+
+
+ + ☆ Convolutional Neural Networks for Automated Cellular Automaton + Classification + + +
+ The emergent dynamics in spacetime diagrams of cellular automata (CAs) is +often organised by means of a number of behavioural classes. Whilst +classification of elementary CAs is feasible and well-studied, non-elementary +CAs are generally too diverse and numerous to exhaustively classify manually. +In this chapter we treat the spacetime diagram as a digital image, and +implement simple computer vision techniques to perform an automated +classification of elementary cellular automata into the five Li-Packard +classes. In particular, we present a supervised learning task to a +convolutional neural network, in such a way that it may be generalised to +non-elementary CAs. If we want to do so, we must divert the algorithm's focus +away from the underlying 'microscopic' local updates. We first show that +previously developed deep learning approaches have in fact been trained to +identify the local update rule, rather than directly focus on the mesoscopic +patterns that are associated with the particular behavioural classes. By means +of a well-argued neural network design, as well as a number of data +augmentation techniques, we then present a convolutional neural network that +performs nearly perfectly at identifying the behavioural class, without +necessarily first identifying the underlying microscopic dynamics. + +
+
+ comment: 19 pages, 12 figures, book chapter +
+
+
+
+
+ + ☆ Complete and Efficient Covariants for 3D Point Configurations with + Application to Learning Molecular Quantum Properties + + +
+ When modeling physical properties of molecules with machine learning, it is +desirable to incorporate $SO(3)$-covariance. While such models based on low +body order features are not complete, we formulate and prove general +completeness properties for higher order methods, and show that $6k-5$ of these +features are enough for up to $k$ atoms. We also find that the Clebsch--Gordan +operations commonly used in these methods can be replaced by matrix +multiplications without sacrificing completeness, lowering the scaling from +$O(l^6)$ to $O(l^3)$ in the degree of the features. We apply this to quantum +chemistry, but the proposed methods are generally applicable for problems +involving 3D point configurations. + +
+
+
+
+
+ + ☆ Task-Oriented Communication for Graph Data: A Graph Information + Bottleneck Approach + + +
+ Graph data, essential in fields like knowledge representation and social +networks, often involves large networks with many nodes and edges. Transmitting +these graphs can be highly inefficient due to their size and redundancy for +specific tasks. This paper introduces a method to extract a smaller, +task-focused subgraph that maintains key information while reducing +communication overhead. Our approach utilizes graph neural networks (GNNs) and +the graph information bottleneck (GIB) principle to create a compact, +informative, and robust graph representation suitable for transmission. The +challenge lies in the irregular structure of graph data, making GIB +optimization complex. We address this by deriving a tractable variational upper +bound for the objective function. Additionally, we propose the VQ-GIB +mechanism, integrating vector quantization (VQ) to convert subgraph +representations into a discrete codebook sequence, compatible with existing +digital communication systems. Our experiments show that this GIB-based method +significantly lowers communication costs while preserving essential +task-related information. The approach demonstrates robust performance across +various communication channels, suitable for both continuous and discrete +systems. + +
+
+
+
+
+ + ☆ A Data Selection Approach for Enhancing Low Resource Machine Translation + Using Cross-Lingual Sentence Representations + + +
+ Machine translation in low-resource language pairs faces significant +challenges due to the scarcity of parallel corpora and linguistic resources. +This study focuses on the case of English-Marathi language pairs, where +existing datasets are notably noisy, impeding the performance of machine +translation models. To mitigate the impact of data quality issues, we propose a +data filtering approach based on cross-lingual sentence representations. Our +methodology leverages a multilingual SBERT model to filter out problematic +translations in the training data. Specifically, we employ an IndicSBERT +similarity model to assess the semantic equivalence between original and +translated sentences, allowing us to retain linguistically correct translations +while discarding instances with substantial deviations. The results demonstrate +a significant improvement in translation quality over the baseline +post-filtering with IndicSBERT. This illustrates how cross-lingual sentence +representations can reduce errors in machine translation scenarios with limited +resources. By integrating multilingual sentence BERT models into the +translation pipeline, this research contributes to advancing machine +translation techniques in low-resource environments. The proposed method not +only addresses the challenges in English-Marathi language pairs but also +provides a valuable framework for enhancing translation quality in other +low-resource language translation tasks. + +
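+
+ In practice this kind of filter is only a few lines with sentence-transformers.
+The model id below is a generic multilingual stand-in, not the IndicSBERT
+checkpoint the authors use, and the 0.7 threshold is an arbitrary placeholder.
+ <pre>
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+
+def filter_parallel(pairs, threshold=0.7):
+    # Keep (English, Marathi) pairs whose cross-lingual embeddings agree.
+    en = model.encode([s for s, _ in pairs], convert_to_tensor=True,
+                      normalize_embeddings=True)
+    mr = model.encode([t for _, t in pairs], convert_to_tensor=True,
+                      normalize_embeddings=True)
+    sims = (en * mr).sum(dim=1)  # cosine similarity of already-normalized rows
+    return [p for p, s in zip(pairs, sims.tolist()) if s >= threshold]
+ </pre>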
+
+ comment: Accepted at I2CT 2024 +
+
+
+
+
+ + ☆ Few-shot Multi-Task Learning of Linear Invariant Features with Meta + Subspace Pursuit + + +
+ Data scarcity poses a serious threat to modern machine learning and +artificial intelligence, as their practical success typically relies on the +availability of big datasets. One effective strategy to mitigate the issue of +insufficient data is to first harness information from other data sources +possessing certain similarities in the study design stage, and then employ the +multi-task or meta learning framework in the analysis stage. In this paper, we +focus on multi-task (or multi-source) linear models whose coefficients across +tasks share an invariant low-rank component, a popular structural assumption +considered in the recent multi-task or meta learning literature. Under this +assumption, we propose a new algorithm, called Meta Subspace Pursuit +(abbreviated as Meta-SP), that provably learns this invariant subspace shared +by different tasks. Under this stylized setup for multi-task or meta learning, +we establish both the algorithmic and statistical guarantees of the proposed +method. Extensive numerical experiments are conducted, comparing Meta-SP +against several competing methods, including popular, off-the-shelf +model-agnostic meta learning algorithms such as ANIL. These experiments +demonstrate that Meta-SP achieves superior performance over the competing +methods in various aspects. + +
+
+
+
+
+ + ☆ Decision Transformer for Enhancing Neural Local Search on the Job Shop + Scheduling Problem + + +
+ The job shop scheduling problem (JSSP) and its solution algorithms have been +of enduring interest in both academia and industry for decades. In recent +years, machine learning (ML) is playing an increasingly important role in +advancing existing and building new heuristic solutions for the JSSP, aiming to +find better solutions in shorter computation times. In this paper we build on +top of a state-of-the-art deep reinforcement learning (DRL) agent, called +Neural Local Search (NLS), which can efficiently and effectively control a +large local neighborhood search on the JSSP. In particular, we develop a method +for training the decision transformer (DT) algorithm on search trajectories +taken by a trained NLS agent to further improve upon the learned +decision-making sequences. Our experiments show that the DT successfully learns +local search strategies that are different and, in many cases, more effective +than those of the NLS agent itself. In terms of the tradeoff between solution +quality and acceptable computational time needed for the search, the DT is +particularly superior in application scenarios where longer computational times +are acceptable. In this case, it makes up for the longer inference times +required per search step, which are caused by the larger neural network +architecture, through better quality decisions per step. Thereby, the DT +achieves state-of-the-art results for solving the JSSP with ML-enhanced search. + +
+
+ comment: currently under review for IEEE Transactions on Cybernetics +
+
+
+
+
+ + ☆ Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for + Problem-Solving Improvement of LLMs + + +
+ Large Language Models (LLMs) have demonstrated remarkable efficiency in +tackling various tasks based on human instructions, but recent studies reveal +that these models often fail to achieve satisfactory results on questions +involving reasoning, such as mathematics or physics questions. This phenomenon +is usually attributed to the uncertainty regarding whether these models could +genuinely comprehend the knowledge embedded in the text or merely learn to +replicate the token distribution without a true understanding of the content. +In this paper, we delve into this problem and aim to enhance the reasoning +capabilities of LLMs. First, we investigate if the model has genuine reasoning +capabilities by visualizing the text generation process at the attention and +representation level. Then, we formulate the reasoning process of LLMs into a +causal framework, which provides a formal explanation of the problems we +observe in the visualization. Finally, building upon this causal framework, we +propose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient +fine-tuning (PEFT) method to enhance the model's reasoning capabilities by +encouraging the model to extract the general problem-solving skills and apply +these skills to different questions. Experiments show that our method +outperforms the baseline consistently across multiple benchmarks, and with only +1.2M tunable parameters, we achieve better or comparable results to other +fine-tuning methods. This demonstrates the effectiveness and efficiency of our +method in improving the overall accuracy and reliability of LLMs. + +
+
+
+
+
+ + ☆ Neural timescales from a computational perspective + + +
+ Timescales of neural activity are diverse across and within brain areas, and +experimental observations suggest that neural timescales reflect information in +dynamic environments. However, these observations do not specify how neural +timescales are shaped, nor whether particular timescales are necessary for +neural computations and brain function. Here, we take a complementary +perspective and synthesize three directions where computational methods can +distill the broad set of empirical observations into quantitative and testable +theories: We review (i) how data analysis methods allow us to capture different +timescales of neural dynamics across different recording modalities, (ii) how +computational models provide a mechanistic explanation for the emergence of +diverse timescales, and (iii) how task-optimized models in machine learning +uncover the functional relevance of neural timescales. This integrative +computational approach, combined with empirical findings, would provide a more +holistic understanding of how neural timescales capture the relationship +between brain structure, dynamics, and behavior. + +
+
+ comment: 18 pages, 4 figures, 2 boxes +
+
+
+
+
+ + ☆ Neural Networks with LSTM and GRU in Modeling Active Fires in the Amazon + + +
+ This study presents a comprehensive methodology for modeling and forecasting +the historical time series of fire spots detected by the AQUA_M-T satellite in +the Amazon, Brazil. The approach utilizes a mixed Recurrent Neural Network +(RNN) model, combining Long Short-Term Memory (LSTM) and Gated Recurrent Unit +(GRU) architectures to predict monthly accumulations of daily detected fire +spots. A summary of the data revealed a consistent seasonality over time, with +annual maximum and minimum fire spot values tending to repeat at the same +periods each year. The primary objective is to verify whether the forecasts +capture this inherent seasonality through rigorous statistical analysis. The +methodology involved careful data preparation, model configuration, and +training using cross-validation with two seeds, ensuring that the data +generalizes well to the test and validation sets, and confirming the +convergence of the model parameters. The results indicate that the mixed LSTM +and GRU model offers improved accuracy in forecasting 12 months ahead, +demonstrating its effectiveness in capturing complex temporal patterns and +modeling the observed time series. This research significantly contributes to +the application of deep learning techniques in environmental monitoring, +specifically in fire spot forecasting. In addition to improving forecast +accuracy, the proposed approach highlights the potential for adaptation to +other time series forecasting challenges, opening new avenues for research and +development in machine learning and natural phenomenon prediction. Keywords: +Time Series Forecasting, Recurrent Neural Networks, Deep Learning. + +
+
+ comment: 16 pages, in Portuguese language, 24 figures +
+
+
+
+
+ + ☆ Independence Constrained Disentangled Representation Learning from + Epistemological Perspective + + +
+ Disentangled Representation Learning aims to improve the explainability of
+deep learning methods by training a data encoder that identifies semantically
+meaningful latent variables in the data generation process. Nevertheless,
+there is no consensus regarding a universally accepted definition for the
+objective of disentangled representation learning. In particular, there is a
+considerable amount of discourse regarding whether the latent variables should
+be mutually independent or not. In this paper, we first investigate these
+arguments on the interrelationships between latent variables by establishing a
+conceptual bridge between Epistemology and Disentangled Representation
+Learning. Then, inspired by these interdisciplinary concepts, we introduce a
+two-level latent space framework to provide a general solution to the prior
+arguments on this issue. Finally, we propose a novel method for disentangled
+representation learning by employing an integration of a mutual information
+constraint and an independence constraint within the Generative Adversarial
+Network (GAN) framework. Experimental results demonstrate that our proposed
+method consistently outperforms baseline approaches in both quantitative and
+qualitative evaluations. The method exhibits strong performance across
+multiple commonly used metrics and demonstrates a great capability in
+disentangling various semantic factors, leading to an improved quality of
+controllable generation, which consequently benefits the explainability of the
+algorithm.
+
+
+
+
+
+
+ + ☆ Causality-Aware Transformer Networks for Robotic Navigation + + +
+ Recent advances in machine learning algorithms have garnered growing interest
+in developing versatile Embodied AI systems. However, current research in this
+domain reveals opportunities for improvement. First, the direct adoption of
+RNNs and Transformers often overlooks the specific differences between
+Embodied AI and traditional sequential data modelling, potentially limiting
+its performance in Embodied AI tasks. Second, the reliance on task-specific
+configurations, such as pre-trained modules and dataset-specific logic,
+compromises the generalizability of these methods. We address these
+constraints by initially exploring the unique differences between Embodied AI
+tasks and other sequential data tasks through the lens of Causality,
+presenting a causal framework to elucidate the inadequacies of conventional
+sequential methods for Embodied AI. By leveraging this causal perspective, we
+propose Causality-Aware Transformer (CAT) Networks for Navigation, featuring a
+Causal Understanding Module to enhance the model's Environmental Understanding
+capability. Meanwhile, our method is devoid of task-specific inductive biases
+and can be trained in an End-to-End manner, which enhances the method's
+generalizability across various contexts. Empirical evaluations demonstrate
+that our methodology consistently surpasses benchmark performances across a
+spectrum of settings, tasks and simulation environments. Extensive ablation
+studies reveal that the performance gains can be attributed to the Causal
+Understanding Module, which demonstrates effectiveness and efficiency in both
+Reinforcement Learning and Supervised Learning settings.
+
+
+
+
+
+
+ + ☆ Introduction to Machine Learning + + +
+ This book introduces the mathematical foundations and techniques that lead to
+the development and analysis of many of the algorithms that are used in
+machine learning. It starts with an introductory chapter that describes
+notation used throughout the book and serves as a reminder of basic concepts
+in calculus, linear algebra and probability; it also introduces some measure
+theoretic terminology, which can be used as a reading guide for the sections
+that use these tools. The introductory chapters also provide background
+material on matrix analysis and optimization. The latter chapter provides
+theoretical support to many algorithms that are used in the book, including
+stochastic gradient descent, proximal methods, etc. After discussing basic
+concepts for statistical prediction, the book includes an introduction to
+reproducing kernel theory and Hilbert space techniques, which are used in many
+places, before addressing the description of various algorithms for supervised
+statistical learning, including linear methods, support vector machines,
+decision trees, boosting, and neural networks. The subject then switches to
+generative methods, starting with a chapter that presents sampling methods and
+an introduction to the theory of Markov chains. The following chapter
+describes the theory of graphical models, an introduction to variational
+methods for models with latent variables, and deep-learning based generative
+models. The next chapters focus on unsupervised learning methods, for
+clustering, factor analysis and manifold learning. The final chapter of the
+book is theory-oriented and discusses concentration inequalities and
+generalization bounds.
+
+
+
+ comment: textbook +
+
+
+
+
+ + ☆ Learning-Based Error Detection System for Advanced Vehicle Instrument + Cluster Rendering + + +
+ The automotive industry is currently expanding digital display options with
+every new model that comes onto the market. This entails not just an expansion
+in dimensions, resolution, and customization choices, but also the capability
+to employ novel display effects like overlays while assembling the content of
+the display cluster. Unfortunately, this raises the need for appropriate
+monitoring systems that can detect rendering errors and apply appropriate
+countermeasures when required. Classical solutions such as Cyclic Redundancy
+Checks (CRC) will soon no longer be viable, as any sort of alpha blending,
+warping or scaling of content can cause unwanted CRC violations. Therefore, we
+propose a novel monitoring approach to verify correctness of displayed content
+using telltales (e.g. warning signs) as an example. It uses a learning-based
+approach to separate "good" telltales, i.e. those that a human driver will
+understand correctly, and "corrupted" telltales, i.e. those that will not be
+visible or perceived correctly. As a result, it possesses inherent resilience
+against individual pixel errors and implicitly supports changing backgrounds,
+overlay or scaling effects. This is underlined by our experimental study where
+all "corrupted" test patterns were correctly classified, while no false alarms
+were triggered.
+
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Conformal Prediction in Dynamic Biological Systems + + +
+ Uncertainty quantification (UQ) is the process of systematically determining
+and characterizing the degree of confidence in computational model
+predictions. In the context of systems biology, especially with dynamic
+models, UQ is crucial because it addresses the challenges posed by
+nonlinearity and parameter sensitivity, allowing us to properly understand and
+extrapolate the behavior of complex biological systems. Here, we focus on
+dynamic models represented by deterministic nonlinear ordinary differential
+equations. Many current UQ approaches in this field rely on Bayesian
+statistical methods. While powerful, these methods often require strong prior
+specifications and make parametric assumptions that may not always hold in
+biological systems. Additionally, these methods face challenges in domains
+where sample sizes are limited, and statistical inference becomes constrained,
+with computational speed being a bottleneck in large models of biological
+systems. As an alternative, we propose the use of conformal inference methods,
+introducing two novel algorithms that, in some instances, offer non-asymptotic
+guarantees, enhancing robustness and scalability across various applications.
+We demonstrate the efficacy of our proposed algorithms through several
+scenarios, highlighting their advantages over traditional Bayesian approaches.
+The proposed methods show promising results for diverse biological data
+structures and scenarios, offering a general framework to quantify uncertainty
+for dynamic models of biological systems. The software for the methodology and
+the reproduction of the results is available at
+https://zenodo.org/doi/10.5281/zenodo.13644870.
+
+
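+
+ As a point of reference for how lightweight conformal methods can be, here is
+standard split conformal prediction for a regression model: calibrate absolute
+residuals on held-out data and return intervals with finite-sample marginal
+coverage of at least 1 - alpha. This is the textbook procedure, not the two
+algorithms proposed in the paper.
+ <pre>
+import numpy as np
+
+def split_conformal_interval(model, X_cal, y_cal, X_new, alpha=0.1):
+    # Conformity scores: absolute residuals on the calibration split.
+    res = np.abs(y_cal - model.predict(X_cal))
+    n = len(res)
+    level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
+    q = np.quantile(res, level, method="higher")
+    preds = model.predict(X_new)
+    return preds - q, preds + q
+ </pre>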
+
+
+
+
+ + ☆ AdvSecureNet: A Python Toolkit for Adversarial Machine Learning + + +
+ Machine learning models are vulnerable to adversarial attacks. Several tools
+have been developed to research these vulnerabilities, but they often lack
+comprehensive features and flexibility. We introduce AdvSecureNet, a
+PyTorch-based toolkit for adversarial machine learning that is the first to
+natively support multi-GPU setups for attacks, defenses, and evaluation. It is
+the first toolkit that supports both CLI and API interfaces and external YAML
+configuration files to enhance versatility and reproducibility. The toolkit
+includes multiple attacks, defenses and evaluation metrics. Rigorous software
+engineering practices are followed to ensure high code quality and
+maintainability. The project is available as an open-source project on GitHub
+at https://github.com/melihcatal/advsecurenet and installable via PyPI.
+
+
+
+
+
+
+ + ☆ (Implicit) Ensembles of Ensembles: Epistemic Uncertainty Collapse in + Large Models + + +
+ Epistemic uncertainty is crucial for safety-critical applications and +out-of-distribution detection tasks. Yet, we uncover a paradoxical phenomenon +in deep learning models: an epistemic uncertainty collapse as model complexity +increases, challenging the assumption that larger models invariably offer +better uncertainty quantification. We propose that this stems from implicit +ensembling within large models. To support this hypothesis, we demonstrate +epistemic uncertainty collapse empirically across various architectures, from +explicit ensembles of ensembles and simple MLPs to state-of-the-art vision +models, including ResNets and Vision Transformers -- for the latter, we examine +implicit ensemble extraction and decompose larger models into diverse +sub-models, recovering epistemic uncertainty. We provide theoretical +justification for these phenomena and explore their implications for +uncertainty estimation. + +
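+
+ The ensemble-based epistemic estimate discussed above is usually computed with
+the mutual-information decomposition of predictive uncertainty. A small,
+generic PyTorch sketch (not the paper's code) is given below; models is any
+list of trained members producing logits.
+ <pre>
+import torch
+
+def epistemic_uncertainty(models, x):
+    # BALD-style decomposition: total predictive entropy minus the mean
+    # per-member entropy; the remainder measures disagreement between members.
+    with torch.no_grad():
+        probs = torch.stack([m(x).softmax(-1) for m in models])  # [M, B, C]
+    mean_p = probs.mean(0)
+    total = -(mean_p * mean_p.clamp_min(1e-12).log()).sum(-1)
+    expected = -(probs * probs.clamp_min(1e-12).log()).sum(-1).mean(0)
+    return total - expected
+ </pre>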
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Hypothesizing Missing Causal Variables with LLMs + + +
+ Scientific discovery is a catalyst for human intellectual advances, driven by +the cycle of hypothesis generation, experimental design, data evaluation, and +iterative assumption refinement. This process, while crucial, is expensive and +heavily dependent on the domain knowledge of scientists to generate hypotheses +and navigate the scientific cycle. Central to this is causality, the ability to +establish the relationship between the cause and the effect. Motivated by the +scientific discovery process, in this work, we formulate a novel task where the +input is a partial causal graph with missing variables, and the output is a +hypothesis about the missing variables to complete the partial graph. We design +a benchmark with varying difficulty levels and knowledge assumptions about the +causal graph. With the growing interest in using Large Language Models (LLMs) +to assist in scientific discovery, we benchmark open-source and closed models +on our testbed. We show the strong ability of LLMs to hypothesize the mediation +variables between a cause and its effect. In contrast, they underperform in +hypothesizing the cause and effect variables themselves. We also observe +surprising results where some of the open-source models outperform the closed +GPT-4 model. + +
+
+ comment: Code - https://github.com/ivaxi0s/hypothesizing-causal-variable-llm +
+
+
+
+
+ + ☆ A Fashion Item Recommendation Model in Hyperbolic Space CVPR 2024 + + +
+ In this work, we propose a fashion item recommendation model that +incorporates hyperbolic geometry into user and item representations. Using +hyperbolic space, our model aims to capture implicit hierarchies among items +based on their visual data and users' purchase history. During training, we +apply a multi-task learning framework that considers both hyperbolic and +Euclidean distances in the loss function. Our experiments on three data sets +show that our model performs better than previous models trained in Euclidean +space only, confirming the effectiveness of our model. Our ablation studies +show that multi-task learning plays a key role, and removing the Euclidean loss +substantially deteriorates the model performance. + +
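+
+ The hyperbolic half of such a model typically scores user-item pairs with the
+Poincare-ball geodesic distance; a minimal PyTorch sketch of that distance is
+shown below (the paper's multi-task Euclidean/hyperbolic loss is not
+reproduced here).
+ <pre>
+import torch
+
+def poincare_distance(u, v, eps=1e-5):
+    # d(u, v) = arcosh(1 + 2 * |u - v|^2 / ((1 - |u|^2) * (1 - |v|^2)))
+    # Small norms behave almost Euclidean; distances blow up near the boundary,
+    # which is what lets the ball encode hierarchies.
+    uu = u.pow(2).sum(-1)
+    vv = v.pow(2).sum(-1)
+    uv = (u - v).pow(2).sum(-1)
+    x = 1.0 + 2.0 * uv / ((1.0 - uu).clamp_min(eps) * (1.0 - vv).clamp_min(eps))
+    return torch.acosh(x.clamp_min(1.0 + eps))
+ </pre>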
+
+ comment: This work was presented at the CVFAD Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ An Analysis of Linear Complexity Attention Substitutes with BEST-RQ + + +
+ Self-Supervised Learning (SSL) has proven to be effective in various domains,
+including speech processing. However, SSL is computationally and memory
+expensive. This is in part due to the quadratic complexity of multi-head
+self-attention (MHSA). Alternatives for MHSA have been proposed and used in
+the speech domain, but have yet to be investigated properly in an SSL setting.
+In this work, we study the effects of replacing MHSA with recent
+state-of-the-art alternatives that have linear complexity, namely,
+HyperMixing, Fastformer, SummaryMixing, and Mamba. We evaluate these methods
+by looking at the speed, the amount of VRAM consumed, and the performance on
+the SSL MP3S benchmark. Results show that these linear alternatives maintain
+competitive performance compared to MHSA while, on average, decreasing VRAM
+consumption by around 20% to 60% and increasing speed by 7% to 65% for input
+sequences ranging from 20 to 80 seconds.
+
+
+
+ comment: Accepted in the IEEE Spoken Language Technology Workshop 2024
+
+
+
+
+
+ + ☆ Multiview Random Vector Functional Link Network for Predicting + DNA-Binding Proteins + + +
+ The identification of DNA-binding proteins (DBPs) is a critical task due to +their significant impact on various biological activities. Understanding the +mechanisms underlying protein-DNA interactions is essential for elucidating +various life activities. In recent years, machine learning-based models have +been prominently utilized for DBP prediction. In this paper, to predict DBPs, +we propose a novel framework termed a multiview random vector functional link +(MvRVFL) network, which fuses neural network architecture with multiview +learning. The proposed MvRVFL model combines the benefits of late and early +fusion, allowing for distinct regularization parameters across different views +while leveraging a closed-form solution to determine unknown parameters +efficiently. The primal objective function incorporates a coupling term aimed +at minimizing a composite of errors stemming from all views. From each of the +three protein views of the DBP datasets, we extract five features. These +features are then fused together by incorporating a hidden feature during the +model training process. The performance of the proposed MvRVFL model on the DBP +dataset surpasses that of baseline models, demonstrating its superior +effectiveness. Furthermore, we extend our assessment to the UCI, KEEL, AwA, and +Corel5k datasets, to establish the practicality of the proposed models. The +consistency error bound, the generalization error bound, and empirical +findings, coupled with rigorous statistical analyses, confirm the superior +generalization capabilities of the MvRVFL model compared to the baseline +models. + +
+
+
+
+
+ + ☆ BMI Prediction from Handwritten English Characters Using a Convolutional + Neural Network + + +
+ A person's Body Mass Index, or BMI, is the most widely used parameter for +assessing their health. BMI is a crucial predictor of potential diseases that +may arise at higher body fat levels because it is correlated with body fat. +Conversely, a community's or an individual's nutritional status can be +determined using the BMI. Although deep learning models are used in several +studies to estimate BMI from face photos and other data, no previous research +established a clear connection between deep learning techniques for handwriting +analysis and BMI prediction. This article addresses this research gap with a +deep learning approach to estimating BMI from handwritten characters by +developing a convolutional neural network (CNN). A dataset containing samples +from 48 people in lowercase English scripts is successfully captured for the +BMI prediction task. The proposed CNN-based approach reports a commendable +accuracy of 99.92%. Performance comparison with other popular CNN architectures +reveals that AlexNet and InceptionV3 achieve the second and third-best +performance, with the accuracy of 99.69% and 99.53%, respectively. + +
+
+
+
+
+ + ☆ Advancing Cyber Incident Timeline Analysis Through Rule Based AI and + Large Language Models + + +
+ Timeline Analysis (TA) is a key part of Timeline Forensics (TF) in Digital
+Forensics (DF), focusing primarily on examining and analysing temporal digital
+artefacts such as timestamps, derived from event logs, file metadata, and
+other related data to correlate events resulting from cyber incidents and
+reconstruct their chronological timeline. Traditional tools often struggle to
+efficiently process the vast volume and variety of data acquired during DF
+investigations and Incident Response (IR) processes. This paper presents a
+novel framework, GenDFIR, that combines Rule-Based Artificial Intelligence
+(R-BAI) algorithms with Large Language Models (LLMs) to advance and automate
+the TA process. Our approach consists of two main stages: (1) we use R-BAI to
+identify and select anomalous digital artefacts based on predefined rules; (2)
+the selected artefacts are then converted into embeddings for processing by an
+LLM with the help of a Retrieval-Augmented Generation (RAG) agent. The LLM
+consequently leverages its capabilities to perform automated TA on the
+artefacts and predict potential incident scenarios. To validate our framework,
+we evaluate GenDFIR's performance, efficiency, and reliability using various
+metrics across synthetic cyber incident simulation scenarios. This paper
+presents a proof of concept, where the findings demonstrate the significant
+potential of integrating R-BAI and LLMs for TA. This novel approach highlights
+the power of Generative AI (GenAI), specifically LLMs, and opens new avenues
+for advanced threat detection and incident reconstruction, representing a
+significant step forward in the field.
+
+
+
+ comment: 25 pages +
+
+
+
+
+ + ☆ Low-Resolution Object Recognition with Cross-Resolution Relational + Contrastive Distillation + + +
+ Recognizing objects in low-resolution images is a challenging task due to the +lack of informative details. Recent studies have shown that knowledge +distillation approaches can effectively transfer knowledge from a +high-resolution teacher model to a low-resolution student model by aligning +cross-resolution representations. However, these approaches still face +limitations in adapting to the situation where the recognized objects exhibit +significant representation discrepancies between training and testing images. +In this study, we propose a cross-resolution relational contrastive +distillation approach to facilitate low-resolution object recognition. Our +approach enables the student model to mimic the behavior of a well-trained +teacher model which delivers high accuracy in identifying high-resolution +objects. To extract sufficient knowledge, the student learning is supervised +with contrastive relational distillation loss, which preserves the similarities +in various relational structures in contrastive representation space. In this +manner, the capability of recovering missing details of familiar low-resolution +objects can be effectively enhanced, leading to a better knowledge transfer. +Extensive experiments on low-resolution object classification and +low-resolution face recognition clearly demonstrate the effectiveness and +adaptability of our approach. + +
+
+ comment: This paper is accepted by IEEE Transactions on Circuits and Systems + for Video Technology (TCSVT) +
+
+
+
+
+ + ☆ Understanding eGFR Trajectories and Kidney Function Decline via Large + Multimodal Models + + +
+ The estimated Glomerular Filtration Rate (eGFR) is an essential indicator of +kidney function in clinical practice. Although traditional equations and +Machine Learning (ML) models using clinical and laboratory data can estimate +eGFR, accurately predicting future eGFR levels remains a significant challenge +for nephrologists and ML researchers. Recent advances demonstrate that Large +Language Models (LLMs) and Large Multimodal Models (LMMs) can serve as robust +foundation models for diverse applications. This study investigates the +potential of LMMs to predict future eGFR levels with a dataset consisting of +laboratory and clinical values from 50 patients. By integrating various +prompting techniques and ensembles of LMMs, our findings suggest that these +models, when combined with precise prompts and visual representations of eGFR +trajectories, offer predictive performance comparable to existing ML models. +This research extends the application of foundation models and suggests avenues +for future studies to harness these models in addressing complex medical +forecasting challenges. + +
+
+ comment: This preprint version includes corrections of typographical errors + related to numerical values in Table 2, which were present in the version + published at the BDH workshop in MIPR 2024. These corrections do not affect + the overall conclusions of the study +
+
+
+
+
+
+ ☆ Sample what you can't compress
+
+
+
+ For learned image representations, basic autoencoders often produce blurry +results. Reconstruction quality can be improved by incorporating additional +penalties such as adversarial (GAN) and perceptual losses. Arguably, these +approaches lack a principled interpretation. Concurrently, in generative +settings diffusion has demonstrated a remarkable ability to create crisp, high +quality results and has solid theoretical underpinnings (from variational +inference to direct study as the Fisher Divergence). Our work combines +autoencoder representation learning with diffusion and is, to our knowledge, +the first to demonstrate the efficacy of jointly learning a continuous encoder +and decoder under a diffusion-based loss. We demonstrate that this approach +yields better reconstruction quality as compared to GAN-based autoencoders +while being easier to tune. We also show that the resulting representation is +easier to model with a latent diffusion model as compared to the representation +obtained from a state-of-the-art GAN-based loss. Since our decoder is +stochastic, it can generate details not encoded in the otherwise deterministic +latent representation; we therefore name our approach "Sample what you can't +compress", or SWYCC for short. + +
+
+
+
+
+ + ☆ Training Universal Vocoders with Feature Smoothing-Based Augmentation + Methods for High-Quality TTS Systems + + +
+ While universal vocoders have achieved proficient waveform generation across +diverse voices, their integration into text-to-speech (TTS) tasks often results +in degraded synthetic quality. To address this challenge, we present a novel +augmentation technique for training universal vocoders. Our training scheme +randomly applies linear smoothing filters to input acoustic features, +facilitating vocoder generalization across a wide range of smoothings. It +significantly mitigates the training-inference mismatch, enhancing the +naturalness of synthetic output even when the acoustic model produces overly +smoothed features. Notably, our method is applicable to any vocoder without +requiring architectural modifications or dependencies on specific acoustic +models. The experimental results validate the superiority of our vocoder over +conventional methods, achieving 11.99% and 12.05% improvements in mean opinion +scores when integrated with Tacotron 2 and FastSpeech 2 TTS acoustic models, +respectively. + +
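+
+ A minimal sketch of the kind of feature-smoothing augmentation described
+above, using a random-width box filter along the time axis of a mel
+spectrogram. The exact filter family and widths used in the paper may differ;
+smooth_features and max_width are illustrative.
+ <pre>
+import numpy as np
+
+def smooth_features(mel, max_width=5, rng=None):
+    # mel: [time, n_mels]; randomly smooth along time so the vocoder also sees
+    # the over-smoothed features an acoustic model tends to produce.
+    rng = rng or np.random.default_rng()
+    width = int(rng.integers(1, max_width + 1))
+    if width == 1:
+        return mel
+    kernel = np.ones(width) / width
+    return np.apply_along_axis(lambda t: np.convolve(t, kernel, mode="same"),
+                               axis=0, arr=mel)
+ </pre>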
+
+ comment: 4 pages, 4 figures, for demo samples, see + https://sytronik.github.io/demos/voc_smth_aug/ +
+
+
+
+
+ + ☆ Continual Diffuser (CoD): Mastering Continual Offline Reinforcement + Learning with Experience Rehearsal + + +
+ Artificial neural networks, especially recent diffusion-based models, have +shown remarkable superiority in gaming, control, and QA systems, where the +training tasks' datasets are usually static. However, in real-world +applications, such as robotic control of reinforcement learning (RL), the tasks +are changing, and new tasks arise in a sequential order. This situation poses +the new challenge of plasticity-stability trade-off for training an agent who +can adapt to task changes and retain acquired knowledge. In view of this, we +propose a rehearsal-based continual diffusion model, called Continual Diffuser +(CoD), to endow the diffuser with the capabilities of quick adaptation +(plasticity) and lasting retention (stability). Specifically, we first +construct an offline benchmark that contains 90 tasks from multiple domains. +Then, we train the CoD on each task with sequential modeling and conditional +generation for making decisions. Next, we preserve a small portion of previous +datasets as the rehearsal buffer and replay it to retain the acquired +knowledge. Extensive experiments on a series of tasks show CoD can achieve a +promising plasticity-stability trade-off and outperform existing +diffusion-based methods and other representative baselines on most tasks. + +
+
+
+
+
+ + ☆ CoAst: Validation-Free Contribution Assessment for Federated Learning + based on Cross-Round Valuation + + +
+ In the federated learning (FL) process, since the data held by each
+participant is different, it is necessary to figure out which participant has
+a higher contribution to the model performance. Effective contribution
+assessment can help motivate data owners to participate in the FL training.
+Research works in this field can be divided into two directions based on
+whether a validation dataset is required. Validation-based methods need to use
+representative validation data to measure the model accuracy, which is
+difficult to obtain in practical FL scenarios. Existing validation-free
+methods assess the contribution based on the parameters and gradients of local
+models and the global model in a single training round, which is easily
+compromised by the stochasticity of model training. In this work, we propose
+CoAst, a practical method to assess the FL participants' contribution without
+access to any validation data. The core idea of CoAst involves two aspects:
+one is to count only the most important part of the model parameters through
+weight quantization, and the other is a cross-round valuation based on the
+similarity between the current local parameters and the global parameter
+updates in several subsequent communication rounds. Extensive experiments show
+that CoAst has comparable assessment reliability to existing validation-based
+methods and outperforms existing validation-free methods.
+
+
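+
+ A toy sketch of the cross-round idea: quantize updates by keeping only the
+signs of their largest-magnitude entries, then score a client by how well its
+quantized round-t update aligns with the quantized global updates of the next
+few rounds. The function names and the sign/top-k rule are illustrative, not
+CoAst's exact procedure.
+ <pre>
+import numpy as np
+
+def sign_topk(delta, k_frac=0.1):
+    # Keep the sign of the largest-magnitude k% of entries, zero out the rest.
+    flat = delta.ravel()
+    k = max(1, int(k_frac * flat.size))
+    idx = np.argpartition(np.abs(flat), -k)[-k:]
+    out = np.zeros_like(flat)
+    out[idx] = np.sign(flat[idx])
+    return out
+
+def cross_round_score(local_update, future_global_updates, k_frac=0.1):
+    q_local = sign_topk(local_update, k_frac)
+    return float(np.mean([np.dot(q_local, sign_topk(g, k_frac)) / q_local.size
+                          for g in future_global_updates]))
+ </pre>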
+
+
+
+
+ + ☆ Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of + Data-Driven Optimization Routine + + +
+ Diffusion tensor imaging (DTI) holds significant importance in clinical
+diagnosis and neuroscience research. However, conventional model-based fitting
+methods often suffer from sensitivity to noise, leading to decreased accuracy
+in estimating DTI parameters. While traditional data-driven deep learning
+methods have shown potential in terms of accuracy and efficiency, their
+limited generalization to out-of-training-distribution data impedes their
+broader application due to the diverse scan protocols used across centers,
+scanners, and studies. This work aims to tackle these challenges and promote
+the use of DTI by introducing a data-driven optimization-based method termed
+DoDTI. DoDTI combines the weighted linear least squares fitting algorithm with
+the regularization-by-denoising technique. The former fits DW images from
+diverse acquisition settings into a diffusion tensor field, while the latter
+applies a deep learning-based denoiser to regularize the diffusion tensor
+field instead of the DW images, which is free from the limitation of
+fixed-channel assignment of the network. The optimization objective is solved
+using the alternating direction method of multipliers and then unrolled to
+construct a deep neural network, leveraging a data-driven strategy to learn
+network parameters. Extensive validation experiments are conducted utilizing
+both internally simulated datasets and externally obtained in-vivo datasets.
+The results, encompassing both qualitative and quantitative analyses, showcase
+that the proposed method attains state-of-the-art performance in DTI parameter
+estimation. Notably, it demonstrates superior generalization, accuracy, and
+efficiency, rendering it highly reliable for widespread application in the
+field.
+
+
+
+
+
+
+ + ☆ Adversarial Attacks on Machine Learning-Aided Visualizations + + +
+ Research in ML4VIS investigates how to use machine learning (ML) techniques +to generate visualizations, and the field is rapidly growing with high societal +impact. However, as with any computational pipeline that employs ML processes, +ML4VIS approaches are susceptible to a range of ML-specific adversarial +attacks. These attacks can manipulate visualization generations, causing +analysts to be tricked and their judgments to be impaired. Due to a lack of +synthesis from both visualization and ML perspectives, this security aspect is +largely overlooked by the current ML4VIS literature. To bridge this gap, we +investigate the potential vulnerabilities of ML-aided visualizations from +adversarial attacks using a holistic lens of both visualization and ML +perspectives. We first identify the attack surface (i.e., attack entry points) +that is unique in ML-aided visualizations. We then exemplify five different +adversarial attacks. These examples highlight the range of possible attacks +when considering the attack surface and multiple different adversary +capabilities. Our results show that adversaries can induce various attacks, +such as creating arbitrary and deceptive visualizations, by systematically +identifying input attributes that are influential in ML inferences. Based on +our observations of the attack surface characteristics and the attack examples, +we underline the importance of comprehensive studies of security issues and +defense mechanisms as a call of urgency for the ML4VIS community. + +
+
+ comment: This is the author's version of the article that has been accepted by + the Journal of Visualization +
+
+
+
+
+ + ☆ Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes + + +
+ High-quality real-time view synthesis methods are based on volume rendering, +splatting, or surface rendering. While surface-based methods generally are the +fastest, they cannot faithfully model fuzzy geometry like hair. In turn, +alpha-blending techniques excel at representing fuzzy materials but require an +unbounded number of samples per ray (P1). Further overheads are induced by +empty space skipping in volume rendering (P2) and sorting input primitives in +splatting (P3). These problems are exacerbated on low-performance graphics +hardware, e.g. on mobile devices. We present a novel representation for +real-time view synthesis where the (P1) number of sampling locations is small +and bounded, (P2) sampling locations are efficiently found via rasterization, +and (P3) rendering is sorting-free. We achieve this by representing objects as +semi-transparent multi-layer meshes, rendered in fixed layer order from +outermost to innermost. We model mesh layers as SDF shells with optimal spacing +learned during training. After baking, we fit UV textures to the corresponding +meshes. We show that our method can represent challenging fuzzy objects while +achieving higher frame rates than volume-based and splatting-based methods on +low-end and mobile devices. + +
+
+
+
+
+ + ☆ Demographic parity in regression and classification within the + unawareness framework + + +
+ This paper explores the theoretical foundations of fair regression under the +constraint of demographic parity within the unawareness framework, where +disparate treatment is prohibited, extending existing results where such +treatment is permitted. Specifically, we aim to characterize the optimal fair +regression function when minimizing the quadratic loss. Our results reveal that +this function is given by the solution to a barycenter problem with optimal +transport costs. Additionally, we study the connection between optimal fair +cost-sensitive classification, and optimal fair regression. We demonstrate that +nestedness of the decision sets of the classifiers is both necessary and +sufficient to establish a form of equivalence between classification and +regression. Under this nestedness assumption, the optimal classifiers can be +derived by applying thresholds to the optimal fair regression function; +conversely, the optimal fair regression function is characterized by the family +of cost-sensitive classifiers. + +
+
+
+
+
+ + ☆ ForeCal: Random Forest-based Calibration for DNNs + + +
+ Deep neural network (DNN) based classifiers do extremely well in
+discriminating between observations, resulting in higher ROC AUC and accuracy
+metrics, but their outputs are often miscalibrated with respect to true event
+likelihoods. Post-hoc calibration algorithms are often used to calibrate the
+outputs of these classifiers. Methods like Isotonic regression, Platt scaling,
+and Temperature scaling have been shown to be effective in some cases but are
+limited by their parametric assumptions and/or their inability to capture
+complex non-linear relationships. We propose ForeCal - a novel post-hoc
+calibration algorithm based on Random forests. ForeCal exploits two unique
+properties of Random forests: the ability to enforce weak monotonicity and
+range-preservation. It is more powerful in achieving calibration than current
+state-of-the-art methods, is non-parametric, and can incorporate exogenous
+information as features to learn a better calibration function. Through
+experiments on 43 diverse datasets from the UCI ML repository, we show that
+ForeCal outperforms existing methods in terms of Expected Calibration Error
+(ECE) with minimal impact on the discriminative power of the base DNN as
+measured by AUC.
+
+
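+
+ A bare-bones version of the idea, fitting a random forest from the DNN's
+uncalibrated score (plus optional exogenous features) to the observed label
+and clipping the output to [0, 1]. ForeCal's weak-monotonicity and
+range-preservation machinery is not reproduced here; the class name and
+hyperparameters are illustrative.
+ <pre>
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+
+class ForestCalibrator:
+    def __init__(self, **rf_kwargs):
+        self.rf = RandomForestRegressor(n_estimators=200, min_samples_leaf=50,
+                                        **rf_kwargs)
+
+    def fit(self, scores, labels, extra=None):
+        X = scores.reshape(-1, 1) if extra is None else np.column_stack([scores, extra])
+        self.rf.fit(X, labels)
+        return self
+
+    def predict(self, scores, extra=None):
+        X = scores.reshape(-1, 1) if extra is None else np.column_stack([scores, extra])
+        return np.clip(self.rf.predict(X), 0.0, 1.0)
+ </pre>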
+
+
+
+
+ + ☆ Adversarial Learning for Neural PDE Solvers with Sparse Data + + +
+ Neural network solvers for partial differential equations (PDEs) have made +significant progress, yet they continue to face challenges related to data +scarcity and model robustness. Traditional data augmentation methods, which +leverage symmetry or invariance, impose strong assumptions on physical systems +that often do not hold in dynamic and complex real-world applications. To +address this research gap, this study introduces a universal learning strategy +for neural network PDEs, named Systematic Model Augmentation for Robust +Training (SMART). By focusing on challenging and improving the model's +weaknesses, SMART reduces generalization error during training under +data-scarce conditions, leading to significant improvements in prediction +accuracy across various PDE scenarios. The effectiveness of the proposed method +is demonstrated through both theoretical analysis and extensive +experimentation. The code will be available. + +
+
+
+
+
+
+ ☆ Transfer-based Adversarial Poisoning Attacks for Online (MIMO-)Deep
+ Receivers
+
+
+
+ Recently, the design of wireless receivers using deep neural networks (DNNs),
+known as deep receivers, has attracted extensive attention for ensuring
+reliable communication in complex channel environments. To adapt quickly to
+dynamic channels, online learning has been adopted to update the weights of
+deep receivers with over-the-air data (e.g., pilots). However, the fragility
+of neural models and the openness of wireless channels expose these systems to
+malicious attacks. To this end, understanding these attack methods is
+essential for robust receiver design. In this paper, we propose a
+transfer-based adversarial poisoning attack method for online receivers.
+Without knowledge of the attack target, adversarial perturbations are injected
+into the pilots, poisoning the online deep receiver and impairing its ability
+to adapt to dynamic channels and nonlinear effects. In particular, our attack
+method targets Deep Soft Interference Cancellation (DeepSIC) [1] using online
+meta-learning. As a classical model-driven deep receiver, DeepSIC incorporates
+wireless domain knowledge into its architecture. This integration allows it to
+adapt efficiently to time-varying channels with only a small number of pilots,
+achieving optimal performance in a multi-input and multi-output (MIMO)
+scenario. The deep receiver in this scenario has a number of applications in
+the field of wireless communication, which motivates our study of the attack
+methods targeting it. Specifically, we demonstrate the effectiveness of our
+attack in simulations on synthetic linear, synthetic nonlinear, static, and
+COST 2100 channels. Simulation results indicate that the proposed poisoning
+attack significantly reduces the performance of online receivers in rapidly
+changing scenarios.
+
+
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ☆ Large Language Models as Efficient Reward Function Searchers for + Custom-Environment Multi-Objective Reinforcement Learning + + +
+ Leveraging large language models (LLMs) for designing reward functions
+demonstrates significant potential. However, achieving effective design and
+improvement of reward functions in reinforcement learning (RL) tasks with
+complex custom environments and multiple requirements presents considerable
+challenges. In this paper, we enable LLMs to be effective white-box searchers,
+highlighting their advanced semantic understanding capabilities. Specifically,
+we generate reward components for each explicit user requirement and employ the
+reward critic to identify the correct code form. Then, LLMs assign weights to
+the reward components to balance their values and iteratively search and
+optimize these weights based on the context provided by the training log
+analyzer, while adaptively determining the search step size. We applied the
+framework to an underwater information collection RL task without direct human
+feedback or reward examples (zero-shot). The reward critic successfully
+corrects the reward code with only a single round of feedback for each
+requirement, effectively preventing irreparable errors that can occur when
+reward function feedback is provided in aggregate. The effective initialization
+of weights enables the acquisition of different reward functions within the
+Pareto solution set without weight search. Even in the case where a weight is
+100 times off, fewer than four iterations are needed to obtain solutions that
+meet user requirements. The framework also works well with most prompts
+utilizing GPT-3.5 Turbo, since it does not require advanced numerical
+understanding or calculation.
+
+
+
+
+
+ + ☆ Diffusion Models Learn Low-Dimensional Distributions via Subspace + Clustering + + +
+ Recent empirical studies have demonstrated that diffusion models can +effectively learn the image distribution and generate new samples. Remarkably, +these models can achieve this even with a small number of training samples +despite a large image dimension, circumventing the curse of dimensionality. In +this work, we provide theoretical insights into this phenomenon by leveraging +key empirical observations: (i) the low intrinsic dimensionality of image data, +(ii) a union of manifold structure of image data, and (iii) the low-rank +property of the denoising autoencoder in trained diffusion models. These +observations motivate us to assume the underlying data distribution of image +data as a mixture of low-rank Gaussians and to parameterize the denoising +autoencoder as a low-rank model according to the score function of the assumed +distribution. With these setups, we rigorously show that optimizing the +training loss of diffusion models is equivalent to solving the canonical +subspace clustering problem over the training samples. Based on this +equivalence, we further show that the minimal number of samples required to +learn the underlying distribution scales linearly with the intrinsic dimensions +under the above data and model assumptions. This insight sheds light on why +diffusion models can break the curse of dimensionality and exhibit the phase +transition in learning distributions. Moreover, we empirically establish a +correspondence between the subspaces and the semantic representations of image +data, facilitating image editing. We validate these results with corroborated +experimental results on both simulated distributions and image datasets. + +
+
+ comment: 39 pages, 9 figures +
+
+
+
+
+ + ☆ Deep Adaptive Interest Network: Personalized Recommendation with + Context-Aware Learning + + +
+ In personalized recommendation systems, accurately capturing users' evolving +interests and combining them with contextual information is a critical research +area. This paper proposes a novel model called the Deep Adaptive Interest +Network (DAIN), which dynamically models users' interests while incorporating +context-aware learning mechanisms to achieve precise and adaptive personalized +recommendations. DAIN leverages deep learning techniques to build an adaptive +interest network structure that can capture users' interest changes in +real-time while further optimizing recommendation results by integrating +contextual information. Experiments conducted on several public datasets +demonstrate that DAIN excels in both recommendation performance and +computational efficiency. This research not only provides a new solution for +personalized recommendation systems but also offers fresh insights into the +application of context-aware learning in recommendation systems. + +
+
+
+
+
+ + ☆ Relative-Translation Invariant Wasserstein Distance + + +
+ We introduce a new family of distances, relative-translation invariant +Wasserstein distances ($RW_p$), for measuring the similarity of two probability +distributions under distribution shift. Generalizing it from the classical +optimal transport model, we show that $RW_p$ distances are also real distance +metrics defined on the quotient set $\mathcal{P}_p(\mathbb{R}^n)/\sim$ and +invariant to distribution translations. When $p=2$, the $RW_2$ distance enjoys +more exciting properties, including decomposability of the optimal transport +model, translation-invariance of the $RW_2$ distance, and a Pythagorean +relationship between $RW_2$ and the classical quadratic Wasserstein distance +($W_2$). Based on these properties, we show that a distribution shift, measured +by $W_2$ distance, can be explained in the bias-variance perspective. In +addition, we propose a variant of the Sinkhorn algorithm, named $RW_2$ Sinkhorn +algorithm, for efficiently calculating $RW_2$ distance, coupling solutions, as +well as $W_2$ distance. We also provide the analysis of numerical stability and +time complexity for the proposed algorithm. Finally, we validate the $RW_2$ +distance metric and the algorithm performance with three experiments. We +conduct one numerical validation for the $RW_2$ Sinkhorn algorithm and show two +real-world applications demonstrating the effectiveness of using $RW_2$ under +distribution shift: digits recognition and similar thunderstorm detection. The +experimental results report that our proposed algorithm significantly improves +the computational efficiency of Sinkhorn in certain practical applications, and +the $RW_2$ distance is robust to distribution translations compared with +baselines. + +
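The stated Pythagorean relationship can be checked numerically if the translation-invariant part is read as the classical $W_2$ between mean-centered samples. The sketch below assumes the POT library (`ot`) and uses an exact transport solve rather than the paper's $RW_2$ Sinkhorn variant.

```python
import numpy as np
import ot  # POT: Python Optimal Transport

def w2_squared(x, y):
    """Exact squared W2 between two equally weighted empirical distributions."""
    a = np.full(len(x), 1.0 / len(x))
    b = np.full(len(y), 1.0 / len(y))
    return ot.emd2(a, b, ot.dist(x, y))   # ot.dist defaults to the squared Euclidean cost

rng = np.random.default_rng(0)
x = rng.normal(size=(300, 2))
y = rng.normal(size=(300, 2)) + np.array([5.0, -3.0])   # translated copy

rw2_sq = w2_squared(x - x.mean(0), y - y.mean(0))        # translation-invariant part
shift_sq = np.sum((x.mean(0) - y.mean(0)) ** 2)          # squared distance between the means

# Pythagorean relation: W2^2 should match RW2^2 + ||mean shift||^2 (up to sampling noise).
print(w2_squared(x, y), rw2_sq + shift_sq)
```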
+
+
+
+
+ + ☆ Abstractive Text Summarization: State of the Art, Challenges, and + Improvements + + +
+ Specifically focusing on the landscape of abstractive text summarization, as
+opposed to extractive techniques, this survey presents a comprehensive
+overview, delving into state-of-the-art techniques, prevailing challenges, and
+prospective research directions. We categorize the techniques into traditional
+sequence-to-sequence models, pre-trained large language models, reinforcement
+learning, hierarchical methods, and multi-modal summarization. Unlike prior
+works that did not examine complexities, scalability, and comparisons of
+techniques in detail, this review takes a comprehensive approach encompassing
+state-of-the-art methods, challenges, solutions, comparisons, limitations, and
+future improvements, providing researchers with an extensive overview to
+advance abstractive summarization research. We provide vital comparison tables
+across the categorized techniques, offering insights into model complexity,
+scalability, and appropriate applications. The paper highlights challenges such
+as inadequate meaning representation, factual consistency, controllable text
+summarization, cross-lingual summarization, and evaluation metrics, among
+others. Solutions leveraging knowledge incorporation and other innovative
+strategies are proposed to address these challenges. The paper concludes by
+highlighting emerging research areas like factual inconsistency,
+domain-specific, cross-lingual, multilingual, and long-document summarization,
+as well as handling noisy data. Our objective is to provide researchers and
+practitioners with a structured overview of the domain, enabling them to better
+understand the current landscape and identify potential areas for further
+research and improvement.
+
+
+ comment: 9 Tables, 7 Figures +
+
+
+
+
+ + ☆ Adaptive Class Emergence Training: Enhancing Neural Network Stability + and Generalization through Progressive Target Evolution + + +
+ Recent advancements in artificial intelligence, particularly deep neural +networks, have pushed the boundaries of what is achievable in complex tasks. +Traditional methods for training neural networks in classification problems +often rely on static target outputs, such as one-hot encoded vectors, which can +lead to unstable optimization and difficulties in handling non-linearities +within data. In this paper, we propose a novel training methodology that +progressively evolves the target outputs from a null vector to one-hot encoded +vectors throughout the training process. This gradual transition allows the +network to adapt more smoothly to the increasing complexity of the +classification task, maintaining an equilibrium state that reduces the risk of +overfitting and enhances generalization. Our approach, inspired by concepts +from structural equilibrium in finite element analysis, has been validated +through extensive experiments on both synthetic and real-world datasets. The +results demonstrate that our method achieves faster convergence, improved +accuracy, and better generalization, especially in scenarios with high data +complexity and noise. This progressive training framework offers a robust +alternative to classical methods, opening new perspectives for more efficient +and stable neural network training. + +
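One plausible reading of the progressive-target idea, sketched in PyTorch: targets start as a null (zero) vector and are blended linearly toward the one-hot vector as training proceeds. The ramp schedule and the MSE objective are illustrative assumptions, not the paper's exact recipe.

```python
import torch
import torch.nn.functional as F

def evolving_targets(labels, num_classes, epoch, total_epochs):
    """Blend targets from a null vector (start) toward the one-hot vector (later epochs)."""
    alpha = min(1.0, epoch / (0.5 * total_epochs))   # assumed ramp: fully one-hot halfway through
    return alpha * F.one_hot(labels, num_classes).float()

def training_step(model, x, labels, num_classes, epoch, total_epochs, optimizer):
    targets = evolving_targets(labels, num_classes, epoch, total_epochs)
    probs = torch.sigmoid(model(x))
    loss = F.mse_loss(probs, targets)   # MSE against the evolving soft targets (an assumption)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```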
+
+ comment: 15 pages, 9 figures, 2 tables +
+
+
+
+
+ + ☆ Learning Privacy-Preserving Student Networks via + Discriminative-Generative Distillation + + +
+ While deep models have proved successful in learning rich knowledge from
+massive well-annotated data, they may pose a privacy leakage risk in practical
+deployment. It is necessary to find an effective trade-off between high utility
+and strong privacy. In this work, we propose a discriminative-generative
+distillation approach to learn privacy-preserving deep models. Our key idea is
+to take models as a bridge to distill knowledge from private data and then
+transfer it to learn a student network via two streams. First, the
+discriminative stream trains a baseline classifier on private data and an
+ensemble of teachers on multiple disjoint private subsets, respectively. Then,
+the generative stream takes the classifier as a fixed discriminator and trains
+a generator in a data-free manner. After that, the generator is used to
+generate massive synthetic data which are further applied to train a
+variational autoencoder (VAE). Among these synthetic data, a few of them are
+fed into the teacher ensemble to query labels via differentially private
+aggregation, while most of them are embedded to the trained VAE for
+reconstructing synthetic data. Finally, semi-supervised student learning is
+performed to simultaneously handle two tasks: knowledge transfer from the
+teachers with distillation on few privately labeled synthetic data, and
+knowledge enhancement with tangent-normal adversarial regularization on many
+triples of reconstructed synthetic data. In this way, our approach can control
+query cost over private data and mitigate accuracy degradation in a unified
+manner, leading to a privacy-preserving student model. Extensive experiments
+and analysis clearly show the effectiveness of the proposed approach.
+
+
+ comment: This paper is accepted by IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ Building Math Agents with Multi-Turn Iterative Preference Learning + + +
+ Recent studies have shown that large language models' (LLMs) mathematical +problem-solving capabilities can be enhanced by integrating external tools, +such as code interpreters, and employing multi-turn Chain-of-Thought (CoT) +reasoning. While current methods focus on synthetic data generation and +Supervised Fine-Tuning (SFT), this paper studies the complementary direct +preference learning approach to further improve model performance. However, +existing direct preference learning algorithms are originally designed for the +single-turn chat task, and do not fully address the complexities of multi-turn +reasoning and external tool integration required for tool-integrated +mathematical reasoning tasks. To fill in this gap, we introduce a multi-turn +direct preference learning framework, tailored for this context, that leverages +feedback from code interpreters and optimizes trajectory-level preferences. +This framework includes multi-turn DPO and multi-turn KTO as specific +implementations. The effectiveness of our framework is validated through +training of various language models using an augmented prompt set from the +GSM8K and MATH datasets. Our results demonstrate substantial improvements: a +supervised fine-tuned Gemma-1.1-it-7B model's performance increased from 77.5% +to 83.9% on GSM8K and from 46.1% to 51.2% on MATH. Similarly, a Gemma-2-it-9B +model improved from 84.1% to 86.3% on GSM8K and from 51.0% to 54.5% on MATH. + +
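At its core, a trajectory-level preference objective of this kind can be written as a DPO-style loss in which per-token log-probability ratios are summed only over model-generated turns, with prompts and tool or interpreter outputs masked out. The sketch below is a generic form under that assumption, not the paper's exact implementation.

```python
import torch.nn.functional as F

def multi_turn_dpo_loss(logp_policy_w, logp_ref_w, logp_policy_l, logp_ref_l,
                        mask_w, mask_l, beta=0.1):
    """
    logp_*: per-token log-probabilities of the chosen (w) / rejected (l) trajectories
            under the policy and the frozen reference model, each of shape (T,).
    mask_*: 1.0 for tokens in model-generated turns, 0.0 for prompts and tool outputs,
            so only the parts the policy actually produced contribute to the preference.
    """
    ratio_w = ((logp_policy_w - logp_ref_w) * mask_w).sum()
    ratio_l = ((logp_policy_l - logp_ref_l) * mask_l).sum()
    return -F.logsigmoid(beta * (ratio_w - ratio_l))
```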
+
+ comment: A multi-turn direct preference learning framework for tool-integrated + reasoning tasks +
+
+
+
+
+ + ☆ Gaussian Rate-Distortion-Perception Coding and Entropy-Constrained + Scalar Quantization + + +
+ This paper investigates the best known bounds on the quadratic Gaussian +distortion-rate-perception function with limited common randomness for the +Kullback-Leibler divergence-based perception measure, as well as their +counterparts for the squared Wasserstein-2 distance-based perception measure, +recently established by Xie et al. These bounds are shown to be nondegenerate +in the sense that they cannot be deduced from each other via a refined version +of Talagrand's transportation inequality. On the other hand, an improved lower +bound is established when the perception measure is given by the squared +Wasserstein-2 distance. In addition, it is revealed by exploiting the +connection between rate-distortion-perception coding and entropy-constrained +scalar quantization that all the aforementioned bounds are generally not tight +in the weak perception constraint regime. + +
+
+
+
+
+ + ☆ Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable + Image Editing + + +
+ Recently, diffusion models have emerged as a powerful class of generative +models. Despite their success, there is still limited understanding of their +semantic spaces. This makes it challenging to achieve precise and disentangled +image generation without additional training, especially in an unsupervised +way. In this work, we improve the understanding of their semantic spaces from +intriguing observations: among a certain range of noise levels, (1) the learned +posterior mean predictor (PMP) in the diffusion model is locally linear, and +(2) the singular vectors of its Jacobian lie in low-dimensional semantic +subspaces. We provide a solid theoretical basis to justify the linearity and +low-rankness in the PMP. These insights allow us to propose an unsupervised, +single-step, training-free LOw-rank COntrollable image editing (LOCO Edit) +method for precise local editing in diffusion models. LOCO Edit identified +editing directions with nice properties: homogeneity, transferability, +composability, and linearity. These properties of LOCO Edit benefit greatly +from the low-dimensional semantic subspace. Our method can further be extended +to unsupervised or text-supervised editing in various text-to-image diffusion +models (T-LOCO Edit). Finally, extensive empirical experiments demonstrate the +effectiveness and efficiency of LOCO Edit. The codes will be released at +https://github.com/ChicyChen/LOCO-Edit. + +
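A toy illustration of the Jacobian idea, with a randomly initialized stand-in for the posterior mean predictor: form the Jacobian at a noisy input, take its SVD, and use a top right singular vector as a local editing direction. Real image-scale models would need Jacobian-vector products instead of the dense Jacobian formed here, and the network, dimension, and edit strength are placeholders.

```python
import torch

torch.manual_seed(0)
d = 64                                             # toy dimension; real images are far larger
pmp = torch.nn.Sequential(                         # stand-in posterior mean predictor x_t -> E[x_0 | x_t]
    torch.nn.Linear(d, 128), torch.nn.SiLU(), torch.nn.Linear(128, d))

x_t = torch.randn(d)
J = torch.autograd.functional.jacobian(pmp, x_t)   # dense Jacobian, affordable only at toy scale
U, S, Vh = torch.linalg.svd(J)

v = Vh[0]                                          # top right singular vector: a local editing direction
x_edited = x_t + 3.0 * v                           # single-step, training-free edit of the noisy input
print(S[:5] / S.sum())                             # inspect the spectrum; rapid decay signals low rank
```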
+
+
+
+
+ + ☆ Optimal Neural Network Approximation for High-Dimensional Continuous + Functions + + +
+ Recently, Shen, Yang, and Zhang (JMLR, 2022) developed a neural
+network with width $36d(2d + 1)$ and depth $11$, which utilizes a special
+activation function called the elementary universal activation function, to
+achieve the super approximation property for functions in $C([a,b]^d)$. That
+is, the constructed network only requires a fixed number of neurons to
+approximate a $d$-variate continuous function on a $d$-dimensional hypercube
+with arbitrary accuracy. Their network uses $\mathcal{O}(d^2)$ fixed neurons.
+One natural question to address is whether we can reduce the number of these
+neurons in such a network. By leveraging a variant of the Kolmogorov
+Superposition Theorem, our analysis shows that there is a neural network
+generated by the elementary universal activation function with only $366d + 365$
+fixed, intrinsic (non-repeated) neurons that attains this super approximation
+property. Furthermore, we present a family of continuous functions that
+requires at least width $d$, and therefore at least $d$ intrinsic neurons, to
+achieve arbitrary accuracy in its approximation. This shows that the
+requirement of $\mathcal{O}(d)$ intrinsic neurons is optimal in the sense that
+it grows linearly with the input dimension $d$, unlike some approximation
+methods where parameters may grow exponentially with $d$.
+
+
+
+
+
+ + ☆ Machine Learning Applications to Computational Plasma Physics and + Reduced-Order Plasma Modeling: A Perspective + + +
+ Machine learning (ML) provides a broad spectrum of tools and architectures +that enable the transformation of data from simulations and experiments into +useful and explainable science, thereby augmenting domain knowledge. +Furthermore, ML-enhanced numerical modelling can revamp scientific computing +for real-world complex engineering systems, creating unique opportunities to +examine the operation of the technologies in detail and automate their +optimization and control. In recent years, ML applications have seen +significant growth across various scientific domains, particularly in fluid +mechanics, where ML has shown great promise in enhancing computational modeling +of fluid flows. In contrast, ML applications in numerical plasma physics +research remain relatively limited in scope and extent. Despite this, the close +relationship between fluid mechanics and plasma physics presents a valuable +opportunity to create a roadmap for transferring ML advances in fluid flow +modeling to computational plasma physics. This Perspective aims to outline such +a roadmap. We begin by discussing some general fundamental aspects of ML, +including the various categories of ML algorithms and the different types of +problems that can be solved with the help of ML. With regard to each problem +type, we then present specific examples from the use of ML in computational +fluid dynamics, reviewing several insightful prior efforts. We also review +recent ML applications in plasma physics for each problem type. The paper +discusses promising future directions and development pathways for ML in plasma +modelling within the different application areas. Additionally, we point out +prominent challenges that must be addressed to realize ML's full potential in +computational plasma physics, including the need for cost-effective +high-fidelity simulation tools for extensive data generation. + +
+
+ comment: 42 pages, 20 figures +
+
+
+
+
+ + ☆ Understanding the Role of Functional Diversity in Weight-Ensembling with + Ingredient Selection and Multidimensional Scaling ICML 2024 + + +
+ Weight-ensembles are formed when the parameters of multiple neural networks
+are directly averaged into a single model. They have demonstrated
+generalization capability in-distribution (ID) and out-of-distribution (OOD)
+which is not completely understood, though they are thought to successfully
+exploit functional diversity allotted by each distinct model. Given a
+collection of models, it is also unclear which combination leads to the optimal
+weight-ensemble; the SOTA is a linear-time ``greedy" method. We introduce two
+novel weight-ensembling approaches to study the link between performance
+dynamics and how each method decides to apply the functionally diverse
+components, akin to diversity-encouragement in the prediction-ensemble
+literature. We develop a visualization tool to explain how each algorithm
+explores various domains defined via pairwise-distances to further investigate
+selection and the algorithms' convergence. Empirical analyses shed light on how
+high diversity enhances weight-ensembling while qualifying the extent to which
+diversity alone improves accuracy. We also demonstrate that sampling
+positionally distinct models can contribute just as meaningfully to
+improvements in a weight-ensemble.
+
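For context, the linear-time ``greedy" baseline mentioned above can be sketched as a model-soup style procedure over checkpoints; the two novel approaches introduced in the paper are not reproduced here.

```python
import copy
import torch

def average_state_dicts(state_dicts):
    """Uniformly average checkpoints sharing one architecture (floating-point tensors assumed)."""
    avg = copy.deepcopy(state_dicts[0])
    for key in avg:
        avg[key] = torch.stack([sd[key].float() for sd in state_dicts]).mean(dim=0)
    return avg

def greedy_weight_ensemble(model, candidates, evaluate):
    """Add checkpoints one at a time, keeping each only if held-out performance does not drop."""
    chosen = [candidates[0]]                       # candidates ideally sorted by individual accuracy
    model.load_state_dict(average_state_dicts(chosen))
    best = evaluate(model)
    for sd in candidates[1:]:
        model.load_state_dict(average_state_dicts(chosen + [sd]))
        score = evaluate(model)
        if score >= best:
            chosen.append(sd)
            best = score
        else:
            model.load_state_dict(average_state_dicts(chosen))   # revert to the previous soup
    return model, best
```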
+
+ comment: Published at the ICML 2024 (Vienna, Austria) Workshop on Foundation + Models in the Wild +
+
+
+
+
+ + ☆ Robust Federated Finetuning of Foundation Models via Alternating + Minimization of LoRA ICML2024 + + +
+ Parameter-Efficient Fine-Tuning (PEFT) has risen as an innovative training +strategy that updates only a select few model parameters, significantly +lowering both computational and memory demands. PEFT also helps to decrease +data transfer in federated learning settings, where communication depends on +the size of updates. In this work, we explore the constraints of previous +studies that integrate a well-known PEFT method named LoRA with federated +fine-tuning, then introduce RoLoRA, a robust federated fine-tuning framework +that utilizes an alternating minimization approach for LoRA, providing greater +robustness against decreasing fine-tuning parameters and increasing data +heterogeneity. Our results indicate that RoLoRA not only presents the +communication benefits but also substantially enhances the robustness and +effectiveness in multiple federated fine-tuning scenarios. + +
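A minimal sketch of alternating minimization for a LoRA adapter across federated rounds: only the A factor is trainable in one round and only the B factor in the next. The freezing schedule, rank, and initialization below are assumptions for illustration, not RoLoRA's exact protocol.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base weight plus a trainable low-rank update B @ A."""
    def __init__(self, base: nn.Linear, rank: int = 8):
        super().__init__()
        self.base = base.requires_grad_(False)
        self.A = nn.Parameter(0.01 * torch.randn(rank, base.in_features))
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))

    def forward(self, x):
        return self.base(x) + x @ self.A.t() @ self.B.t()

def set_alternating_phase(model, round_idx):
    """Alternate which LoRA factor the clients train in a given federated round."""
    train_A = (round_idx % 2 == 0)
    for m in model.modules():
        if isinstance(m, LoRALinear):
            m.A.requires_grad_(train_A)
            m.B.requires_grad_(not train_A)

# Usage per round: set_alternating_phase(model, round_idx); clients fine-tune locally and the
# server aggregates only the factor that was trainable in that round.
```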
+
+ comment: Presented at ES-FOMO-II@ICML2024 +
+
+
+
+
+ + ☆ NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for + Retrieval + + +
+ $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval) +from pre-trained embedding models is the predominant retrieval method for text +and images, as well as Retrieval-Augmented Generation (RAG) pipelines. In +practice, application developers often fine-tune the embeddings to improve +their accuracy on the dataset and query workload in hand. Existing approaches +either fine-tune the pre-trained model itself or, more efficiently, but at the +cost of accuracy, train adaptor models to transform the output of the +pre-trained model. We present NUDGE, a family of novel non-parametric embedding +fine-tuning approaches that are significantly more accurate and efficient than +both sets of existing approaches. NUDGE directly modifies the embeddings of +data records to maximize the accuracy of $k$-NN retrieval. We present a +thorough theoretical and experimental study of NUDGE's non-parametric approach. +We show that even though the underlying problem is NP-Hard, constrained +variations can be solved efficiently. These constraints additionally ensure +that the changes to the embeddings are modest, avoiding large distortions to +the semantics learned during pre-training. In experiments across five +pre-trained models and nine standard text and image retrieval datasets, NUDGE +runs in minutes and often improves NDCG@10 by more than 10% over existing +fine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase +in accuracy and runs 200x and 3x faster, respectively, over fine-tuning the +pre-trained model and training adaptors. + +
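A rough sketch of the non-parametric idea: move each data record's embedding a bounded step toward the training queries it should answer, leaving the embedding model untouched. The update rule and norm cap are illustrative assumptions rather than NUDGE's constrained solver.

```python
import numpy as np

def nudge_embeddings(doc_emb, query_emb, positives, step=0.1, max_shift=0.2):
    """
    doc_emb:   (N, d) data-record embeddings to be adjusted.
    query_emb: (Q, d) training-query embeddings (kept fixed).
    positives: iterable of (query_index, doc_index) relevance pairs.
    """
    shift = np.zeros_like(doc_emb)
    for q, d in positives:
        shift[d] += query_emb[q] - doc_emb[d]                 # pull each record toward its queries
    norms = np.linalg.norm(shift, axis=1, keepdims=True) + 1e-12
    shift *= np.minimum(1.0, max_shift / norms)               # keep the change to each record modest
    nudged = doc_emb + step * shift
    return nudged / np.linalg.norm(nudged, axis=1, keepdims=True)   # re-normalize for cosine k-NN
```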
+
+
+
+
+ + ☆ Optimal sampling for least-squares approximation + + +
+ Least-squares approximation is one of the most important methods for +recovering an unknown function from data. While in many applications the data +is fixed, in many others there is substantial freedom to choose where to +sample. In this paper, we review recent progress on optimal sampling for +(weighted) least-squares approximation in arbitrary linear spaces. We introduce +the Christoffel function as a key quantity in the analysis of (weighted) +least-squares approximation from random samples, then show how it can be used +to construct sampling strategies that possess near-optimal sample complexity: +namely, the number of samples scales log-linearly in $n$, the dimension of the +approximation space. We discuss a series of variations, extensions and further +topics, and throughout highlight connections to approximation theory, machine +learning, information-based complexity and numerical linear algebra. Finally, +motivated by various contemporary applications, we consider a generalization of +the classical setting where the samples need not be pointwise samples of a +scalar-valued function, and the approximation space need not be linear. We show +that even in this significantly more general setting suitable generalizations +of the Christoffel function still determine the sample complexity. This +provides a unified procedure for designing improved sampling strategies for +general recovery problems. This article is largely self-contained, and intended +to be accessible to nonspecialists. + +
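A compact numerical illustration of Christoffel-function sampling for least-squares approximation on $[-1,1]$ with an orthonormal Legendre basis: sample proportionally to $k_n(x)=\sum_i|\phi_i(x)|^2$ and reweight the least-squares system by $n/k_n(x)$. The basis, grid discretization, target function, and sample budget are choices made for the sketch.

```python
import numpy as np
from numpy.polynomial import legendre

n = 20                                        # dimension of the approximation space
grid = np.linspace(-1.0, 1.0, 20001)          # fine grid standing in for the domain

def basis_matrix(x):
    """Legendre basis, orthonormal w.r.t. the uniform probability measure on [-1, 1]."""
    cols = [np.sqrt(2 * k + 1) * legendre.legval(x, [0.0] * k + [1.0]) for k in range(n)]
    return np.stack(cols, axis=1)

K = (basis_matrix(grid) ** 2).sum(axis=1)     # (reciprocal) Christoffel function sum_i |phi_i(x)|^2

rng = np.random.default_rng(0)
m = int(2 * n * np.log(n))                    # log-linear sample budget
idx = rng.choice(len(grid), size=m, p=K / K.sum())   # sample proportionally to K
xs = grid[idx]

f = lambda x: np.exp(np.sin(3 * x))           # target function to recover
w = np.sqrt(n / K[idx])                       # weights n / K(x), applied as square roots on the rows
coef, *_ = np.linalg.lstsq(w[:, None] * basis_matrix(xs), w * f(xs), rcond=None)
print("first weighted least-squares coefficients:", coef[:5])
```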
+
+
+
+
+ + ☆ Data-driven 2D stationary quantum droplets and wave propagations in the + amended GP equation with two potentials via deep neural networks learning + + +
+ In this paper, we develop a systematic deep learning approach to solve
+two-dimensional (2D) stationary quantum droplets (QDs) and investigate their
+wave propagation in the 2D amended Gross-Pitaevskii equation with
+Lee-Huang-Yang correction and two kinds of potentials. Firstly, we use the
+initial-value iterative neural network (IINN) algorithm to obtain 2D stationary
+quantum droplets of the stationary equations. Then the learned stationary QDs
+are used as the initial value conditions for physics-informed neural networks
+(PINNs) to explore their evolutions in a given space-time region. In
+particular, we consider two types of potentials, one is the 2D quadruple-well
+Gaussian potential and the other is the PT-symmetric HO-Gaussian potential,
+which lead to spontaneous symmetry breaking and the generation of
+multi-component QDs. The deep learning method used here can also be applied to
+study wave propagations of other nonlinear physical models.
+
+
+ comment: 17 pages, 12 figures (Proc. R. Soc. A, accepted for publication). + arXiv admin note: text overlap with arXiv:2409.01124 +
+
+
+
+
+ + ♻ ☆ Enhancing Graph Neural Networks with Limited Labeled Data by Actively + Distilling Knowledge from Large Language Models + + +
+ Graphs are pervasive in the real-world, such as social network analysis, +bioinformatics, and knowledge graphs. Graph neural networks (GNNs) have great +ability in node classification, a fundamental task on graphs. Unfortunately, +conventional GNNs still face challenges in scenarios with few labeled nodes, +despite the prevalence of few-shot node classification tasks in real-world +applications. To address this challenge, various approaches have been proposed, +including graph meta-learning, transfer learning, and methods based on Large +Language Models (LLMs). However, traditional meta-learning and transfer +learning methods often require prior knowledge from base classes or fail to +exploit the potential advantages of unlabeled nodes. Meanwhile, LLM-based +methods may overlook the zero-shot capabilities of LLMs and rely heavily on the +quality of generated contexts. In this paper, we propose a novel approach that +integrates LLMs and GNNs, leveraging the zero-shot inference and reasoning +capabilities of LLMs and employing a Graph-LLM-based active learning paradigm +to enhance GNNs' performance. Extensive experiments demonstrate the +effectiveness of our model in improving node classification accuracy with +considerably limited labeled data, surpassing state-of-the-art baselines by +significant margins. + +
+
+ comment: 10 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ Decentralized Intelligence Network (DIN) + + +
+ Decentralized Intelligence Network (DIN) is a theoretical framework designed +to address challenges in AI development, particularly focusing on data +fragmentation and siloing issues. It facilitates effective AI training within +sovereign data networks by overcoming barriers to accessing diverse data +sources, leveraging: 1) personal data stores to ensure data sovereignty, where +data remains securely within Participants' control; 2) a scalable federated +learning protocol implemented on a public blockchain for decentralized AI +training, where only model parameter updates are shared, keeping data within +the personal data stores; and 3) a scalable, trustless cryptographic rewards +mechanism on a public blockchain to incentivize participation and ensure fair +reward distribution through a decentralized auditing protocol. This approach +guarantees that no entity can prevent or control access to training data or +influence financial benefits, as coordination and reward distribution are +managed on the public blockchain with an immutable record. The framework +supports effective AI training by allowing Participants to maintain control +over their data, benefit financially, and contribute to a decentralized, +scalable ecosystem that leverages collective AI to develop beneficial +algorithms. + +
+
+ comment: 16 pages, 1 figure. DIN was presented by the author as a speaker at + the Summit on Responsible Decentralized Intelligence - Future of + Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the + Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC +
+
+
+
+
+ + ♻ ☆ Kolmogorov n-Widths for Multitask Physics-Informed Machine Learning + (PIML) Methods: Towards Robust Metrics + + +
+ Physics-informed machine learning (PIML) as a means of solving partial +differential equations (PDE) has garnered much attention in the Computational +Science and Engineering (CS&E) world. This topic encompasses a broad array of +methods and models aimed at solving a single or a collection of PDE problems, +called multitask learning. PIML is characterized by the incorporation of +physical laws into the training process of machine learning models in lieu of +large data when solving PDE problems. Despite the overall success of this +collection of methods, it remains incredibly difficult to analyze, benchmark, +and generally compare one approach to another. Using Kolmogorov n-widths as a +measure of effectiveness of approximating functions, we judiciously apply this +metric in the comparison of various multitask PIML architectures. We compute +lower accuracy bounds and analyze the model's learned basis functions on +various PDE problems. This is the first objective metric for comparing +multitask PIML architectures and helps remove uncertainty in model validation +from selective sampling and overfitting. We also identify avenues of +improvement for model architectures, such as the choice of activation function, +which can drastically affect model generalization to "worst-case" scenarios, +which is not observed when reporting task-specific errors. We also incorporate +this metric into the optimization process through regularization, which +improves the models' generalizability over the multitask PDE problem. + +
+
+
+
+
+ + ♻ ☆ Hybrid Decentralized Optimization: Leveraging Both First- and + Zeroth-Order Optimizers for Faster Convergence + + +
+ Distributed optimization is the standard way of speeding up machine learning +training, and most of the research in the area focuses on distributed +first-order, gradient-based methods. Yet, there are settings where some +computationally-bounded nodes may not be able to implement first-order, +gradient-based optimization, while they could still contribute to joint +optimization tasks. In this paper, we initiate the study of hybrid +decentralized optimization, studying settings where nodes with zeroth-order and +first-order optimization capabilities co-exist in a distributed system, and +attempt to jointly solve an optimization task over some data distribution. We +essentially show that, under reasonable parameter settings, such a system can +not only withstand noisier zeroth-order agents but can even benefit from +integrating such agents into the optimization process, rather than ignoring +their information. At the core of our approach is a new analysis of distributed +optimization with noisy and possibly-biased gradient estimators, which may be +of independent interest. Our results hold for both convex and non-convex +objectives. Experimental results on standard optimization tasks confirm our +analysis, showing that hybrid first-zeroth order optimization can be practical, +even when training deep neural networks. + +
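A toy sketch of the setting: some nodes return true gradients while others return two-point zeroth-order estimates, and the server simply averages whatever it receives. The estimator, step size, and synthetic least-squares task are generic illustrations, not the paper's algorithm or analysis.

```python
import numpy as np

rng = np.random.default_rng(0)

def loss(w, X, y):
    return 0.5 * np.mean((X @ w - y) ** 2)

def grad(w, X, y):
    return X.T @ (X @ w - y) / len(y)

def zo_grad(w, X, y, mu=1e-4, num_dirs=10):
    """Two-point random-direction estimator: needs only loss evaluations, no gradients."""
    g = np.zeros_like(w)
    for _ in range(num_dirs):
        u = rng.normal(size=w.shape)
        g += (loss(w + mu * u, X, y) - loss(w - mu * u, X, y)) / (2 * mu) * u
    return g / num_dirs

d, n_nodes = 5, 4
w_true = rng.normal(size=d)
datasets = []
for _ in range(n_nodes):
    X = rng.normal(size=(200, d))
    datasets.append((X, X @ w_true + 0.1 * rng.normal(size=200)))

w = np.zeros(d)
for step in range(300):
    grads = [grad(w, X, y) if i < 2 else zo_grad(w, X, y)      # nodes 0-1 first-order, 2-3 zeroth-order
             for i, (X, y) in enumerate(datasets)]
    w -= 0.1 * np.mean(grads, axis=0)                          # the server averages all contributions
print("parameter error:", np.linalg.norm(w - w_true))
```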
+
+ comment: Shayan Talaei and Matin Ansaripour contributed equally to this work +
+
+
+
+
+ + ♻ ☆ The Need for Guardrails with Large Language Models in Medical + Safety-Critical Settings: An Artificial Intelligence Application in the + Pharmacovigilance Ecosystem + + +
+ Large language models (LLMs) are useful tools with the capacity for +performing specific types of knowledge work at an effective scale. However, LLM +deployments in high-risk and safety-critical domains pose unique challenges, +notably the issue of ``hallucination,'' where LLMs can generate fabricated +information. This is particularly concerning in settings such as drug safety, +where inaccuracies could lead to patient harm. To mitigate these risks, we have +developed and demonstrated a proof of concept suite of guardrails specifically +designed to mitigate certain types of hallucinations and errors for drug +safety, and potentially applicable to other medical safety-critical contexts. +These guardrails include mechanisms to detect anomalous documents to prevent +the ingestion of inappropriate data, identify incorrect drug names or adverse +event terms, and convey uncertainty in generated content. We integrated these +guardrails with an LLM fine-tuned for a text-to-text task, which involves +converting both structured and unstructured data within adverse event reports +into natural language. This method was applied to translate individual case +safety reports, demonstrating effective application in a pharmacovigilance +processing task. Our guardrail framework offers a set of tools with broad +applicability across various domains, ensuring LLMs can be safely used in +high-risk situations by eliminating the occurrence of key errors, including the +generation of incorrect pharmacovigilance-related terms, thus adhering to +stringent regulatory and quality standards in medical safety-critical +environments. + +
+
+ comment: 27 pages, 6 figures, 4 tables and supplementary material provided +
+
+
+
+
+ + ♻ ☆ GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for + High-Throughput Omics Data Analysis and Visualization + + +
+ The surge in high-throughput omics data has reshaped the landscape of
+biological research, underlining the need for powerful, user-friendly data
+analysis and interpretation tools. This paper presents GenoCraft, a web-based
+comprehensive software solution designed to handle the entire pipeline of omics
+data processing. GenoCraft offers a unified platform featuring advanced
+bioinformatics tools, covering all aspects of omics data analysis. It
+encompasses a range of functionalities, such as normalization, quality control,
+differential analysis, network analysis, pathway analysis, and diverse
+visualization techniques. This software makes state-of-the-art omics data
+analysis more accessible to a wider range of users. With GenoCraft, researchers
+and data scientists have access to an array of cutting-edge bioinformatics
+tools under a user-friendly interface, making it a valuable resource for
+managing and analyzing large-scale omics data. The API with an interactive web
+interface is publicly available at https://genocraft.stanford.edu/. We also
+release all the code at https://github.com/futianfan/GenoCraft.
+
+
+
+
+
+ + ♻ ☆ $μ$GUIDE: a framework for quantitative imaging via generalized + uncertainty-driven inference using deep learning + + +
+ This work proposes $\mu$GUIDE: a general Bayesian framework to estimate +posterior distributions of tissue microstructure parameters from any given +biophysical model or MRI signal representation, with exemplar demonstration in +diffusion-weighted MRI. Harnessing a new deep learning architecture for +automatic signal feature selection combined with simulation-based inference and +efficient sampling of the posterior distributions, $\mu$GUIDE bypasses the high +computational and time cost of conventional Bayesian approaches and does not +rely on acquisition constraints to define model-specific summary statistics. +The obtained posterior distributions allow to highlight degeneracies present in +the model definition and quantify the uncertainty and ambiguity of the +estimated parameters. + +
+
+
+
+
+ + ♻ ☆ Partially Observable Multi-Agent Reinforcement Learning with Information + Sharing ICML 2023 + + +
+ We study provable multi-agent reinforcement learning (RL) in the general +framework of partially observable stochastic games (POSGs). To circumvent the +known hardness results and the use of computationally intractable oracles, we +advocate leveraging the potential \emph{information-sharing} among agents, a +common practice in empirical multi-agent RL, and a standard model for +multi-agent control systems with communications. We first establish several +computational complexity results to justify the necessity of +information-sharing, as well as the observability assumption that has enabled +quasi-efficient single-agent RL with partial observations, for efficiently +solving POSGs. {Inspired by the inefficiency of planning in the ground-truth +model,} we then propose to further \emph{approximate} the shared common +information to construct an {approximate model} of the POSG, in which planning +an approximate \emph{equilibrium} (in terms of solving the original POSG) can +be quasi-efficient, i.e., of quasi-polynomial-time, under the aforementioned +assumptions. Furthermore, we develop a partially observable multi-agent RL +algorithm that is \emph{both} statistically and computationally +quasi-efficient. {Finally, beyond equilibrium learning, we extend our +algorithmic framework to finding the \emph{team-optimal solution} in +cooperative POSGs, i.e., decentralized partially observable Markov decision +processes, a much more challenging goal. We establish concrete computational +and sample complexities under several common structural assumptions of the +model.} We hope our study could open up the possibilities of leveraging and +even designing different \emph{information structures}, a well-studied notion +in control theory, for developing both sample- and computation-efficient +partially observable multi-agent RL. + +
+
+ comment: Journal extension of the conference version at ICML 2023. Changed to + the more general reward function form, added new results for learning in + Dec-POMDPs, and streamlined proof outlines +
+
+
+
+
+ + ♻ ☆ Domain Decomposition-based coupling of Operator Inference reduced order + models via the Schwarz alternating method + + +
+ This paper presents and evaluates an approach for coupling together +subdomain-local reduced order models (ROMs) constructed via non-intrusive +operator inference (OpInf) with each other and with subdomain-local full order +models (FOMs), following a domain decomposition of the spatial geometry on +which a given partial differential equation (PDE) is posed. Joining +subdomain-local models is accomplished using the overlapping Schwarz +alternating method, a minimally-intrusive multiscale coupling technique that +works by transforming a monolithic problem into a sequence of subdomain-local +problems, which communicate through transmission boundary conditions imposed on +the subdomain interfaces. After formulating the overlapping Schwarz alternating +method for OpInf ROMs, termed OpInf-Schwarz, we evaluate the method's accuracy +and efficiency on several test cases involving the heat equation in two spatial +dimensions. We demonstrate that the method is capable of coupling together +arbitrary combinations of OpInf ROMs and FOMs, and that speed-ups over a +monolithic FOM are possible when performing OpInf ROM coupling. + +
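A small sketch of the overlapping Schwarz alternating loop on the 1D steady heat (Poisson) equation with two overlapping subdomains. Both subdomains use a plain finite-difference full order solve here, whereas the paper couples OpInf ROMs and FOMs, so only the transmission-condition iteration is illustrated; the grid sizes and overlap are arbitrary choices.

```python
import numpy as np

def solve_subdomain(x, f, left_bc, right_bc):
    """Finite-difference Dirichlet solve of -u'' = f on the uniform nodes x."""
    n, h = len(x), x[1] - x[0]
    A = (2.0 * np.eye(n - 2) - np.eye(n - 2, k=1) - np.eye(n - 2, k=-1)) / h**2
    rhs = f(x[1:-1]).copy()
    rhs[0] += left_bc / h**2
    rhs[-1] += right_bc / h**2
    return np.concatenate([[left_bc], np.linalg.solve(A, rhs), [right_bc]])

f = lambda x: np.pi**2 * np.sin(np.pi * x)      # exact solution of the global problem: sin(pi x)
x1 = np.linspace(0.0, 0.6, 61)                  # overlapping subdomains [0, 0.6] and [0.4, 1]
x2 = np.linspace(0.4, 1.0, 61)
u1, u2 = np.zeros_like(x1), np.zeros_like(x2)

for it in range(30):                            # Schwarz alternating iterations
    u1 = solve_subdomain(x1, f, 0.0, np.interp(0.6, x2, u2))   # transmission BC from subdomain 2
    u2 = solve_subdomain(x2, f, np.interp(0.4, x1, u1), 0.0)   # transmission BC from subdomain 1

print("error at x = 0.5:", abs(np.interp(0.5, x1, u1) - np.sin(np.pi * 0.5)))
```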
+
+
+
+
+ + ♻ ☆ Simple and Scalable Strategies to Continually Pre-train Large Language + Models + + +
+ Large language models (LLMs) are routinely pre-trained on billions of tokens, +only to start the process over again once new data becomes available. A much +more efficient solution is to continually pre-train these models, saving +significant compute compared to re-training. However, the distribution shift +induced by new data typically results in degraded performance on previous data +or poor adaptation to the new data. In this work, we show that a simple and +scalable combination of learning rate (LR) re-warming, LR re-decaying, and +replay of previous data is sufficient to match the performance of fully +re-training from scratch on all available data, as measured by the final loss +and the average score on several language model (LM) evaluation benchmarks. +Specifically, we show this for a weak but realistic distribution shift between +two commonly used LLM pre-training datasets (English$\rightarrow$English) and a +stronger distribution shift (English$\rightarrow$German) at the $405$M +parameter model scale with large dataset sizes (hundreds of billions of +tokens). Selecting the weak but realistic shift for larger-scale experiments, +we also find that our continual learning strategies match the re-training +baseline for a 10B parameter LLM. Our results demonstrate that LLMs can be +successfully updated via simple and scalable continual learning strategies, +matching the re-training baseline using only a fraction of the compute. +Finally, inspired by previous work, we propose alternatives to the cosine +learning rate schedule that help circumvent forgetting induced by LR re-warming +and that are not bound to a fixed token budget. + +
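A small sketch of the schedule ingredient: when a new dataset arrives, the learning rate is re-warmed from a low value and re-decayed (cosine here) instead of continuing from the tail of the previous schedule. The constants are placeholders, and replay of previous data is not shown.

```python
import math

def continual_lr(step, steps_per_stage, warmup_steps=2000, max_lr=3e-4, min_lr=3e-5):
    """LR for continual pre-training: re-warm then cosine re-decay inside each data stage."""
    stage_step = step % steps_per_stage               # position within the current dataset's stage
    if stage_step < warmup_steps:                     # linear re-warming from min_lr up to max_lr
        return min_lr + (max_lr - min_lr) * stage_step / warmup_steps
    progress = (stage_step - warmup_steps) / max(1, steps_per_stage - warmup_steps)
    return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

# Usage each optimizer step:
#   for group in optimizer.param_groups:
#       group["lr"] = continual_lr(global_step, steps_per_stage)
```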
+
+
+
+
+ + ♻ ☆ Convolutional L2LFlows: Generating Accurate Showers in Highly Granular + Calorimeters Using Convolutional Normalizing Flows + + +
+ In the quest to build generative surrogate models as computationally +efficient alternatives to rule-based simulations, the quality of the generated +samples remains a crucial frontier. So far, normalizing flows have been among +the models with the best fidelity. However, as the latent space in such models +is required to have the same dimensionality as the data space, scaling up +normalizing flows to high dimensional datasets is not straightforward. The +prior L2LFlows approach successfully used a series of separate normalizing +flows and sequence of conditioning steps to circumvent this problem. In this +work, we extend L2LFlows to simulate showers with a 9-times larger profile in +the lateral direction. To achieve this, we introduce convolutional layers and +U-Net-type connections, move from masked autoregressive flows to coupling +layers, and demonstrate the successful modelling of showers in the ILD +Electromagnetic Calorimeter as well as Dataset 3 from the public CaloChallenge +dataset. + +
+
+
+
+
+ + ♻ ☆ Multi-Agent Reinforcement Learning from Human Feedback: Data Coverage + and Algorithmic Techniques + + +
+ We initiate the study of Multi-Agent Reinforcement Learning from Human +Feedback (MARLHF), exploring both theoretical foundations and empirical +validations. We define the task as identifying Nash equilibrium from a +preference-only offline dataset in general-sum games, a problem marked by the +challenge of sparse feedback signals. Our theory establishes the upper +complexity bounds for Nash Equilibrium in effective MARLHF, demonstrating that +single-policy coverage is inadequate and highlighting the importance of +unilateral dataset coverage. These theoretical insights are verified through +comprehensive experiments. To enhance the practical performance, we further +introduce two algorithmic techniques. (1) We propose a Mean Squared Error (MSE) +regularization along the time axis to achieve a more uniform reward +distribution and improve reward learning outcomes. (2) We utilize imitation +learning to approximate the reference policy, ensuring stability and +effectiveness in training. Our findings underscore the multifaceted approach +required for MARLHF, paving the way for effective preference-based multi-agent +systems. + +
+
+
+
+
+ + ♻ ☆ Revisiting Character-level Adversarial Attacks for Language Models ICML 2024 + + +
+ Adversarial attacks in Natural Language Processing apply perturbations at the
+character or token level. Token-level attacks, gaining prominence for their
+use of gradient-based methods, are susceptible to altering sentence semantics,
+leading to invalid adversarial examples. While character-level attacks easily
+maintain semantics, they have received less attention as they cannot easily
+adopt popular gradient-based methods, and are thought to be easy to defend
+against. Challenging these beliefs, we introduce Charmer, an efficient
+query-based adversarial attack capable of achieving high attack success rate
+(ASR) while generating highly similar adversarial examples. Our method
+successfully targets both small (BERT) and large (Llama 2) models.
+Specifically, on BERT with SST-2, Charmer improves the ASR by 4.84 percentage
+points and the USE similarity by 8 percentage points with respect to the prior
+art. Our implementation is available at
+https://github.com/LIONS-EPFL/Charmer.
+
+
+ comment: Accepted in ICML 2024 +
+
+
+
+
+ + ♻ ☆ Privacy-aware Berrut Approximated Coded Computing for Federated Learning + + +
+ Federated Learning (FL) is an interesting strategy that enables the
+collaborative training of an AI model among different data owners without
+revealing their private datasets. Even so, FL has some privacy vulnerabilities
+that researchers have tried to overcome by applying techniques like
+Differential Privacy (DP), Homomorphic Encryption (HE), or Secure Multi-Party
+Computation (SMPC). However, these techniques have some important drawbacks
+that might narrow their range of application: difficulty working with
+non-linear functions, difficulty performing large matrix multiplications, and
+high communication and computational costs for managing semi-honest nodes. In
+this context, we propose a solution to guarantee privacy in FL schemes that
+simultaneously solves the previously mentioned problems. Our proposal is based
+on the Berrut Approximated Coded Computing, a technique from the Coded
+Distributed Computing paradigm, adapted to a Secret Sharing configuration, to
+provide input privacy to FL in a scalable way. It can be applied for computing
+non-linear functions and treats the special case of distributed matrix
+multiplication, a key primitive at the core of many automated learning tasks.
+Because of these characteristics, it could be applied in a wide range of FL
+scenarios, since it is independent of the machine learning models or
+aggregation algorithms used in the FL scheme. We provide analysis of the
+achieved privacy and complexity of our solution and, based on the extensive
+numerical results performed, a good trade-off between privacy and precision
+can be observed.
+
+
+
+
+
+ + ♻ ☆ A Systematic Bias of Machine Learning Regression Models and Its + Correction: an Application to Imaging-based Brain Age Prediction + + +
+ Machine learning models for continuous outcomes often yield systematically +biased predictions, particularly for values that largely deviate from the mean. +Specifically, predictions for large-valued outcomes tend to be negatively +biased (underestimating actual values), while those for small-valued outcomes +are positively biased (overestimating actual values). We refer to this linear +central tendency warped bias as the "systematic bias of machine learning +regression". In this paper, we first demonstrate that this systematic +prediction bias persists across various machine learning regression models, and +then delve into its theoretical underpinnings. To address this issue, we +propose a general constrained optimization approach designed to correct this +bias and develop computationally efficient implementation algorithms. +Simulation results indicate that our correction method effectively eliminates +the bias from the predicted outcomes. We apply the proposed approach to the +prediction of brain age using neuroimaging data. In comparison to competing +machine learning regression models, our method effectively addresses the +longstanding issue of "systematic bias of machine learning regression" in +neuroimaging-based brain age calculation, yielding unbiased predictions of +brain age. + +
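A tiny simulation makes the effect visible: regressing a standard model's predictions on the true outcomes yields a slope below one, i.e., large outcomes are underestimated and small ones overestimated. The model and data below are arbitrary, and the paper's constrained-optimization correction is not reproduced.

```python
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 10))
y = 3.0 * X[:, 0] + rng.normal(scale=2.0, size=2000)       # continuous outcome with noise

model = GradientBoostingRegressor(random_state=0).fit(X[:1000], y[:1000])
pred, true = model.predict(X[1000:]), y[1000:]

slope = np.polyfit(true, pred, 1)[0]                       # < 1 reveals the central-tendency warping
print("slope of predictions on true outcomes:", round(slope, 3))

hi, lo = true > np.quantile(true, 0.9), true < np.quantile(true, 0.1)
print("mean bias on large outcomes:", round((pred[hi] - true[hi]).mean(), 3))   # typically negative
print("mean bias on small outcomes:", round((pred[lo] - true[lo]).mean(), 3))   # typically positive
```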
+
+
+
+
+ + ♻ ☆ Pre-processing and Compression: Understanding Hidden Representation + Refinement Across Imaging Domains via Intrinsic Dimension + + +
+ In recent years, there has been interest in how geometric properties such as +intrinsic dimension (ID) of a neural network's hidden representations change +through its layers, and how such properties are predictive of important model +behavior such as generalization ability. However, evidence has begun to emerge +that such behavior can change significantly depending on the domain of the +network's training data, such as natural versus medical images. Here, we +further this inquiry by exploring how the ID of a network's learned +representations changes through its layers, in essence, characterizing how the +network successively refines the information content of input data to be used +for predictions. Analyzing eleven natural and medical image datasets across six +network architectures, we find that how ID changes through the network differs +noticeably between natural and medical image models. Specifically, medical +image models peak in representation ID earlier in the network, implying a +difference in the image features and their abstractness that are typically used +for downstream tasks in these domains. Additionally, we discover a strong +correlation of this peak representation ID with the ID of the data in its input +space, implying that the intrinsic information content of a model's learned +representations is guided by that of the data it was trained on. Overall, our +findings emphasize notable discrepancies in network behavior between natural +and non-natural imaging domains regarding hidden representation information +content, and provide further insights into how a network's learned features are +shaped by its training data. + +
+
+
+
+
+ + ♻ ☆ Energy-Efficient Channel Decoding for Wireless Federated Learning: + Convergence Analysis and Adaptive Design + + +
+ One of the most critical challenges for deploying distributed learning +solutions, such as federated learning (FL), in wireless networks is the limited +battery capacity of mobile clients. While it is a common belief that the major +energy consumption of mobile clients comes from the uplink data transmission, +this paper presents a novel finding, namely channel decoding also contributes +significantly to the overall energy consumption of mobile clients in FL. +Motivated by this new observation, we propose an energy-efficient adaptive +channel decoding scheme that leverages the intrinsic robustness of FL to model +errors. In particular, the robustness is exploited to reduce the energy +consumption of channel decoders at mobile clients by adaptively adjusting the +number of decoding iterations. We theoretically prove that wireless FL with +communication errors can converge at the same rate as the case with error-free +communication provided the bit error rate (BER) is properly constrained. An +adaptive channel decoding scheme is then proposed to improve the energy +efficiency of wireless FL systems. Experimental results demonstrate that the +proposed method maintains the same learning accuracy while reducing the channel +decoding energy consumption by ~20% when compared to an existing approach. + +
+
+ comment: This paper has been accepted by the IEEE TWC. Copyright may be + transferred without notice, after which this version may no longer be + accessible +
+
+
+
+
+ + ♻ ☆ Negation Blindness in Large Language Models: Unveiling the NO Syndrome + in Image Generation + + +
+ Foundational Large Language Models (LLMs) have changed the way we perceive
+technology. They have been shown to excel in tasks ranging from poem writing
+and coding to essay generation and puzzle solving. With the incorporation of
+image generation capability, they have become more comprehensive and versatile
+AI tools. At the same time, researchers are striving to identify the
+limitations of these tools to improve them further. Currently identified flaws
+include hallucination, biases, and bypassing restricted commands to generate
+harmful content. In the present work, we have identified a fundamental
+limitation related to the image generation ability of LLMs, and termed it The
+NO Syndrome. This negation blindness refers to LLMs' inability to correctly
+comprehend NO-related natural language prompts to generate the desired images.
+Interestingly, all tested LLMs including GPT-4, Gemini, and Copilot were found
+to be suffering from this syndrome. To demonstrate the generalization of this
+limitation, we carried out simulation experiments and conducted entropy-based
+and benchmark statistical analysis tests on various LLMs in multiple languages,
+including English, Hindi, and French. We conclude that the NO syndrome is a
+significant flaw in current LLMs that needs to be addressed. A related finding
+of this study showed a consistent discrepancy between image and textual
+responses as a result of this NO syndrome. We posit that the introduction of a
+negation context-aware reinforcement learning based feedback loop between the
+LLM's textual response and generated image could help ensure the generated
+text is based on both the LLM's correct contextual understanding of the
+negation query and the generated visual output.
+
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Different Victims, Same Layout: Email Visual Similarity Detection for + Enhanced Email Protection CCS 2024 + + +
+ In the pursuit of an effective spam detection system, the focus has often +been on identifying known spam patterns either through rule-based detection +systems or machine learning (ML) solutions that rely on keywords. However, both +systems are susceptible to evasion techniques and zero-day attacks that can be +achieved at low cost. Therefore, an email that bypassed the defense system once +can do it again in the following days, even though rules are updated or the ML +models are retrained. The recurrence of failures to detect emails that exhibit +layout similarities to previously undetected spam is concerning for customers +and can erode their trust in a company. Our observations show that threat +actors reuse email kits extensively and can bypass detection with little +effort, for example, by making changes to the content of emails. In this work, +we propose an email visual similarity detection approach, named Pisco, to +improve the detection capabilities of an email threat defense system. We apply +our proof of concept to some real-world samples received from different +sources. Our results show that email kits are being reused extensively and +visually similar emails are sent to our customers at various time intervals. +Therefore, this method could be very helpful in situations where detection +engines that rely on textual features and keywords are bypassed, an occurrence +our observations show happens frequently. + +
+
+ comment: To be published in the proceedings of the ACM Conference on Computer + and Communications Security (ACM CCS 2024) +
+
+
+
+
+ + ♻ ☆ Fast and interpretable Support Vector Classification based on the + truncated ANOVA decomposition + + +
+ Support Vector Machines (SVMs) are an important tool for performing +classification on scattered data, where one usually has to deal with many data +points in high-dimensional spaces. We propose solving SVMs in primal form using +feature maps based on trigonometric functions or wavelets. In small dimensional +settings the Fast Fourier Transform (FFT) and related methods are a powerful +tool in order to deal with the considered basis functions. For growing +dimensions the classical FFT-based methods become inefficient due to the curse +of dimensionality. Therefore, we restrict ourselves to multivariate basis +functions, each of which only depends on a small number of dimensions. This is +motivated by the well-known sparsity of effects and recent results regarding +the reconstruction of functions from scattered data in terms of truncated +analysis of variance (ANOVA) decompositions, which makes the resulting model +even interpretable in terms of importance of the features as well as their +couplings. The usage of small superposition dimensions has the consequence that +the computational effort no longer grows exponentially but only polynomially +with respect to the dimension. In order to enforce sparsity regarding the basis +coefficients, we use the frequently applied $\ell_2$-norm and, in addition, +$\ell_1$-norm regularization. The found classifying function, which is the +linear combination of basis functions, and its variance can then be analyzed in +terms of the classical ANOVA decomposition of functions. Based on numerical +examples we show that we are able to recover the signum of a function that +perfectly fits our model assumptions. Furthermore, we perform classification on +different artificial and real-world data sets. We obtain better results with +$\ell_1$-norm regularization, both in terms of accuracy and clarity of +interpretability. + +
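A rough sketch of the modeling idea, using an explicit trigonometric feature map on low-dimensional coordinate subsets and an $\ell_1$-regularized linear SVM from scikit-learn rather than the FFT-based machinery of the paper: only basis functions depending on a few coordinates are generated, and the sparse coefficients indicate which features and couplings matter. The feature construction and data are illustrative assumptions.

```python
import numpy as np
from itertools import combinations
from sklearn.svm import LinearSVC

def anova_trig_features(X, order=2, freqs=(1, 2)):
    """Trigonometric features on every coordinate subset of size <= order (superposition dimension)."""
    feats = [np.ones((len(X), 1))]
    for k in range(1, order + 1):
        for subset in combinations(range(X.shape[1]), k):
            for f in freqs:
                arg = 2.0 * np.pi * f * X[:, subset].sum(axis=1)
                feats += [np.cos(arg)[:, None], np.sin(arg)[:, None]]
    return np.hstack(feats)

rng = np.random.default_rng(0)
X = rng.uniform(size=(500, 6))
y = np.sign(np.sin(2 * np.pi * X[:, 0]) + 0.5 * np.cos(2 * np.pi * (X[:, 1] + X[:, 2])))

Phi = anova_trig_features(X)
clf = LinearSVC(penalty="l1", loss="squared_hinge", dual=False, C=1.0, max_iter=5000).fit(Phi, y)
print("non-zero coefficients:", int((np.abs(clf.coef_) > 1e-6).sum()), "of", Phi.shape[1])
```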
+
+
+
+
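A rough sketch of the general recipe above (a primal linear classifier on trigonometric features restricted to single coordinates and pairs, with $\ell_1$ regularization), using scikit-learn; the particular frequencies, coupling terms and synthetic data are illustrative assumptions, not the paper's exact basis.

# Illustrative sketch: cosine/sine features per coordinate and per pair of
# coordinates (superposition dimension <= 2), then an l1-regularized linear
# classifier trained in the primal. Frequencies and features are placeholders.
from itertools import combinations
import numpy as np
from sklearn.linear_model import SGDClassifier

def trig_features(X, n_freq=2):
    cols = []
    d = X.shape[1]
    for j in range(d):                      # one-dimensional terms
        for k in range(1, n_freq + 1):
            cols += [np.cos(2 * np.pi * k * X[:, j]),
                     np.sin(2 * np.pi * k * X[:, j])]
    for i, j in combinations(range(d), 2):  # pairwise coupling terms
        cols += [np.cos(2 * np.pi * (X[:, i] + X[:, j])),
                 np.sin(2 * np.pi * (X[:, i] + X[:, j]))]
    return np.column_stack(cols)

rng = np.random.default_rng(0)
X = rng.random((500, 5))                    # scattered data in [0, 1]^5
y = np.where(np.cos(2 * np.pi * X[:, 0]) + 0.5 * X[:, 1] > 0.5, 1, -1)

clf = SGDClassifier(loss="hinge", penalty="l1", alpha=1e-4, max_iter=2000)
clf.fit(trig_features(X), y)
print("training accuracy:", clf.score(trig_features(X), y))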
+ + ♻ ☆ The future of cosmological likelihood-based inference: accelerated + high-dimensional parameter estimation and model comparison + + +
+ We advocate for a new paradigm of cosmological likelihood-based inference, +leveraging recent developments in machine learning and its underlying +technology, to accelerate Bayesian inference in high-dimensional settings. +Specifically, we combine (i) emulation, where a machine learning model is +trained to mimic cosmological observables, e.g. CosmoPower-JAX; (ii) +differentiable and probabilistic programming, e.g. JAX and NumPyro, +respectively; (iii) scalable Markov chain Monte Carlo (MCMC) sampling +techniques that exploit gradients, e.g. Hamiltonian Monte Carlo; and (iv) +decoupled and scalable Bayesian model selection techniques that compute the +Bayesian evidence purely from posterior samples, e.g. the learned harmonic mean +implemented in harmonic. This paradigm allows us to carry out a complete +Bayesian analysis, including both parameter estimation and model selection, in +a fraction of the time of traditional approaches. First, we demonstrate the +application of this paradigm on a simulated cosmic shear analysis for a Stage +IV survey in 37- and 39-dimensional parameter spaces, comparing $\Lambda$CDM +and a dynamical dark energy model ($w_0w_a$CDM). We recover posterior contours +and evidence estimates that are in excellent agreement with those computed by +the traditional nested sampling approach while reducing the computational cost +from 8 months on 48 CPU cores to 2 days on 12 GPUs. Second, we consider a joint +analysis between three simulated next-generation surveys, each performing a +3x2pt analysis, resulting in 157- and 159-dimensional parameter spaces. +Standard nested sampling techniques are simply unlikely to be feasible in this +high-dimensional setting, requiring a projected 12 years of compute time on 48 +CPU cores; on the other hand, the proposed approach only requires 8 days of +compute time on 24 GPUs. All packages used in our analyses are publicly +available. + +
+
+ comment: 14 pages, 6 figures. Accepted for publication in the Open Journal of + Astrophysics. Codes available at + https://github.com/alessiospuriomancini/cosmopower, + https://github.com/dpiras/cosmopower-jax, + https://github.com/astro-informatics/harmonic/ +
+
+
+
+
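The gradient-based sampling ingredients, items (ii)-(iii) above, can be illustrated with a toy NumPyro model; the Gaussian "theory" below merely stands in for an emulator such as CosmoPower-JAX, and the evidence estimation with harmonic is only indicated in a comment, so none of the numbers are cosmologically meaningful.

# Toy illustration of differentiable probabilistic programming plus
# gradient-based MCMC (NUTS). The simple Gaussian "prediction" is a stand-in
# for a trained emulator; it is not a real cosmological likelihood.
import jax
import jax.numpy as jnp
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS

def model(data):
    omega_m = numpyro.sample("omega_m", dist.Uniform(0.1, 0.5))
    sigma_8 = numpyro.sample("sigma_8", dist.Uniform(0.6, 1.0))
    prediction = omega_m * sigma_8 * jnp.ones_like(data)   # emulator stand-in
    numpyro.sample("obs", dist.Normal(prediction, 0.05), obs=data)

data = 0.25 * jnp.ones(10)
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=1000)
mcmc.run(jax.random.PRNGKey(0), data)
samples = mcmc.get_samples()
# The Bayesian evidence could then be estimated purely from these posterior
# samples with the learned harmonic mean (the harmonic package), as above.
print(float(jnp.mean(samples["omega_m"])), float(jnp.mean(samples["sigma_8"])))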
+ + ♻ ☆ A possible late-time transition of $M_B$ inferred via neural networks + + +
+ The strengthening of tensions in the cosmological parameters has led to a +reconsideration of fundamental aspects of standard cosmology. The tension in +the Hubble constant can also be viewed as a tension between local and early +Universe constraints on the absolute magnitude $M_B$ of Type Ia supernova. In +this work, we reconsider the possibility of a variation of this parameter in a +model-independent way. We employ neural networks to agnostically constrain the +value of the absolute magnitude as well as assess the impact and statistical +significance of a variation in $M_B$ with redshift from the Pantheon+ +compilation, together with a thorough analysis of the neural network +architecture. We find an indication for a possible transition redshift at the +$z\approx 1$ region. + +
+
+ comment: 13 pages, 9 sets of figures, 2 tables. To appear in JCAP +
+
+
+
+
+ + ♻ ☆ Variational Mode Decomposition and Linear Embeddings are What You Need + For Time-Series Forecasting + + +
+ Time-series forecasting often faces challenges due to data volatility, which +can lead to inaccurate predictions. Variational Mode Decomposition (VMD) has +emerged as a promising technique to mitigate volatility by decomposing data +into distinct modes, thereby enhancing forecast accuracy. In this study, we +integrate VMD with linear models to develop a robust forecasting framework. Our +approach is evaluated on 13 diverse datasets, including ETTm2, WindTurbine, M4, +and 10 air quality datasets from various Southeast Asian cities. The +effectiveness of the VMD strategy is assessed by comparing Root Mean Squared +Error (RMSE) values from models utilizing VMD against those without it. +Additionally, we benchmark linear-based models against well-known neural +network architectures such as LSTM, Bidirectional LSTM, and RNN. The results +demonstrate a significant reduction in RMSE across nearly all models following +VMD application. Notably, the Linear + VMD model achieved the lowest average +RMSE in univariate forecasting at 0.619. In multivariate forecasting, the +DLinear + VMD model consistently outperformed others, attaining the lowest RMSE +across all datasets with an average of 0.019. These findings underscore the +effectiveness of combining VMD with linear models for superior time-series +forecasting. + +
+
+ comment: For associated repository, see + https://github.com/Espalemit/VMD-With-LTSF-Linear.git +
+
+
+
+
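A sketch of the decompose-then-forecast structure described above (one linear model per mode, per-mode forecasts summed); the crude FFT band split below is only a runnable stand-in for a real VMD routine and is not the method evaluated in the paper.

# Decompose the series into modes, fit a linear autoregressive model per mode,
# and sum the one-step-ahead forecasts. band_split is a crude stand-in for VMD.
import numpy as np
from sklearn.linear_model import LinearRegression

def band_split(signal, n_modes):
    """Stand-in for VMD: partition the FFT spectrum into contiguous bands."""
    spectrum = np.fft.rfft(signal)
    modes = []
    for band in np.array_split(np.arange(spectrum.size), n_modes):
        masked = np.zeros_like(spectrum)
        masked[band] = spectrum[band]
        modes.append(np.fft.irfft(masked, n=signal.size))
    return modes

def make_lags(series, n_lags):
    X = np.stack([series[i:len(series) - n_lags + i] for i in range(n_lags)], axis=1)
    return X, series[n_lags:]

def one_step_forecast(signal, n_modes=4, n_lags=24):
    forecast = 0.0
    for mode in band_split(signal, n_modes):          # decompose
        X, y = make_lags(mode, n_lags)
        forecast += LinearRegression().fit(X, y).predict(mode[-n_lags:].reshape(1, -1))[0]
    return forecast                                   # recombine per-mode forecasts

t = np.arange(500)
series = np.sin(2 * np.pi * t / 48) + 0.1 * np.random.default_rng(0).normal(size=t.size)
print("next-step forecast:", one_step_forecast(series))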
+ + ♻ ☆ GT-CausIn: a novel causal-based insight for traffic prediction + + +
+ Traffic forecasting is an important application of spatiotemporal series
+prediction. Among different methods, graph neural networks have so far achieved
+the most promising results, so learning the relations between graph nodes
+becomes a crucial task. However, the room for improvement is very limited when
+these relations are learned in a node-to-node manner. The challenge stems from
+(1) obscure temporal dependencies between different stations, (2) difficulties
+in defining variables beyond the node level, and (3) no ready-made method to
+validate the learned relations. To confront these challenges, we define
+legitimate traffic causal variables to discover the causal relations inside the
+traffic network, which are carefully checked with statistical tools and case
+analysis. We then present a novel model named Graph Spatial-Temporal Network
+Based on Causal Insight (GT-CausIn), where prior learned causal information is
+integrated with graph diffusion layers and temporal convolutional network (TCN)
+layers. Experiments are carried out on two real-world traffic datasets:
+PEMS-BAY and METR-LA, which show that GT-CausIn significantly outperforms
+state-of-the-art models on mid-term and long-term prediction.
+
+
+
+
+
+ + ♻ ☆ When Does Visual Prompting Outperform Linear Probing for Vision-Language + Models? A Likelihood Perspective + + +
+ Adapting pre-trained models to new tasks can exhibit varying effectiveness +across datasets. Visual prompting, a state-of-the-art parameter-efficient +transfer learning method, can significantly improve the performance of +out-of-distribution tasks. On the other hand, linear probing, a standard +transfer learning method, can sometimes become the best approach. We propose a +log-likelihood ratio (LLR) approach to analyze the comparative benefits of +visual prompting and linear probing. By employing the LLR score alongside +resource-efficient visual prompts approximations, our cost-effective measure +attains up to a 100-fold reduction in run time compared to full training, while +achieving prediction accuracies up to 91%. The source code is available at +https://github.com/IBM/VP-LLR. + +
+
+
+
+
+ + ♻ ☆ Pseudo Replay-based Class Continual Learning for Online New Category + Anomaly Detection in Additive Manufacturing + + +
+ The incorporation of advanced sensors and machine learning techniques has
+enabled modern manufacturing enterprises to perform data-driven
+classification-based anomaly detection based on the sensor data collected in
+manufacturing processes. However, one critical challenge is that new defect
+categories may emerge as the manufacturing process continues, degrading the
+monitoring performance of previously trained machine learning models. Hence,
+there is an increasing need to empower machine learning models to learn
+continually. Among all continual learning methods, memory-based continual
+learning has the best performance but faces constraints on data storage
+capacity. To address this issue, this paper develops a novel pseudo
+replay-based continual learning framework by integrating class-incremental
+learning and oversampling-based data generation. Without storing all the data,
+the developed framework can generate high-quality data representing previous
+classes to train the machine learning model incrementally when a new anomaly
+category occurs. In addition, it can even enhance the monitoring performance,
+since it also effectively improves the data quality. The effectiveness of the
+proposed framework is validated in three case studies that formulate anomaly
+detection as a supervised classification problem. The experimental results show
+that the developed method is very promising in detecting novel anomalies while
+maintaining good performance on previous tasks, and it brings more flexibility
+to the model architecture.
+
+
+
+
+
+ + ♻ ☆ Navigating the Maize: Cyclic and conditional computational graphs for + molecular simulation + + +
+ Many computational chemistry and molecular simulation workflows can be +expressed as graphs. This abstraction is useful to modularize and potentially +reuse existing components, as well as provide parallelization and ease +reproducibility. Existing tools represent the computation as a directed acyclic +graph (DAG), thus allowing efficient execution by parallelization of concurrent +branches. These systems can, however, generally not express cyclic and +conditional workflows. We therefore developed Maize, a workflow manager for +cyclic and conditional graphs based on the principles of flow-based +programming. By running each node of the graph concurrently in separate +processes and allowing communication at any time through dedicated inter-node +channels, arbitrary graph structures can be executed. We demonstrate the +effectiveness of the tool on a dynamic active learning task in computational +drug design, involving the use of a small molecule generative model and an +associated scoring system, and on a reactivity prediction pipeline using +quantum-chemistry and semiempirical approaches. + +
+
+
+
+
+ + ♻ ☆ DNN-GDITD: Out-of-distribution detection via Deep Neural Network based + Gaussian Descriptor for Imbalanced Tabular Data + + +
+ Classification tasks present challenges due to class imbalances and evolving +data distributions. Addressing these issues requires a robust method to handle +imbalances while effectively detecting out-of-distribution (OOD) samples not +encountered during training. This study introduces a novel OOD detection +algorithm designed for tabular datasets, titled Deep Neural Network-based +Gaussian Descriptor for Imbalanced Tabular Data (DNN-GDITD). The DNN-GDITD +algorithm can be placed on top of any DNN to facilitate better classification +of imbalanced data and OOD detection using spherical decision boundaries. Using +a combination of Push, Score-based, and focal losses, DNN-GDITD assigns +confidence scores to test data points, categorizing them as known classes or as +an OOD sample. Extensive experimentation on tabular datasets demonstrates the +effectiveness of DNN-GDITD compared to three OOD algorithms. Evaluation +encompasses imbalanced and balanced scenarios on diverse tabular datasets, +including a synthetic financial dispute dataset and publicly available tabular +datasets like Gas Sensor, Drive Diagnosis, and MNIST, showcasing DNN-GDITD's +versatility. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN + for Precise Facial Expression Intensity Estimation + + +
+ This paper presents MMA-MRNNet, a novel deep learning architecture for +dynamic multi-output Facial Expression Intensity Estimation (FEIE) from video +data. Traditional approaches to this task often rely on complex 3-D CNNs, which +require extensive pre-training and assume that facial expressions are uniformly +distributed across all frames of a video. These methods struggle to handle +videos of varying lengths, often resorting to ad-hoc strategies that either +discard valuable information or introduce bias. MMA-MRNNet addresses these +challenges through a two-stage process. First, the Multiple Models of Affect +(MMA) extractor component is a Multi-Task Learning CNN that concurrently +estimates valence-arousal, recognizes basic facial expressions, and detects +action units in each frame. These representations are then processed by a +Masked RNN component, which captures temporal dependencies and dynamically +updates weights according to the true length of the input video, ensuring that +only the most relevant features are used for the final prediction. The proposed +unimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction +dataset and demonstrated significantly superior performance, surpassing +state-of-the-art methods by a wide margin, regardless of whether they were +unimodal, multimodal, or ensemble approaches. Finally, we demonstrated the +effectiveness of the MMA component of our proposed method across multiple +in-the-wild datasets, where it consistently outperformed all state-of-the-art +methods across various metrics. + +
+
+
+
+
+ + ♻ ☆ What Formal Languages Can Transformers Express? A Survey + + +
+ As transformers have gained prominence in natural language processing, some +researchers have investigated theoretically what problems they can and cannot +solve, by treating problems as formal languages. Exploring such questions can +help clarify the power of transformers relative to other models of computation, +their fundamental capabilities and limits, and the impact of architectural +choices. Work in this subarea has made considerable progress in recent years. +Here, we undertake a comprehensive survey of this work, documenting the diverse +assumptions that underlie different results and providing a unified framework +for harmonizing seemingly contradictory findings. + +
+
+ comment: One minor correction in {\S}5.1 +
+
+
+
+
+ + ♻ ☆ Decision-Focused Learning: Foundations, State of the Art, Benchmark and + Future Opportunities + + +
+ Decision-focused learning (DFL) is an emerging paradigm that integrates +machine learning (ML) and constrained optimization to enhance decision quality +by training ML models in an end-to-end system. This approach shows significant +potential to revolutionize combinatorial decision-making in real-world +applications that operate under uncertainty, where estimating unknown +parameters within decision models is a major challenge. This paper presents a +comprehensive review of DFL, providing an in-depth analysis of both +gradient-based and gradient-free techniques used to combine ML and constrained +optimization. It evaluates the strengths and limitations of these techniques +and includes an extensive empirical evaluation of eleven methods across seven +problems. The survey also offers insights into recent advancements and future +research directions in DFL. + Code and benchmark: https://github.com/PredOpt/predopt-benchmarks + +
+
+ comment: Experimental Survey and Benchmarking +
+
+
+
+
+ + ♻ ☆ Can Vehicle Motion Planning Generalize to Realistic Long-tail Scenarios? + + +
+ Real-world autonomous driving systems must make safe decisions in the face of +rare and diverse traffic scenarios. Current state-of-the-art planners are +mostly evaluated on real-world datasets like nuScenes (open-loop) or nuPlan +(closed-loop). In particular, nuPlan seems to be an expressive evaluation +method since it is based on real-world data and closed-loop, yet it mostly +covers basic driving scenarios. This makes it difficult to judge a planner's +capabilities to generalize to rarely-seen situations. Therefore, we propose a +novel closed-loop benchmark interPlan containing several edge cases and +challenging driving scenarios. We assess existing state-of-the-art planners on +our benchmark and show that neither rule-based nor learning-based planners can +safely navigate the interPlan scenarios. A recently evolving direction is the +usage of foundation models like large language models (LLM) to handle +generalization. We evaluate an LLM-only planner and introduce a novel hybrid +planner that combines an LLM-based behavior planner with a rule-based motion +planner that achieves state-of-the-art performance on our benchmark. + +
+
+
+
+
+ + ♻ ☆ In the Search for Optimal Multi-view Learning Models for Crop + Classification with Global Remote Sensing Data + + +
+ Studying and analyzing cropland is a difficult task due to its dynamic and +heterogeneous growth behavior. Usually, diverse data sources can be collected +for its estimation. Although deep learning models have proven to excel in the +crop classification task, they face substantial challenges when dealing with +multiple inputs, named Multi-View Learning (MVL). The methods used in the MVL +scenario can be structured based on the encoder architecture, the fusion +strategy, and the optimization technique. The literature has primarily focused +on using specific encoder architectures for local regions, lacking a deeper +exploration of other components in the MVL methodology. In contrast, we +investigate the simultaneous selection of the fusion strategy and encoder +architecture, assessing global-scale cropland and crop-type classifications. We +use a range of five fusion strategies (Input, Feature, Decision, Ensemble, +Hybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible +configurations in the MVL method. We use the CropHarvest dataset for +validation, which provides optical, radar, weather time series, and topographic +information as input data. We found that in scenarios with a limited number of +labeled samples, a unique configuration is insufficient for all the cases. +Instead, a specialized combination should be meticulously sought, including an +encoder and fusion strategy. To streamline this search process, we suggest +identifying the optimal encoder architecture tailored for a particular fusion +strategy, and then determining the most suitable fusion strategy for the +classification task. We provide a methodological framework for researchers +exploring crop classification through an MVL methodology. + +
+
+ comment: submitted to journal +
+
+
+
+
+ + ♻ ☆ Increasing the Robustness of Model Predictions to Missing Sensors in + Earth Observation ACL + + +
+ Multi-sensor ML models for Earth observation (EO) aim to enhance prediction
+accuracy by integrating data from various sources. However, the presence of
+missing data poses a significant challenge, particularly for non-persistent
+sensors that can be affected by external factors. Existing literature has
+explored strategies like temporal dropout and sensor-invariant models to
+address the issue of generalizing to missing data. Inspired by these works, we
+study two novel methods tailored for multi-sensor scenarios, namely Input
+Sensor Dropout (ISensD) and Ensemble Sensor Invariant (ESensI). Through
+experimentation on three multi-sensor temporal EO datasets, we demonstrate that
+these methods effectively increase the robustness of model predictions to
+missing sensors. In particular, we focus on how the predictive performance of
+models drops when sensors are missing at different levels. We observe that
+ensemble multi-sensor models are the most robust to the lack of sensors. In
+addition, the sensor dropout component in ISensD shows promising robustness
+results.
+
+
+ comment: Accepted at the MACLEAN workshop in the ECML/PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Scalable Glacier Mapping using Deep Learning and Open Earth Observation + Data Matches the Accuracy of Manual Delineation + + +
+ Accurate global glacier mapping is critical for understanding climate change +impacts. Despite its importance, automated glacier mapping at a global scale +remains largely unexplored. Here we address this gap and propose +Glacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep +learning model, and five strategies for multitemporal global-scale glacier +mapping using open satellite imagery. Assessing the spatial, temporal and +cross-sensor generalisation shows that our best strategy achieves intersection +over union >0.85 on previously unobserved images in most cases, which drops to +>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90 +for regions dominated by clean ice. A comparative validation against human +expert uncertainties in terms of area and distance deviations underscores +GlaViTU performance, approaching or matching expert-level delineation. Adding +synthetic aperture radar data, namely, backscatter and interferometric +coherence, increases the accuracy in all regions where available. The +calibrated confidence for glacier extents is reported making the predictions +more reliable and interpretable. We also release a benchmark dataset that +covers 9% of glaciers worldwide. Our results support efforts towards automated +multitemporal and global glacier mapping. + +
+
+ comment: after major revision, expanded validation +
+
+
+
+
+ + ♻ ☆ Smart E-commerce Recommendations with Semantic AI + + +
+ In e-commerce, web mining for page recommendations is widely used but often
+fails to meet user needs. To address this, we propose a novel solution
+combining semantic web mining with BP neural networks. We process user search
+logs to extract five key features: content priority, time spent, user feedback,
+recommendation semantics, and input deviation. These features are then fed into
+a BP neural network to classify and prioritize web pages. The prioritized pages
+are recommended to users. Testing on book sales pages, we demonstrate that this
+solution can quickly and accurately identify the pages users need. Our approach
+ensures that recommendations are more relevant and tailored to individual
+preferences, enhancing the online shopping experience. By leveraging advanced
+semantic analysis and neural network techniques, we bridge the gap between user
+expectations and actual recommendations. This innovative method not only
+improves accuracy but also speeds up the recommendation process, making it a
+valuable tool for e-commerce platforms aiming to boost user satisfaction and
+engagement. Additionally, our system's ability to handle large datasets and
+provide real-time recommendations makes it a scalable and efficient solution
+for modern e-commerce challenges.
+
+
+ comment: 8 pages +
+
+
+
+
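For illustration only, the five log-derived features named above could feed a small backpropagation-trained network that scores candidate pages; the synthetic data, labels and network size below are assumptions, not the paper's setup.

# Five features per candidate page -> small backprop-trained network ->
# relevance scores -> top pages recommended. All data here is synthetic.
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)
X = rng.random((1000, 5))          # content priority, time spent, feedback, ...
y = (X @ np.array([0.4, 0.2, 0.2, 0.15, -0.25]) > 0.3).astype(int)

net = MLPClassifier(hidden_layer_sizes=(16,), max_iter=1000, random_state=0)
net.fit(X, y)

candidate_pages = rng.random((20, 5))
scores = net.predict_proba(candidate_pages)[:, 1]   # per-page relevance
print("recommend pages:", np.argsort(scores)[::-1][:5])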
+ + ♻ ☆ A Hybrid Framework for Spatial Interpolation: Merging Data-driven with + Domain Knowledge + + +
+ Estimating spatially distributed information through the interpolation of +scattered observation datasets often overlooks the critical role of domain +knowledge in understanding spatial dependencies. Additionally, the features of +these data sets are typically limited to the spatial coordinates of the +scattered observation locations. In this paper, we propose a hybrid framework +that integrates data-driven spatial dependency feature extraction with +rule-assisted spatial dependency function mapping to augment domain knowledge. +We demonstrate the superior performance of our framework in two comparative +application scenarios, highlighting its ability to capture more localized +spatial features in the reconstructed distribution fields. Furthermore, we +underscore its potential to enhance nonlinear estimation capabilities through +the application of transformed fuzzy rules and to quantify the inherent +uncertainties associated with the observation data sets. Our framework +introduces an innovative approach to spatial information estimation by +synergistically combining observational data with rule-assisted domain +knowledge. + +
+
+ comment: 21 pages, 13 figures; typos corrected, references updated +
+
+
+
+
+ + ♻ ☆ Open Implementation and Study of BEST-RQ for Speech Processing ICASSP 2024 + + +
+ Self-Supervised Learning (SSL) has proven to be useful in various speech
+tasks. However, these methods are generally very demanding in terms of data,
+memory, and computational resources. BERT-based Speech pre-Training with
+Random-projection Quantizer (BEST-RQ) is an SSL method that has shown great
+performance on Automatic Speech Recognition (ASR) while being simpler than
+other SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,
+details are lacking in the original paper, such as the number of GPU/TPU hours
+used in pre-training, and there is no official easy-to-use open-source
+implementation. Furthermore, BEST-RQ has not been evaluated on downstream tasks
+other than ASR and speech translation. In this work, we describe a
+re-implementation of a random-projection quantizer and perform a preliminary
+study with a comparison to wav2vec 2.0 on four downstream tasks. We discuss the
+details and differences of our implementation. We show that a random projection
+quantizer can achieve similar downstream performance as wav2vec 2.0 while
+decreasing training time by over a factor of two.
+
+
+ comment: Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio, + Speech and Beyond (SASB 2024) +
+
+
+
+
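A sketch of the random-projection quantizer idea referenced above: a frozen random projection plus a frozen random codebook, with nearest-neighbour lookup producing discrete training targets. Dimensions and the l2-normalisation are assumptions, not the re-implementation released with the paper.

# Frozen random projection + frozen random codebook; the index of the nearest
# codebook entry becomes the discrete target for self-supervised training.
import torch

torch.manual_seed(0)
feat_dim, proj_dim, codebook_size = 80, 16, 8192
projection = torch.randn(feat_dim, proj_dim)           # never trained
codebook = torch.nn.functional.normalize(
    torch.randn(codebook_size, proj_dim), dim=-1)      # never trained

def quantize(features):                                # features: (T, feat_dim)
    z = torch.nn.functional.normalize(features @ projection, dim=-1)
    return torch.cdist(z, codebook).argmin(dim=-1)     # targets: (T,)

targets = quantize(torch.randn(200, feat_dim))         # e.g. log-mel frames
print(targets.shape, targets[:10])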
+ + ♻ ☆ Moderate Adaptive Linear Units (MoLU) + + +
+ We propose a new high-performance activation function, Moderate Adaptive
+Linear Units (MoLU), for deep neural networks. The MoLU is a simple, beautiful
+and powerful activation function that can serve as a good main activation
+function among the hundreds of existing activation functions. Because the MoLU
+is built from elementary functions, not only is it a diffeomorphism (i.e.
+analytic over the whole domain), but it also reduces the training time.
+
+
+ comment: 4 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Prompt Compression with Context-Aware Sentence Encoding for Fast and + Improved LLM Inference + + +
+ Large language models (LLMs) have triggered a new stream of research focusing +on compressing the context length to reduce the computational cost while +ensuring the retention of helpful information for LLMs to answer the given +question. Token-based removal methods are one of the most prominent approaches +in this direction, but risk losing the semantics of the context caused by +intermediate token removal, especially under high compression ratios, while +also facing challenges in computational efficiency. In this work, we propose +context-aware prompt compression (CPC), a sentence-level prompt compression +technique where its key innovation is a novel context-aware sentence encoder +that provides a relevance score for each sentence for a given question. To +train this encoder, we generate a new dataset consisting of questions, +positives, and negative pairs where positives are sentences relevant to the +question, while negatives are irrelevant context sentences. We train the +encoder in a contrastive setup to learn context-aware sentence representations. +Our method considerably outperforms prior works on prompt compression on +benchmark datasets and is up to 10.93x faster at inference compared to the best +token-level compression method. We also find better improvement for shorter +length constraints in most benchmarks, showing the effectiveness of our +proposed solution in the compression of relevant information in a shorter +context. Finally, we release the code and the dataset for quick reproducibility +and further development: https://github.com/Workday/cpc. + +
+
+
+
+
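A sketch of the surrounding sentence-level compression logic only: score each context sentence against the question, keep the top-k, and preserve the original order. The toy bag-of-words encode below is a placeholder for the paper's context-aware sentence encoder.

# Sentence-level prompt compression skeleton with a placeholder encoder.
import re
import numpy as np

def encode(text, dim=512):
    """Toy bag-of-words embedding; stands in for a learned sentence encoder."""
    vec = np.zeros(dim)
    for token in re.findall(r"\w+", text.lower()):
        vec[hash(token) % dim] += 1.0
    return vec / (np.linalg.norm(vec) + 1e-9)

def compress(context, question, keep=3):
    sentences = re.split(r"(?<=[.!?])\s+", context)
    q = encode(question)
    scores = [float(encode(s) @ q) for s in sentences]    # relevance per sentence
    keep_idx = sorted(np.argsort(scores)[::-1][:keep])    # top-k, original order
    return " ".join(sentences[i] for i in keep_idx)

context = ("The Eiffel Tower is in Paris. It was completed in 1889. "
           "Paris is known for its cafes. The tower is 330 metres tall.")
print(compress(context, "How tall is the Eiffel Tower?", keep=2))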
+ + ♻ ☆ Simultaneous Training of First- and Second-Order Optimizers in + Population-Based Reinforcement Learning + + +
+ The tuning of hyperparameters in reinforcement learning (RL) is critical, as +these parameters significantly impact an agent's performance and learning +efficiency. Dynamic adjustment of hyperparameters during the training process +can significantly enhance both the performance and stability of learning. +Population-based training (PBT) provides a method to achieve this by +continuously tuning hyperparameters throughout the training. This ongoing +adjustment enables models to adapt to different learning stages, resulting in +faster convergence and overall improved performance. In this paper, we propose +an enhancement to PBT by simultaneously utilizing both first- and second-order +optimizers within a single population. We conducted a series of experiments +using the TD3 algorithm across various MuJoCo environments. Our results, for +the first time, empirically demonstrate the potential of incorporating +second-order optimizers within PBT-based RL. Specifically, the combination of +the K-FAC optimizer with Adam led to up to a 10% improvement in overall +performance compared to PBT using only Adam. Additionally, in environments +where Adam occasionally fails, such as the Swimmer environment, the mixed +population with K-FAC exhibited more reliable learning outcomes, offering a +significant advantage in training stability without a substantial increase in +computational time. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ SparQ Attention: Bandwidth-Efficient LLM Inference + + +
+ The computational difficulties of large language model (LLM) inference remain +a significant obstacle to their widespread deployment. The need for many +applications to support long input sequences and process them in large batches +typically causes token-generation to be bottlenecked by data transfer. For this +reason, we introduce SparQ Attention, a technique for increasing the inference +throughput of LLMs by utilising memory bandwidth more efficiently within the +attention layers, through selective fetching of the cached history. Our +proposed technique can be applied directly to off-the-shelf LLMs during +inference, without requiring any modification to the pre-training setup or +additional fine-tuning. We show that SparQ Attention brings up to 8x savings in +attention data transfers without substantial drops in accuracy, by evaluating +Llama 2 and 3, Mistral, Gemma and Pythia models on a wide range of downstream +tasks. + +
+
+
+
+
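A toy single-head sketch of the selective-fetching idea above: approximate the attention scores using only the largest-magnitude query components, then attend exactly over just the top-k keys. It omits parts of the published method (e.g. compensation for the un-fetched values) and is not the authors' implementation.

# Approximate scores from r query components -> pick top-k keys -> exact
# attention over only those keys and values.
import math
import torch

def sparse_fetch_attention(q, K, V, r=16, k=64):
    # q: (d,), K: (T, d), V: (T, d_v)
    idx = q.abs().topk(r).indices                        # most informative dims
    approx = K[:, idx] @ q[idx]                          # cheap approximate scores
    top_keys = approx.topk(min(k, K.shape[0])).indices   # keys worth fetching
    scores = (K[top_keys] @ q) / math.sqrt(q.shape[0])   # exact, few keys only
    weights = torch.softmax(scores, dim=0)
    return weights @ V[top_keys]

torch.manual_seed(0)
q, K, V = torch.randn(128), torch.randn(4096, 128), torch.randn(4096, 128)
print(sparse_fetch_attention(q, K, V).shape)             # torch.Size([128])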
+ + ♻ ☆ Enhancing Sindhi Word Segmentation using Subword Representation Learning + and Position-aware Self-attention + + +
+ Sindhi word segmentation is a challenging task due to space omission and +insertion issues. The Sindhi language itself adds to this complexity. It's +cursive and consists of characters with inherent joining and non-joining +properties, independent of word boundaries. Existing Sindhi word segmentation +methods rely on designing and combining hand-crafted features. However, these +methods have limitations, such as difficulty handling out-of-vocabulary words, +limited robustness for other languages, and inefficiency with large amounts of +noisy or raw text. Neural network-based models, in contrast, can automatically +capture word boundary information without requiring prior knowledge. In this +paper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses +word segmentation as a sequence labeling task. The SGNWS model incorporates +subword representation learning through a bidirectional long short-term memory +encoder, position-aware self-attention, and a conditional random field. Our +empirical results demonstrate that the SGNWS model achieves state-of-the-art +performance in Sindhi word segmentation on six datasets. + +
+
+ comment: Journal Paper, 14 pages +
+
+
+
+
+ + ♻ ☆ BiKC: Keypose-Conditioned Consistency Policy for Bimanual Robotic + Manipulation + + +
+ Bimanual manipulation tasks typically involve multiple stages which require
+efficient interactions between two arms, posing step-wise and stage-wise
+challenges for imitation learning systems. Specifically, the failure or delay
+of one step propagates through time, hindering the success and efficiency of
+each sub-stage task and thereby the overall task performance. Although recent
+works have made strides in addressing certain challenges, few approaches
+explicitly consider the multi-stage nature of bimanual tasks while
+simultaneously emphasizing the importance of inference speed. In this paper, we
+introduce a novel keypose-conditioned consistency policy tailored for bimanual
+manipulation. It is a hierarchical imitation learning framework that consists
+of a high-level keypose predictor and a low-level trajectory generator. The
+predicted keyposes provide guidance for trajectory generation and also mark the
+completion of one sub-stage task. The trajectory generator is designed as a
+consistency model trained from scratch without distillation, which generates
+action sequences conditioned on current observations and predicted keyposes
+with fast inference speed. Simulated and real-world experimental results
+demonstrate that the proposed approach surpasses baseline methods in terms of
+success rate and operational efficiency. Codes are available at
+https://github.com/ManUtdMoon/BiKC.
+
+
+ comment: Accepted by The 16th International Workshop on the Algorithmic + Foundations of Robotics (WAFR 2024) +
+
+
+
+
+ + ♻ ☆ NetMamba: Efficient Network Traffic Classification via Pre-training + Unidirectional Mamba + + +
+ Network traffic classification is a crucial research area aiming to enhance +service quality, streamline network management, and bolster cybersecurity. To +address the growing complexity of transmission encryption techniques, various +machine learning and deep learning methods have been proposed. However, +existing approaches face two main challenges. Firstly, they struggle with model +inefficiency due to the quadratic complexity of the widely used Transformer +architecture. Secondly, they suffer from inadequate traffic representation +because of discarding important byte information while retaining unwanted +biases. To address these challenges, we propose NetMamba, an efficient +linear-time state space model equipped with a comprehensive traffic +representation scheme. We adopt a specially selected and improved +unidirectional Mamba architecture for the networking field, instead of the +Transformer, to address efficiency issues. In addition, we design a traffic +representation scheme to extract valid information from massive traffic data +while removing biased information. Evaluation experiments on six public +datasets encompassing three main classification tasks showcase NetMamba's +superior classification performance compared to state-of-the-art baselines. It +achieves an accuracy rate of nearly 99% (some over 99%) in all tasks. +Additionally, NetMamba demonstrates excellent efficiency, improving inference +speed by up to 60 times while maintaining comparably low memory usage. +Furthermore, NetMamba exhibits superior few-shot learning abilities, achieving +better classification performance with fewer labeled data. To the best of our +knowledge, NetMamba is the first model to tailor the Mamba architecture for +networking. + +
+
+
+
+
+ + ♻ ☆ From Categories to Classifiers: Name-Only Continual Learning by + Exploring the Web + + +
+ Continual Learning (CL) often relies on the availability of extensive
+annotated datasets, an assumption that is unrealistic given how time-consuming
+and costly annotation is in practice. We explore a novel paradigm termed
+name-only continual learning where time and cost constraints prohibit manual
+annotation. In this scenario, learners adapt to new category shifts using only
+category names without the luxury of annotated training data. Our proposed
+solution leverages the expansive and ever-evolving internet to query and
+download uncurated webly-supervised data for image classification. We
+investigate the reliability of our web data and find them comparable, and in
+some cases superior, to manually annotated datasets. Additionally, we show that
+by harnessing the web, we can create support sets that surpass state-of-the-art
+name-only classification methods that build their support sets using generative
+models or image retrieval from LAION-5B, achieving up to a 25% boost in
+accuracy. When applied across varied continual learning contexts, our method
+consistently exhibits a small performance gap in comparison to models trained
+on manually annotated datasets. We present EvoTrends, a class-incremental
+dataset made from the web to capture real-world trends, created in just
+minutes. Overall, this paper underscores the potential of using uncurated
+webly-supervised data to mitigate the challenges associated with manual data
+labeling in continual learning.
+
+
+
+
+
+ + ♻ ☆ A Systematic Review on Sleep Stage Classification and Sleep Disorder + Detection Using Artificial Intelligence + + +
+ Sleep is vital for people's physical and mental health, and sound sleep can +help them focus on daily activities. Therefore, a sleep study that includes +sleep patterns and sleep disorders is crucial to enhancing our knowledge about +individuals' health status. This study aims to provide a comprehensive, +systematic review of the recent literature to analyze the different approaches +and their outcomes in sleep studies, which includes works on "sleep stages +classification" and "sleep disorder detection" using AI. In this review, 183 +articles were initially selected from different journals, among which 80 +records were enlisted for explicit review, ranging from 2016 to 2023. Brain +waves were the most commonly employed body parameters for sleep staging and +disorder studies (almost 29% of the research used brain activity signals +exclusively, and 77% combined with the other signals). The convolutional neural +network (CNN), the most widely used of the 34 distinct artificial intelligence +models, comprised 27%. The other models included the long short-term memory +(LSTM), support vector machine (SVM), random forest (RF), and recurrent neural +network (RNN), which consisted of 11%, 6%, 6%, and 5% sequentially. For +performance metrics, accuracy was widely used for a maximum of 83.75% of the +cases, the F1 score of 45%, Kappa of 36.25%, Sensitivity of 31.25%, and +Specificity of 30% of cases, along with the other metrics. This article would +help physicians and researchers get the gist of AI's contribution to sleep +studies and the feasibility of their intended work. + +
+
+ comment: 39 pages, 11 Figures, 8 Tables +
+
+
+
+
+ + ♻ ☆ The Fault in our Stars: Quality Assessment of Code Generation Benchmarks SC + + +
+ Large Language Models (LLMs) are gaining popularity among software engineers.
+A crucial aspect of developing effective code generation LLMs is to evaluate
+these models using a robust benchmark. Evaluation benchmarks with quality
+issues can provide a false sense of performance. In this work, we conduct the
+first-of-its-kind study of the quality of prompts within benchmarks used to
+compare the performance of different code generation models. To conduct this
+study, we analyzed 3,566 prompts from 9 code generation benchmarks to identify
+quality issues in them. We also investigated whether fixing the identified
+quality issues in the benchmarks' prompts affects a model's performance. We
+also studied memorization issues of the evaluation dataset, which can put into
+question a benchmark's trustworthiness. We found that code generation
+evaluation benchmarks mainly focused on Python and coding exercises and had
+very limited contextual dependencies to challenge the model. These datasets and
+the developers' prompts suffer from quality issues like spelling and
+grammatical errors, unclear sentences expressing developers' intent, and not
+using proper documentation style. Fixing all these issues in the benchmarks can
+lead to better performance for Python code generation, but no significant
+improvement was observed for Java code generation. We also found evidence that
+GPT-3.5-Turbo and CodeGen-2.5 models may have data contamination issues.
+
+
+ comment: Accepted at the 24th IEEE International Conference on Source Code + Analysis and Manipulation(SCAM 2024) Research Track +
+
+
+
+
+ + ♻ ☆ Sample Complexity of Variance-reduced Distributionally Robust Q-learning + + +
+ Dynamic decision-making under distributional shifts is of fundamental +interest in theory and applications of reinforcement learning: The distribution +of the environment in which the data is collected can differ from that of the +environment in which the model is deployed. This paper presents two novel +model-free algorithms, namely the distributionally robust Q-learning and its +variance-reduced counterpart, that can effectively learn a robust policy +despite distributional shifts. These algorithms are designed to efficiently +approximate the $q$-function of an infinite-horizon $\gamma$-discounted robust +Markov decision process with Kullback-Leibler ambiguity set to an entry-wise +$\epsilon$-degree of precision. Further, the variance-reduced distributionally +robust Q-learning combines the synchronous Q-learning with variance-reduction +techniques to enhance its performance. Consequently, we establish that it +attains a minimax sample complexity upper bound of $\tilde +O(|\mathbf{S}||\mathbf{A}|(1-\gamma)^{-4}\epsilon^{-2})$, where $\mathbf{S}$ +and $\mathbf{A}$ denote the state and action spaces. This is the first +complexity result that is independent of the ambiguity size $\delta$, thereby +providing new complexity theoretic insights. Additionally, a series of +numerical experiments confirm the theoretical findings and the efficiency of +the algorithms in handling distributional shifts. + +
+
+
+
+
+ + ♻ ☆ Predicting and Interpreting Energy Barriers of Metallic Glasses with + Graph Neural Networks ICML 2024 + + +
+ Metallic Glasses (MGs) are widely used materials that are stronger than steel
+while being shapeable as plastic. While understanding the structure-property
+relationship of MGs remains a challenge in materials science, studying their
+energy barriers (EBs) as an intermediary step shows promise. In this work, we
+utilize Graph Neural Networks (GNNs) to model MGs and study EBs. We contribute
+a new dataset for EB prediction and a novel Symmetrized GNN (SymGNN) model that
+is E(3)-invariant in expectation. SymGNN handles invariance by aggregating over
+orthogonal transformations of the graph structure. When applied to EB
+prediction, SymGNN is more accurate than molecular dynamics (MD) local-sampling
+methods and other machine-learning models. Compared to precise MD simulations,
+SymGNN reduces the inference time on new MGs from roughly 41 days to less than
+one second. We apply explanation algorithms to reveal the relationship between
+structures and EBs. The structures that we identify through explanations match
+the medium-range order (MRO) hypothesis and possess unique topological
+properties. Our work enables effective prediction and interpretation of MG EBs,
+bolstering materials science research.
+
+
+ comment: ICML 2024. Code available at https://github.com/haoyuli02/SymGNN +
+
+
+
+
+ + ♻ ☆ Semi-Decentralized Federated Edge Learning for Fast Convergence on + Non-IID Data + + +
+ Federated edge learning (FEEL) has emerged as an effective approach to reduce
+the large communication latency in Cloud-based machine learning solutions,
+while preserving data privacy. Unfortunately, the learning performance of FEEL
+may be compromised due to limited training data in a single edge cluster. In
+this paper, we investigate a novel framework of FEEL, namely semi-decentralized
+federated edge learning (SD-FEEL). By allowing model aggregation across
+different edge clusters, SD-FEEL enjoys the benefit of FEEL in reducing the
+training latency, while improving the learning performance by accessing richer
+training data from multiple edge clusters. A training algorithm for SD-FEEL
+with three main procedures in each round is presented, including local model
+updates, intra-cluster and inter-cluster model aggregations, which is proven to
+converge on non-independent and identically distributed (non-IID) data. We also
+characterize the interplay between the network topology of the edge servers and
+the communication overhead of inter-cluster model aggregation on the training
+performance. Experimental results corroborate our analysis and demonstrate the
+effectiveness of SD-FEEL in achieving faster convergence than traditional
+federated learning architectures. Besides, guidelines on choosing critical
+hyper-parameters of the training algorithm are also provided.
+
+
+
+
+
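A bare-bones sketch of the two aggregation levels described above, with models reduced to plain vectors: clients are averaged within each edge cluster, then edge servers mix their models with neighbours according to a topology/mixing matrix. The matrix and sizes are illustrative.

# (1) intra-cluster: average client models per edge server;
# (2) inter-cluster: mix edge-server models via a doubly stochastic matrix.
import numpy as np

def intra_cluster_aggregate(client_models):
    return np.mean(client_models, axis=0)

def inter_cluster_aggregate(edge_models, mixing_matrix):
    return mixing_matrix @ edge_models       # new_i = sum_j W[i, j] * model_j

rng = np.random.default_rng(0)
clusters = [rng.normal(size=(5, 10)) for _ in range(3)]   # 3 clusters x 5 clients
edge_models = np.stack([intra_cluster_aggregate(c) for c in clusters])

W = np.array([[0.5, 0.25, 0.25],
              [0.25, 0.5, 0.25],
              [0.25, 0.25, 0.5]])                          # ring-like mixing
edge_models = inter_cluster_aggregate(edge_models, W)
print(edge_models.shape)                                   # (3, 10)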
+ + ♻ ☆ EnsLoss: Stochastic Calibrated Loss Ensembles for Preventing Overfitting + in Classification + + +
+ Empirical risk minimization (ERM) with a computationally feasible surrogate
+loss is a widely accepted approach for classification. Notably, the convexity
+and calibration (CC) properties of a loss function ensure consistency of ERM in
+maximizing accuracy, thereby offering a wide range of options for surrogate
+losses. In this article, we propose a novel ensemble method, namely EnsLoss,
+which extends the ensemble learning concept to combine loss functions within
+the ERM framework. A key feature of our method is its consideration of
+preserving the "legitimacy" of the combined losses, i.e., ensuring the CC
+properties. Specifically, we first transform the CC conditions of losses into
+loss-derivatives, thereby bypassing the need for explicit loss functions and
+directly generating calibrated loss-derivatives. Then, inspired by Dropout,
+EnsLoss enables loss ensembles through one training process with doubly
+stochastic gradient descent (i.e., random batch samples and random calibrated
+loss-derivatives). We theoretically establish the statistical consistency of
+our approach and provide insights into its benefits. The numerical
+effectiveness of EnsLoss compared to fixed-loss methods is demonstrated through
+experiments on a broad range of 14 OpenML tabular datasets and 46 image
+datasets with various deep learning architectures. The Python repository and
+source code are available on GitHub at https://github.com/statmlben/ensloss.
+
+
+ comment: 31 pages; 4 figures +
+
+
+
+
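One hedged reading of the doubly stochastic training loop above: each mini-batch draws a calibrated convex surrogate at random and differentiates it on the margins. The fixed pool of three classical losses below is an assumption standing in for the paper's randomly generated calibrated loss-derivatives.

# Loss ensembling during training: random batch + random calibrated surrogate.
import random
import torch

losses = {
    "logistic": lambda z: torch.nn.functional.softplus(-z),
    "squared_hinge": lambda z: torch.clamp(1 - z, min=0) ** 2,
    "exponential": lambda z: torch.exp(-z),
}

torch.manual_seed(0)
X = torch.randn(256, 20)
y = torch.where(torch.randn(256) > 0, 1.0, -1.0)
model = torch.nn.Linear(20, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)

for step in range(200):
    idx = torch.randint(0, X.shape[0], (32,))        # random batch samples
    margin = y[idx] * model(X[idx]).squeeze(-1)      # z = y * f(x)
    loss_fn = losses[random.choice(list(losses))]    # random surrogate per batch
    loss = loss_fn(margin).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()
print("final batch loss:", float(loss))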
+ + ♻ ☆ A Confidence Interval for the $\ell_2$ Expected Calibration Error + + +
+ Recent advances in machine learning have significantly improved prediction
+accuracy in various applications. However, ensuring the calibration of
+probabilistic predictions remains a significant challenge. Despite efforts to
+enhance model calibration, the rigorous statistical evaluation of model
+calibration remains less explored. In this work, we develop confidence
+intervals for the $\ell_2$ Expected Calibration Error (ECE). We consider
+top-1-to-$k$ calibration, which includes both the popular notion of confidence
+calibration as well as full calibration. For a debiased estimator of the ECE,
+we show asymptotic normality, but with different convergence rates and
+asymptotic variances for calibrated and miscalibrated models. We develop
+methods to construct asymptotically valid confidence intervals for the ECE,
+accounting for this behavior as well as non-negativity. Our theoretical
+findings are supported through extensive experiments, showing that our methods
+produce valid confidence intervals with shorter lengths compared to those
+obtained by resampling-based methods.
+
+
+
+
+
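To make the estimand concrete, the snippet below computes a plain plug-in estimate of the squared $\ell_2$ ECE for top-1 confidence calibration with equal-width bins; the paper's debiased estimator and interval construction are not reproduced here, and the binning is a common convention rather than the paper's exact choice.

# Plug-in squared l2 expected calibration error with equal-width bins.
import numpy as np

def squared_l2_ece(confidences, correct, n_bins=15):
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece2 = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = confidences[in_bin].mean() - correct[in_bin].mean()
            ece2 += in_bin.mean() * gap ** 2        # weight by bin probability
    return ece2

rng = np.random.default_rng(0)
conf = rng.uniform(0.5, 1.0, size=10000)
correct = (rng.random(10000) < conf).astype(float)  # perfectly calibrated toy model
print("squared l2 ECE:", squared_l2_ece(conf, correct))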
+ + ♻ ☆ A Novel Approach to Classify Power Quality Signals Using Vision + Transformers + + +
+ With the rapid integration of electronically interfaced renewable energy +resources and loads into smart grids, there is increasing interest in power +quality disturbances (PQD) classification to enhance the security and +efficiency of these grids. This paper introduces a new approach to PQD +classification based on the Vision Transformer (ViT) model. When a PQD occurs, +the proposed approach first converts the power quality signal into an image and +then utilizes a pre-trained ViT to accurately determine the class of the PQD. +Unlike most previous works, which were limited to a few disturbance classes or +small datasets, the proposed method is trained and tested on a large dataset +with 17 disturbance classes. Our experimental results show that the proposed +ViT-based approach achieves PQD classification precision and recall of 98.28% +and 97.98%, respectively, outperforming recently proposed techniques applied to +the same dataset. + +
+
+ comment: IECON 2024-50th Annual Conference of the IEEE Industrial Electronics + Society, Chicago, U.S.A, 2024, pp. 1-6 +
+
+
+
+
+ + ♻ ☆ Graph-Based Bidirectional Transformer Decision Threshold Adjustment + Algorithm for Class-Imbalanced Molecular Data + + +
+ Data sets with imbalanced class sizes, where one class size is much smaller +than that of others, occur exceedingly often in many applications, including +those with biological foundations, such as disease diagnosis and drug +discovery. Therefore, it is extremely important to be able to identify data +elements of classes of various sizes, as a failure to do so can result in heavy +costs. Nonetheless, many data classification procedures do not perform well on +imbalanced data sets as they often fail to detect elements belonging to +underrepresented classes. In this work, we propose the BTDT-MBO algorithm, +incorporating Merriman-Bence-Osher (MBO) approaches and a bidirectional +transformer, as well as distance correlation and decision threshold +adjustments, for data classification tasks on highly imbalanced molecular data +sets, where the sizes of the classes vary greatly. The proposed technique not +only integrates adjustments in the classification threshold for the MBO +algorithm in order to help deal with the class imbalance, but also uses a +bidirectional transformer procedure based on an attention mechanism for +self-supervised learning. In addition, the model implements distance +correlation as a weight function for the similarity graph-based framework on +which the adjusted MBO algorithm operates. The proposed method is validated +using six molecular data sets and compared to other related techniques. The +computational experiments show that the proposed technique is superior to +competing approaches even in the case of a high class imbalance ratio. + +
+
+
+
+
+ + ♻ ☆ CCPL: Cross-modal Contrastive Protein Learning ICPR 2024 + + +
+ Effective protein representation learning is crucial for predicting protein +functions. Traditional methods often pretrain protein language models on large, +unlabeled amino acid sequences, followed by finetuning on labeled data. While +effective, these methods underutilize the potential of protein structures, +which are vital for function determination. Common structural representation +techniques rely heavily on annotated data, limiting their generalizability. +Moreover, structural pretraining methods, similar to natural language +pretraining, can distort actual protein structures. In this work, we introduce +a novel unsupervised protein structure representation pretraining method, +cross-modal contrastive protein learning (CCPL). CCPL leverages a robust +protein language model and uses unsupervised contrastive alignment to enhance +structure learning, incorporating self-supervised structural constraints to +maintain intrinsic structural information. We evaluated our model across +various benchmarks, demonstrating the framework's superiority. + +
+
+ comment: Accepted to ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in + Federated Class Continual Learning ECCV 2024 + + +
+ Federated Class Continual Learning (FCCL) merges the challenges of +distributed client learning with the need for seamless adaptation to new +classes without forgetting old ones. The key challenge in FCCL is catastrophic +forgetting, an issue that has been explored to some extent in Continual +Learning (CL). However, due to privacy preservation requirements, some +conventional methods, such as experience replay, are not directly applicable to +FCCL. Existing FCCL methods mitigate forgetting by generating historical data +through federated training of GANs or data-free knowledge distillation. +However, these approaches often suffer from unstable training of generators or +low-quality generated data, limiting their guidance for the model. To address +this challenge, we propose a novel method of data replay based on diffusion +models. Instead of training a diffusion model, we employ a pre-trained +conditional diffusion model to reverse-engineer each class, searching the +corresponding input conditions for each class within the model's input space, +significantly reducing computational resources and time consumption while +ensuring effective generation. Furthermore, we enhance the classifier's domain +generalization ability on generated and real data through contrastive learning, +indirectly improving the representational capability of generated data for real +data. Comprehensive experiments demonstrate that our method significantly +outperforms existing baselines. Code is available at +https://github.com/jinglin-liang/DDDR. + +
+
+ comment: Accepted by ECCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ Small noise analysis for Tikhonov and RKHS regularizations + + +
+ Regularization plays a pivotal role in ill-posed machine learning and inverse +problems. However, the fundamental comparative analysis of various +regularization norms remains open. We establish a small noise analysis +framework to assess the effects of norms in Tikhonov and RKHS regularizations, +in the context of ill-posed linear inverse problems with Gaussian noise. This +framework studies the convergence rates of regularized estimators in the small +noise limit and reveals the potential instability of the conventional +L2-regularizer. We solve such instability by proposing an innovative class of +adaptive fractional RKHS regularizers, which covers the L2 Tikhonov and RKHS +regularizations by adjusting the fractional smoothness parameter. A surprising +insight is that over-smoothing via these fractional RKHSs consistently yields +optimal convergence rates, but the optimal hyper-parameter may decay too fast +to be selected in practice. + +
+
+
+
+
+ + ♻ ☆ Stacked ensemble\-based mutagenicity prediction model using multiple + modalities with graph attention network + + +
+ Mutagenicity is a concern due to its association with genetic mutations which
+can result in a variety of negative consequences, including the development of
+cancer. Earlier identification of mutagenic compounds in the drug development
+process is therefore crucial for preventing the progression of unsafe
+candidates and reducing development costs. While computational techniques,
+especially machine learning models, have become increasingly prevalent for this
+endpoint, they rely on a single modality. In this work, we introduce a novel
+stacked ensemble-based mutagenicity prediction model which incorporates
+multiple modalities such as the simplified molecular input line entry system
+(SMILES) and the molecular graph. These modalities capture diverse information
+about molecules, such as substructural, physicochemical, geometrical and
+topological properties. To derive substructural, geometrical and
+physicochemical information, we use SMILES, while topological information is
+extracted through a graph attention network (GAT) via the molecular graph. Our
+model uses a stacked ensemble of machine learning classifiers to make
+predictions using these multiple features. We employ the explainable artificial
+intelligence (XAI) technique SHAP (Shapley Additive Explanations) to determine
+the significance of each classifier and the most relevant features in the
+prediction. We demonstrate that our method surpasses SOTA methods on two
+standard datasets across various metrics. Notably, we achieve an area under the
+curve of 95.21% on the Hansen benchmark dataset, affirming the efficacy of our
+method in predicting mutagenicity. We believe that this research will captivate
+the interest of both clinicians and computational biologists engaged in
+translational research.
+
+
+ comment: Submitted to a journal +
+
+
+
+
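The stacking step alone can be illustrated with scikit-learn's StackingClassifier; the random feature blocks below merely stand in for the SMILES-derived descriptors and GAT graph embeddings, which are not computed in this snippet.

# Base classifiers over concatenated feature blocks, combined by a
# logistic-regression meta-learner. All features and labels are synthetic.
import numpy as np
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              StackingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

rng = np.random.default_rng(0)
smiles_descriptors = rng.normal(size=(600, 50))   # placeholder descriptor block
graph_embeddings = rng.normal(size=(600, 32))     # placeholder GAT embedding block
X = np.hstack([smiles_descriptors, graph_embeddings])
y = rng.integers(0, 2, size=600)                  # placeholder labels

stack = StackingClassifier(
    estimators=[("rf", RandomForestClassifier(n_estimators=200, random_state=0)),
                ("gb", GradientBoostingClassifier(random_state=0)),
                ("svm", SVC(probability=True, random_state=0))],
    final_estimator=LogisticRegression(max_iter=1000),
)
print("cv accuracy:", cross_val_score(stack, X, y, cv=3).mean())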
+ + ♻ ☆ OpenVLA: An Open-Source Vision-Language-Action Model + + +
+ Large policies pretrained on a combination of Internet-scale vision-language +data and diverse robot demonstrations have the potential to change how we teach +robots new skills: rather than training new behaviors from scratch, we can +fine-tune such vision-language-action (VLA) models to obtain robust, +generalizable policies for visuomotor control. Yet, widespread adoption of VLAs +for robotics has been challenging as 1) existing VLAs are largely closed and +inaccessible to the public, and 2) prior work fails to explore methods for +efficiently fine-tuning VLAs for new tasks, a key component for adoption. +Addressing these challenges, we introduce OpenVLA, a 7B-parameter open-source +VLA trained on a diverse collection of 970k real-world robot demonstrations. +OpenVLA builds on a Llama 2 language model combined with a visual encoder that +fuses pretrained features from DINOv2 and SigLIP. As a product of the added +data diversity and new model components, OpenVLA demonstrates strong results +for generalist manipulation, outperforming closed models such as RT-2-X (55B) +by 16.5% in absolute task success rate across 29 tasks and multiple robot +embodiments, with 7x fewer parameters. We further show that we can effectively +fine-tune OpenVLA for new settings, with especially strong generalization +results in multi-task environments involving multiple objects and strong +language grounding abilities, and outperform expressive from-scratch imitation +learning methods such as Diffusion Policy by 20.4%. We also explore compute +efficiency; as a separate contribution, we show that OpenVLA can be fine-tuned +on consumer GPUs via modern low-rank adaptation methods and served efficiently +via quantization without a hit to downstream success rate. Finally, we release +model checkpoints, fine-tuning notebooks, and our PyTorch codebase with +built-in support for training VLAs at scale on Open X-Embodiment datasets. + +
+
+ comment: Website: https://openvla.github.io/ +
+
+
+
+
+ + ♻ ☆ SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated + Responses + + +
+ Can LLMs consistently improve their previous outputs for better results? For +this to be true, LLMs would need to be better at discriminating among +previously-generated alternatives, than generating initial responses. We +explore the validity of this hypothesis in practice. We first formulate a +unified framework that allows us to compare the generative and discriminative +capability of any model on any task. In our resulting experimental analysis of +several open-source and industrial LLMs, we observe that models are not +reliably better at discriminating among previously-generated alternatives than +generating initial responses. This finding challenges the notion that LLMs may +be able to enhance their performance only through their own judgment. + +
+
+
+
+
+ + ♻ ☆ Thresholded Lexicographic Ordered Multiobjective Reinforcement Learning ECAI 2024 + + +
+ Lexicographic multi-objective problems, which impose a lexicographic +importance order over the objectives, arise in many real-life scenarios. +Existing Reinforcement Learning work directly addressing lexicographic tasks +has been scarce. The few proposed approaches were all noted to be heuristics +without theoretical guarantees as the Bellman equation is not applicable to +them. Additionally, the practical applicability of these prior approaches also +suffers from various issues such as not being able to reach the goal state. +While some of these issues have been known before, in this work we investigate +further shortcomings, and propose fixes for improving practical performance in +many cases. We also present a policy optimization approach using our +Lexicographic Projection Optimization (LPO) algorithm that has the potential to +address these theoretical and practical concerns. Finally, we demonstrate our +proposed algorithms on benchmark problems. + +
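+ The thresholded lexicographic ordering itself can be illustrated with a small
+action-selection routine: actions are first filtered by near-optimality on the
+highest-priority objective (within a slack threshold), and the next objective
+is then maximized among the survivors. This is a generic sketch of the ordering
+with made-up Q-values, not the paper's LPO algorithm.
+
+    import numpy as np
+
+    def lexicographic_action(q_values, slacks):
+        """Pick an action under a thresholded lexicographic order.
+
+        q_values: one 1-D array per objective, highest priority first.
+        slacks:   per-objective tolerance; an action survives a round if its
+                  value is within `slack` of the best surviving value.
+        """
+        candidates = np.arange(len(q_values[0]))
+        for q, slack in zip(q_values, slacks):
+            best = q[candidates].max()
+            candidates = candidates[q[candidates] >= best - slack]
+            if len(candidates) == 1:
+                break
+        return int(candidates[0])
+
+    # Hypothetical Q-values for 4 actions, 2 objectives (priority: safety, speed).
+    q_safety = np.array([0.90, 0.88, 0.40, 0.89])
+    q_speed = np.array([0.10, 0.70, 0.99, 0.50])
+    print(lexicographic_action([q_safety, q_speed], slacks=[0.05, 0.0]))  # -> 1
+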
+
+ comment: Full version of ECAI 2024 paper +
+
+
+
+
+ + ♻ ☆ LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet + + +
+ Recent large language model (LLM) defenses have greatly improved models' +ability to refuse harmful queries, even when adversarially attacked. However, +LLM defenses are primarily evaluated against automated adversarial attacks in a +single turn of conversation, an insufficient threat model for real-world +malicious use. We demonstrate that multi-turn human jailbreaks uncover +significant vulnerabilities, exceeding 70% attack success rate (ASR) on +HarmBench against defenses that report single-digit ASRs with automated +single-turn attacks. Human jailbreaks also reveal vulnerabilities in machine +unlearning defenses, successfully recovering dual-use biosecurity knowledge +from unlearned models. We compile these results into Multi-Turn Human +Jailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks. +We publicly release MHJ alongside a compendium of jailbreak tactics developed +across dozens of commercial red teaming engagements, supporting research +towards stronger LLM defenses. + +
+
+
+
+
+ + ♻ ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
+
+
+
+
+ + ♻ ☆ From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video + Solution to Enhance Community Safety + + +
+ This article adopts and evaluates an AI-enabled Smart Video Solution (SVS) +designed to enhance safety in the real world. The system integrates with +existing infrastructure camera networks, leveraging recent advancements in AI +for easy adoption. Prioritizing privacy and ethical standards, pose based data +is used for downstream AI tasks such as anomaly detection. Cloud-based +infrastructure and mobile app are deployed, enabling real-time alerts within +communities. The SVS employs innovative data representation and visualization +techniques, such as the Occupancy Indicator, Statistical Anomaly Detection, +Bird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance +public safety. Evaluation of the SVS demonstrates its capacity to convert +complex computer vision outputs into actionable insights for stakeholders, +community partners, law enforcement, urban planners, and social scientists. +This article presents a comprehensive real-world deployment and evaluation of +the SVS, implemented in a community college environment across 16 cameras. The +system integrates AI-driven visual processing, supported by statistical +analysis, database management, cloud communication, and user notifications. +Additionally, the article evaluates the end-to-end latency from the moment an +AI algorithm detects anomalous behavior in real-time at the camera level to the +time stakeholders receive a notification. The results demonstrate the system's +robustness, effectively managing 16 CCTV cameras with a consistent throughput +of 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end +latency of 26.76 seconds between anomaly detection and alert issuance. + +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language
+Models~(MLLMs) is crucial for video understanding, high-resolution image
+understanding, and multi-modal agents. This involves a series of systematic
+optimizations, including model architecture, data construction, and training
+strategy, particularly addressing challenges such as \textit{degraded
+performance with more images} and \textit{high computational costs}. In this
+paper, we adapt the model architecture to a hybrid of Mamba and Transformer
+blocks, approach data construction with both temporal and spatial dependencies
+among multiple images, and employ a progressive training strategy. The released
+model \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge
+\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first
+hybrid MLLM, achieving a better balance between efficiency and effectiveness.
+LongLLaVA not only achieves competitive results across various benchmarks, but
+also maintains high throughput and low memory consumption. Notably, it can
+process nearly a thousand images on a single A100 80GB GPU, showing promising
+application prospects for a wide range of tasks.
+
+
+ comment: 19 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ Multi-Track MusicLDM: Towards Versatile Music Generation with Latent + Diffusion Model + + +
+ Diffusion models have shown promising results in cross-modal generation tasks +involving audio and music, such as text-to-sound and text-to-music generation. +These text-controlled music generation models typically focus on generating +music by capturing global musical attributes like genre and mood. However, +music composition is a complex, multilayered task that often involves musical +arrangement as an integral part of the process. This process involves composing +each instrument to align with existing ones in terms of beat, dynamics, +harmony, and melody, requiring greater precision and control over tracks than +text prompts usually provide. In this work, we address these challenges by +extending the MusicLDM, a latent diffusion model for music, into a multi-track +generative model. By learning the joint probability of tracks sharing a +context, our model is capable of generating music across several tracks that +correspond well to each other, either conditionally or unconditionally. +Additionally, our model is capable of arrangement generation, where the model +can generate any subset of tracks given the others (e.g., generating a piano +track complementing given bass and drum tracks). We compared our model with an +existing multi-track generative model and demonstrated that our model achieves +considerable improvements across objective metrics for both total and +arrangement generation tasks. + +
+
+
+
+
+ + ☆ ExpLLM: Towards Chain of Thought for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is a critical task in multimedia with +significant implications across various domains. However, analyzing the causes +of facial expressions is essential for accurately recognizing them. Current +approaches, such as those based on facial action units (AUs), typically provide +AU names and intensities but lack insight into the interactions and +relationships between AUs and the overall expression. In this paper, we propose +a novel method called ExpLLM, which leverages large language models to generate +an accurate chain of thought (CoT) for facial expression recognition. +Specifically, we have designed the CoT mechanism from three key perspectives: +key observations, overall emotional interpretation, and conclusion. The key +observations describe the AU's name, intensity, and associated emotions. The +overall emotional interpretation provides an analysis based on multiple AUs and +their interactions, identifying the dominant emotions and their relationships. +Finally, the conclusion presents the final expression label derived from the +preceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed +to construct this expression CoT and generate instruction-description data for +training our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets +demonstrate that ExpLLM outperforms current state-of-the-art FER methods. +ExpLLM also surpasses the latest GPT-4o in expression CoT generation, +particularly in recognizing micro-expressions where GPT-4o frequently fails. + +
+
+ comment: project page: https://starhiking.github.io/ExpLLM_Page/ +
+
+
+
+
+ + ☆ PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for + One-Shot Talking Head Generation + + +
+ While previous audio-driven talking head generation (THG) methods generate +head poses from driving audio, the generated poses or lips cannot match the +audio well or are not editable. In this study, we propose \textbf{PoseTalk}, a +THG system that can freely generate lip-synchronized talking head videos with +free head poses conditioned on text prompts and audio. The core insight of our +method is using head pose to connect visual, linguistic, and audio signals. +First, we propose to generate poses from both audio and text prompts, where the +audio offers short-term variations and rhythm correspondence of the head +movements and the text prompts describe the long-term semantics of head +motions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to +generate motion latent from text prompts and audio cues in a pose latent space. +Second, we observe a loss-imbalance problem: the loss for the lip region +contributes less than 4\% of the total reconstruction loss caused by both pose +and lip, making optimization lean towards head movements rather than lip +shapes. To address this issue, we propose a refinement-based learning strategy +to synthesize natural talking videos using two cascaded networks, i.e., +CoarseNet, and RefineNet. The CoarseNet estimates coarse motions to produce +animated images in novel poses and the RefineNet focuses on learning finer lip +motions by progressively estimating lip motions from low-to-high resolutions, +yielding improved lip-synchronization performance. Experiments demonstrate our +pose prediction strategy achieves better pose diversity and realness compared +to text-only or audio-only, and our video generator model outperforms +state-of-the-art methods in synthesizing talking videos with natural head +motions. Project: https://junleen.github.io/projects/posetalk. + +
+
+ comment: 7+5 pages, 15 figures +
+
+
+
+
+ + ☆ Low-Resolution Object Recognition with Cross-Resolution Relational + Contrastive Distillation + + +
+ Recognizing objects in low-resolution images is a challenging task due to the +lack of informative details. Recent studies have shown that knowledge +distillation approaches can effectively transfer knowledge from a +high-resolution teacher model to a low-resolution student model by aligning +cross-resolution representations. However, these approaches still face +limitations in adapting to the situation where the recognized objects exhibit +significant representation discrepancies between training and testing images. +In this study, we propose a cross-resolution relational contrastive +distillation approach to facilitate low-resolution object recognition. Our +approach enables the student model to mimic the behavior of a well-trained +teacher model which delivers high accuracy in identifying high-resolution +objects. To extract sufficient knowledge, the student learning is supervised +with contrastive relational distillation loss, which preserves the similarities +in various relational structures in contrastive representation space. In this +manner, the capability of recovering missing details of familiar low-resolution +objects can be effectively enhanced, leading to a better knowledge transfer. +Extensive experiments on low-resolution object classification and +low-resolution face recognition clearly demonstrate the effectiveness and +adaptability of our approach. + +
+
+ comment: This paper is accepted by IEEE Transactions on Circuits and Systems + for Video Technology (TCSVT) +
+
+
+
+
+ + ☆ FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video + Reconstruction in Resource and Timing Constrained Network Settings + + +
+ Despite the growing adoption of video processing via Internet of Things (IoT)
+devices due to their cost-effectiveness, transmitting captured data to nearby
+servers poses challenges due to varying timing constraints and scarcity of
+network bandwidth. Existing video compression methods face difficulties in
+recovering compressed data when incomplete data is provided. Here, we introduce
+FrameCorr, a deep-learning-based solution that utilizes previously received
+data to predict the missing segments of a frame, enabling the reconstruction of
+a frame from partially received data.
+
+
+
+
+
+ + ☆ Coral Model Generation from Single Images for Virtual Reality + Applications + + +
+ With the rapid development of VR technology, the demand for high-quality 3D
+models is increasing. Traditional methods struggle with efficiency and quality
+in large-scale customization. This paper introduces a deep-learning framework
+that generates high-precision 3D coral models from a single image. Using the
+Coral dataset, the framework extracts geometric and texture features, performs
+3D reconstruction, and optimizes design and material blending. Advanced
+optimization and polygon count control ensure shape accuracy, detail retention,
+and flexible output for various complexities, catering to high-quality
+rendering and real-time interaction needs. The project incorporates Explainable
+AI (XAI) to transform AI-generated models into interactive "artworks," best
+viewed in VR and XR. This enhances model interpretability and human-machine
+collaboration. Real-time feedback in VR interactions displays information like
+coral species and habitat, enriching user experience. The generated models
+surpass traditional methods in detail, visual quality, and efficiency. This
+research offers an intelligent approach to 3D content creation for VR, lowering
+production barriers and promoting widespread VR applications. Additionally,
+integrating XAI provides new insights into AI-generated visual content and
+advances research in 3D vision interpretability.
+
+
+ comment: In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts + 2024) arXiv:2406.14485 +
+
+
+
+
+ + ♻ ☆ Hand1000: Generating Realistic Hands from Text with Only 1,000 Images + + +
+ Text-to-image generation models have achieved remarkable advancements in +recent years, aiming to produce realistic images from textual descriptions. +However, these models often struggle with generating anatomically accurate +representations of human hands. The resulting images frequently exhibit issues +such as incorrect numbers of fingers, unnatural twisting or interlacing of +fingers, or blurred and indistinct hands. These issues stem from the inherent +complexity of hand structures and the difficulty in aligning textual +descriptions with precise visual depictions of hands. To address these +challenges, we propose a novel approach named Hand1000 that enables the +generation of realistic hand images with target gesture using only 1,000 +training samples. The training of Hand1000 is divided into three stages with +the first stage aiming to enhance the model's understanding of hand anatomy by +using a pre-trained hand gesture recognition model to extract gesture +representation. The second stage further optimizes text embedding by +incorporating the extracted hand gesture representation, to improve alignment +between the textual descriptions and the generated hand images. The third stage +utilizes the optimized embedding to fine-tune the Stable Diffusion model to +generate realistic hand images. In addition, we construct the first publicly +available dataset specifically designed for text-to-hand image generation. +Based on the existing hand gesture recognition dataset, we adopt advanced image +captioning models and LLaMA3 to generate high-quality textual descriptions +enriched with detailed gesture information. Extensive experiments demonstrate +that Hand1000 significantly outperforms existing models in producing +anatomically correct hand images while faithfully representing other details in +the text, such as faces, clothing, and colors. + +
+
+ comment: Project page https://haozhuo-zhang.github.io/Hand1000-project-page/ +
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 72 + +
+
+
+ + ☆ Arctic-SnowCoder: Demystifying High-Quality Data in Code Pretraining + + +
+ Recent studies have been increasingly demonstrating that high-quality data is +crucial for effective pretraining of language models. However, the precise +definition of "high-quality" remains underexplored. Focusing on the code +domain, we introduce Arctic-SnowCoder-1.3B, a data-efficient base code model +pretrained on 555B tokens through three phases of progressively refined data: +(1) general pretraining with 500B standard-quality code tokens, preprocessed +through basic filtering, deduplication, and decontamination, (2) continued +pretraining with 50B high-quality tokens, selected from phase one by a +BERT-style quality annotator trained to distinguish good code from random data, +using positive examples drawn from high-quality code files, along with +instruction data from Magicoder and StarCoder2-Instruct, and (3) enhanced +pretraining with 5B synthetic data created by Llama-3.1-70B using phase two +data as seeds, adapting the Magicoder approach for pretraining. Despite being +trained on a limited dataset, Arctic-SnowCoder achieves state-of-the-art +performance on BigCodeBench, a coding benchmark focusing on practical and +challenging programming tasks, compared to similarly sized models trained on no +more than 1T tokens, outperforming Phi-1.5-1.3B by 36%. Across all evaluated +benchmarks, Arctic-SnowCoder-1.3B beats StarCoderBase-3B pretrained on 1T +tokens. Additionally, it matches the performance of leading small base code +models trained on trillions of tokens. For example, Arctic-SnowCoder-1.3B +surpasses StarCoder2-3B, pretrained on over 3.3T tokens, on HumanEval+, a +benchmark that evaluates function-level code generation, and remains +competitive on BigCodeBench. Our evaluation presents a comprehensive analysis +justifying various design choices for Arctic-SnowCoder. Most importantly, we +find that the key to high-quality data is its alignment with the distribution +of downstream applications. + +
+
+
+
+
+ + ☆ Optimal L-Systems for Stochastic L-system Inference Problems + + +
+ This paper presents two novel theorems that address two open problems in
+stochastic Lindenmayer-system (L-system) inference, specifically focusing on
+the construction of an optimal stochastic L-system capable of generating a
+given sequence of strings. The first theorem delineates a method for crafting a
+stochastic L-system that maximizes the likelihood of producing a given sequence
+of words through a single derivation. Furthermore, the second theorem
+determines the stochastic L-systems with the highest probability of producing a
+given sequence of words with multiple possible derivations. From these, we
+introduce an algorithm to infer an optimal stochastic L-system from a given
+sequence. This algorithm incorporates sophisticated optimization techniques,
+such as interior point methods, ensuring the production of an optimal
+stochastic L-system suitable for generating the given sequence. This allows
+stochastic L-systems to be used as models for machine learning trained on
+positive data alone.
+
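+ For context, a stochastic L-system rewrites every symbol of a word in
+parallel, sampling each successor according to production probabilities. The
+derivation process being modeled can be sketched as follows; the toy grammar is
+made up, and the paper's inference and optimization procedures are not shown.
+
+    import random
+
+    # A toy stochastic L-system: each symbol maps to (successor, probability) pairs.
+    RULES = {
+        "A": [("AB", 0.7), ("A", 0.3)],
+        "B": [("A", 0.5), ("BB", 0.5)],
+    }
+
+    def rewrite(symbol, rng):
+        """Sample one successor for `symbol` according to its rule probabilities."""
+        successors, probs = zip(*RULES.get(symbol, [(symbol, 1.0)]))
+        return rng.choices(successors, weights=probs, k=1)[0]
+
+    def derive(axiom, steps, seed=0):
+        """Apply `steps` rounds of parallel stochastic rewriting to `axiom`."""
+        rng = random.Random(seed)
+        word = axiom
+        for _ in range(steps):
+            word = "".join(rewrite(s, rng) for s in word)
+        return word
+
+    print(derive("A", steps=5))
+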
+
+
+
+
+ + ☆ MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in + LLMs + + +
+ Existing benchmarks for large language models (LLMs) increasingly struggle to +differentiate between top-performing models, underscoring the need for more +challenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced +benchmark building upon MMLU-Pro to assess shortcut learning and higher-order +reasoning in LLMs. By incorporating questions with multiple correct answers +across diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex +reasoning and resist simplistic problem-solving strategies. Our results show +that MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous +test of model discrimination, particularly in multi-correct answer scenarios. +We introduce novel metrics like shortcut selection ratio and correct pair +identification ratio, offering deeper insights into model behavior and +anchoring bias. Evaluations of five state-of-the-art LLMs reveal significant +performance gaps, highlighting variations in reasoning abilities and bias +susceptibility. We release the dataset and evaluation codes at +\url{https://github.com/asgsaeid/mmlu-pro-plus}. + +
+
+
+
+
+ + ☆ Therapy as an NLP Task: Psychologists' Comparison of LLMs and Human + Peers in CBT + + +
+ Wider access to therapeutic care is one of the biggest challenges in mental +health treatment. Due to institutional barriers, some people seeking mental +health support have turned to large language models (LLMs) for personalized +therapy, even though these models are largely unsanctioned and untested. We +investigate the potential and limitations of using LLMs as providers of +evidence-based therapy by using mixed methods clinical metrics. Using HELPERT, +a prompt run on a large language model using the same process and training as a +comparative group of peer counselors, we replicated publicly accessible mental +health conversations rooted in Cognitive Behavioral Therapy (CBT) to compare +session dynamics and counselor's CBT-based behaviors between original peer +support sessions and their reconstructed HELPERT sessions. Two licensed, +CBT-trained clinical psychologists evaluated the sessions using the Cognitive +Therapy Rating Scale and provided qualitative feedback. Our findings show that +the peer sessions are characterized by empathy, small talk, therapeutic +alliance, and shared experiences but often exhibit therapist drift. Conversely, +HELPERT reconstructed sessions exhibit minimal therapist drift and higher +adherence to CBT methods but display a lack of collaboration, empathy, and +cultural understanding. Through CTRS ratings and psychologists' feedback, we +highlight the importance of human-AI collaboration for scalable mental health. +Our work outlines the ethical implication of imparting human-like subjective +qualities to LLMs in therapeutic settings, particularly the risk of deceptive +empathy, which may lead to unrealistic patient expectations and potential harm. + +
+
+
+
+
+ + ☆ Temporal Order Preserved Optimal Transport-based Cross-modal Knowledge + Transfer Learning for ASR + + +
+ Transferring linguistic knowledge from a pretrained language model (PLM) to +an acoustic model has been shown to greatly improve the performance of +automatic speech recognition (ASR). However, due to the heterogeneous feature +distributions in cross-modalities, designing an effective model for feature +alignment and knowledge transfer between linguistic and acoustic sequences +remains a challenging task. Optimal transport (OT), which efficiently measures +probability distribution discrepancies, holds great potential for aligning and +transferring knowledge between acoustic and linguistic modalities. Nonetheless, +the original OT treats acoustic and linguistic feature sequences as two +unordered sets in alignment and neglects temporal order information during OT +coupling estimation. Consequently, a time-consuming pretraining stage is +required to learn a good alignment between the acoustic and linguistic +representations. In this paper, we propose a Temporal Order Preserved OT +(TOT)-based Cross-modal Alignment and Knowledge Transfer (CAKT) (TOT-CAKT) for +ASR. In the TOT-CAKT, local neighboring frames of acoustic sequences are +smoothly mapped to neighboring regions of linguistic sequences, preserving +their temporal order relationship in feature alignment and matching. With the +TOT-CAKT model framework, we conduct Mandarin ASR experiments with a pretrained +Chinese PLM for linguistic knowledge transfer. Our results demonstrate that the +proposed TOT-CAKT significantly improves ASR performance compared to several +state-of-the-art models employing linguistic knowledge transfer, and addresses +the weaknesses of the original OT-based method in sequential feature alignment +for ASR. + +
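+ To see why plain OT discards temporal order, consider the generic sketch
+below: a Sinkhorn coupling between two feature sequences is computed from a
+cost that mixes feature distance with a penalty on pairing temporally distant
+positions. The construction and all numbers are illustrative assumptions, not
+the TOT-CAKT formulation.
+
+    import numpy as np
+
+    def sinkhorn(cost, reg=0.5, n_iters=200):
+        """Entropy-regularized OT coupling with uniform marginals (illustrative)."""
+        n, m = cost.shape
+        a, b = np.full(n, 1.0 / n), np.full(m, 1.0 / m)
+        K = np.exp(-cost / reg)
+        u, v = np.ones(n), np.ones(m)
+        for _ in range(n_iters):
+            u = a / (K @ v)
+            v = b / (K.T @ u)
+        return np.diag(u) @ K @ np.diag(v)
+
+    rng = np.random.default_rng(0)
+    acoustic = rng.normal(size=(8, 16))    # stand-in acoustic frames
+    linguistic = rng.normal(size=(5, 16))  # stand-in linguistic tokens
+
+    # Feature cost plus a temporal-order prior: pairing frame i with token j is
+    # penalized when their relative positions i/n and j/m are far apart.
+    feat_cost = ((acoustic[:, None, :] - linguistic[None, :, :]) ** 2).sum(-1)
+    pos_a = np.linspace(0, 1, len(acoustic))[:, None]
+    pos_l = np.linspace(0, 1, len(linguistic))[None, :]
+    coupling = sinkhorn(feat_cost / feat_cost.max() + (pos_a - pos_l) ** 2)
+    print(coupling.round(3))
+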
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ Unforgettable Generalization in Language Models + + +
+ When language models (LMs) are trained to forget (or "unlearn") a skill, how
+precisely does their behavior change? We study the behavior of transformer LMs
+in which tasks have been forgotten via fine-tuning on randomized labels. Such
+LMs learn to generate near-random predictions for individual examples in the
+"training" set used for forgetting. Across tasks, however, LMs exhibit extreme
+variability in whether LM predictions change on examples outside the training
+set. In some tasks (like entailment classification), forgetting generalizes
+robustly, and causes models to produce uninformative predictions on new task
+instances; in other tasks (like physical commonsense reasoning and scientific
+question answering) forgetting affects only the training examples, and models
+continue to perform the "forgotten" task accurately even for examples very
+similar to those that appeared in the training set. Dataset difficulty is not
+predictive of whether a behavior can be forgotten; instead, generalization in
+forgetting is (weakly) predicted by the confidence of LMs' initial task
+predictions and the variability of LM representations of training data, with
+low confidence and low variability both associated with greater generalization.
+Perhaps most surprisingly, random-label forgetting appears to be somewhat
+insensitive to the contents of the training set: for example, models trained on
+science questions with random labels continue to answer other science questions
+accurately, but begin to produce random labels on entailment classification
+tasks. Finally, we show that even generalizable forgetting is shallow: linear
+probes trained on LMs' representations can still perform tasks reliably after
+forgetting. Our results highlight the difficulty and unpredictability of
+performing targeted skill removal from models via fine-tuning.
+
+
+ comment: 18 pages, 9 figures, published in First Conference on Language + Modeling 2024 +
+
+
+
+
+ + ☆ Visually Grounded Speech Models for Low-resource Languages and Cognitive + Modelling + + +
+ This dissertation examines visually grounded speech (VGS) models that learn +from unlabelled speech paired with images. It focuses on applications for +low-resource languages and understanding human language acquisition. We +introduce a task called visually prompted keyword localisation to detect and +localise keywords in speech using images. We demonstrate the effectiveness of +VGS models in few-shot learning scenarios for low-resource languages like +Yoruba. Additionally, we examine the mutual exclusivity bias in VGS models. Our +monolingual VGS model exhibits this bias, but we found that multilingualism +does not affect the bias in this VGS model similarly to what is observed in +children. + +
+
+ comment: PhD Dissertation +
+
+
+
+
+ + ☆ CRAFT Your Dataset: Task-Specific Synthetic Dataset Generation Through + Corpus Retrieval and Augmentation + + +
+ Building high-quality datasets for specialized tasks is a time-consuming and +resource-intensive process that often requires specialized domain knowledge. We +propose Corpus Retrieval and Augmentation for Fine-Tuning (CRAFT), a method for +generating synthetic datasets, given a small number of user-written few-shots +that demonstrate the task to be performed. Given the few-shot examples, we use +large-scale public web-crawled corpora and similarity-based document retrieval +to find other relevant human-written documents. Lastly, instruction-tuned large +language models (LLMs) augment the retrieved documents into custom-formatted +task samples, which then can be used for fine-tuning. We demonstrate that CRAFT +can efficiently generate large-scale task-specific training datasets for four +diverse tasks: biology question-answering (QA), medicine QA and commonsense QA +as well as summarization. Our experiments show that CRAFT-based models +outperform or achieve comparable performance to general LLMs for QA tasks, +while CRAFT-based summarization models outperform models trained on +human-curated data by 46 preference points. + +
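+ The retrieval step can be approximated with any lexical or embedding
+similarity search. The toy sketch below ranks a stand-in corpus by TF-IDF
+cosine similarity to a handful of few-shot examples; the documents are
+placeholders, and the actual system retrieves from large web-crawled corpora
+before LLM-based augmentation.
+
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.metrics.pairwise import cosine_similarity
+
+    # Placeholder few-shot task examples and a tiny stand-in "web corpus".
+    few_shots = [
+        "Q: What organelle produces ATP? A: The mitochondrion.",
+        "Q: Which molecule carries genetic information? A: DNA.",
+    ]
+    corpus = [
+        "Mitochondria are the powerhouses of the cell and synthesize ATP.",
+        "The stock market closed higher on Tuesday.",
+        "DNA replication occurs during the S phase of the cell cycle.",
+        "A recipe for sourdough bread with a long fermentation.",
+    ]
+
+    vectorizer = TfidfVectorizer().fit(corpus + few_shots)
+    # Best similarity of each corpus document to any of the few-shot examples.
+    sims = cosine_similarity(
+        vectorizer.transform(few_shots), vectorizer.transform(corpus)
+    ).max(axis=0)
+
+    for idx in sims.argsort()[::-1][:2]:
+        print(f"{sims[idx]:.2f}  {corpus[idx]}")
+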
+
+
+
+
+ + ☆ Political DEBATE: Efficient Zero-shot and Few-shot Classifiers for + Political Text + + +
+ Social scientists quickly adopted large language models due to their ability
+to annotate documents without supervised training, an ability known as
+zero-shot learning. However, due to their compute demands, cost, and often
+proprietary nature, these models are often at odds with replication and open
+science standards. This paper introduces the Political DEBATE (DeBERTa
+Algorithm for Textual Entailment) language models for zero-shot and few-shot
+classification of political documents. These models are not only as good as,
+or better than, state-of-the-art large language models at zero- and few-shot
+classification, but are also orders of magnitude more efficient and completely
+open source. When trained on a simple random sample of 10-25 documents, they
+can outperform supervised classifiers trained on hundreds or thousands of
+documents and state-of-the-art generative models with complex, engineered
+prompts. Additionally, we release the PolNLI dataset used to train these models
+-- a corpus of over 200,000 political documents with highly accurate labels
+across over 800 classification tasks.
+
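+ The underlying mechanism, zero-shot classification via textual entailment, can
+be sketched with the Hugging Face pipeline below. The checkpoint shown is a
+generic BART-MNLI stand-in rather than the DeBERTa-based Political DEBATE
+weights, whose exact identifiers and interface may differ.
+
+    from transformers import pipeline
+
+    # Zero-shot classification via NLI: each candidate label becomes a
+    # hypothesis and is scored by an entailment model.
+    classifier = pipeline(
+        "zero-shot-classification",
+        model="facebook/bart-large-mnli",  # stand-in NLI checkpoint
+    )
+
+    result = classifier(
+        "The senator introduced a bill to expand rural broadband access.",
+        candidate_labels=["infrastructure policy", "immigration", "foreign policy"],
+        hypothesis_template="This document is about {}.",
+    )
+    print(result["labels"][0], round(result["scores"][0], 3))
+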
+
+ comment: 26 pages, 5 figures +
+
+
+
+
+ + ☆ Spinning the Golden Thread: Benchmarking Long-Form Generation in + Language Models + + +
+ The abilities of long-context language models (LMs) are often evaluated using +the "Needle-in-a-Haystack" (NIAH) test, which comprises tasks designed to +assess a model's ability to identify specific information ("needle") within +large text sequences ("haystack"). While these benchmarks measure how well +models understand long-context input sequences, they do not effectively gauge +the quality of long-form text generation--a critical aspect for applications +such as design proposals and creative writing. To address this gap, we have +introduced a new long-form text evaluation benchmark, Spinning the Golden +Thread (SGT), which tests models' ability to identify specific events within +generated long text sequences. In this benchmark, we prompt long-context LMs to +create long-form text that must include particular events or constraints and +evaluate their ability to incorporate these elements. We evaluated ten +long-context LMs across four distinct scenarios, three types of prompt +instructions, and two different generation-length settings (16K and 32K). +Although these models perform well on NIAH benchmarks, none demonstrated +satisfactory performance on the Spinning the Golden Thread, raising concerns +about their ability to generate coherent long-form text that follows +instructions. Additionally, as the length of the generated text increases, all +models exhibit a significant drop in performance. + +
+
+
+
+
+ + ☆ OLMoE: Open Mixture-of-Experts Language Models + + +
+ We introduce OLMoE, a fully open, state-of-the-art language model leveraging +sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but +uses only 1B per input token. We pretrain it on 5 trillion tokens and further +adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available +models with similar active parameters, even surpassing larger ones like +Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE +training, analyze routing in our model showing high specialization, and +open-source all aspects of our work: model weights, training data, code, and +logs. + +
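+ The "1B active out of 7B total" arithmetic comes from sparse routing: each
+token is processed by only a few experts chosen by a learned router. A minimal
+top-k routing forward pass, written in NumPy as a generic illustration rather
+than the OLMoE implementation, looks like this.
+
+    import numpy as np
+
+    def softmax(x, axis=-1):
+        e = np.exp(x - x.max(axis=axis, keepdims=True))
+        return e / e.sum(axis=axis, keepdims=True)
+
+    def moe_forward(tokens, router_w, experts, top_k=2):
+        """Route each token to its top-k experts and mix their outputs."""
+        probs = softmax(tokens @ router_w)          # (n_tokens, n_experts)
+        out = np.zeros_like(tokens)
+        for i, (p, x) in enumerate(zip(probs, tokens)):
+            chosen = np.argsort(p)[-top_k:]         # indices of the top-k experts
+            gate = p[chosen] / p[chosen].sum()      # renormalized gate weights
+            out[i] = sum(g * experts[e](x) for g, e in zip(gate, chosen))
+        return out
+
+    rng = np.random.default_rng(0)
+    d, n_experts = 16, 8
+    experts = [  # each expert is a small linear map standing in for an FFN
+        (lambda W: (lambda x: x @ W))(rng.normal(scale=0.1, size=(d, d)))
+        for _ in range(n_experts)
+    ]
+    router_w = rng.normal(scale=0.1, size=(d, n_experts))
+    tokens = rng.normal(size=(4, d))
+    print(moe_forward(tokens, router_w, experts).shape)  # (4, 16)
+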
+
+ comment: 61 pages (24 main), 36 figures, 14 tables +
+
+
+
+
+ + ☆ Enhancing Code-Switching Speech Recognition with LID-Based Collaborative + Mixture of Experts Model + + +
+ Due to the inherent difficulty in modeling phonetic similarities across
+different languages, code-switching speech recognition presents a formidable
+challenge. This study proposes Collaborative-MoE, a Mixture of Experts (MoE)
+model that leverages a collaborative mechanism among expert groups. Initially,
+a preceding routing network explicitly learns Language Identification (LID)
+tasks and selects experts based on the acquired LID weights. This process
+provides robust routing information to the MoE layer, mitigating interference
+from diverse language domains on expert network parameter updates. The LID
+weights are also employed to facilitate inter-group collaboration, enabling the
+integration of language-specific representations. Furthermore, within each
+language expert group, a gating network operates unsupervised to foster
+collaboration on attributes beyond language. Extensive experiments demonstrate
+the efficacy of our approach, achieving significant performance enhancements
+compared to alternative methods. Importantly, our method preserves the
+efficient inference capabilities characteristic of MoE models without
+necessitating additional pre-training.
+
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ BEAVER: An Enterprise Benchmark for Text-to-SQL + + +
+ Existing text-to-SQL benchmarks have largely been constructed using publicly
+available tables from the web with human-generated tests containing question
+and SQL statement pairs. They typically show very good results and lead people
+to think that LLMs are effective at text-to-SQL tasks. In this paper, we apply
+off-the-shelf LLMs to a benchmark containing enterprise data warehouse data. In
+this environment, LLMs perform poorly, even when standard prompt engineering
+and RAG techniques are utilized. As we will show, the poor performance is
+largely due to three characteristics: (1) public LLMs cannot train on
+enterprise data warehouses because they are largely in the "dark web", (2)
+schemas of enterprise tables are more complex than the schemas in public data,
+which makes the SQL-generation task inherently harder, and (3)
+business-oriented questions are often more complex, requiring joins over
+multiple tables and aggregations. As a result, we propose a new dataset,
+BEAVER, sourced from real enterprise data warehouses together with natural
+language queries and their correct SQL statements, which we collected from
+actual user history. We evaluated this dataset using recent LLMs and
+demonstrated their poor performance on this task. We hope this dataset will
+help future researchers build more sophisticated text-to-SQL systems that can
+do better on this important class of data.
+
+
+
+
+
+ + ☆ Foundations of Large Language Model Compression -- Part 1: Weight + Quantization + + +
+ In recent years, compression of large language models (LLMs) has emerged as +an important problem to allow language model deployment on resource-constrained +devices, reduce computational costs, and mitigate the environmental footprint +of large-scale AI infrastructure. In this paper, we present the foundations of +LLM quantization from a convex optimization perspective and propose a +quantization method that builds on these foundations and outperforms previous +methods. Our quantization framework, CVXQ, scales to models containing hundreds +of billions of weight parameters and provides users with the flexibility to +compress models to any specified model size, post-training. A reference +implementation of CVXQ can be obtained from https://github.com/seannz/cvxq. + +
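+ CVXQ itself is posed as a convex program, but the object it ultimately
+produces, a per-matrix bit-width and step-size assignment, can be illustrated
+with plain uniform round-to-nearest quantization. The sketch below is generic
+post-training quantization under stated assumptions, not the CVXQ algorithm.
+
+    import numpy as np
+
+    def quantize_uniform(weights, n_bits=4):
+        """Symmetric uniform round-to-nearest quantization of a weight matrix."""
+        q_max = 2 ** (n_bits - 1) - 1
+        scale = np.abs(weights).max() / q_max
+        q = np.clip(np.round(weights / scale), -q_max - 1, q_max).astype(np.int8)
+        return q, scale
+
+    def dequantize(q, scale):
+        return q.astype(np.float32) * scale
+
+    rng = np.random.default_rng(0)
+    W = rng.normal(scale=0.02, size=(256, 256)).astype(np.float32)
+
+    q, scale = quantize_uniform(W, n_bits=4)
+    W_hat = dequantize(q, scale)
+    print(f"4-bit reconstruction MSE: {((W - W_hat) ** 2).mean():.2e}")
+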
+
+ comment: Preprint +
+
+
+
+
+ + ☆ FuzzCoder: Byte-level Fuzzing Test via Large Language Model + + +
+ Fuzzing is an important dynamic program analysis technique designed for
+finding vulnerabilities in complex software. Fuzzing involves presenting a
+target program with crafted malicious input to cause crashes, buffer overflows,
+memory errors, and exceptions. Crafting malicious inputs in an efficient manner
+is a difficult open problem, and the best approaches often apply uniform random
+mutations to pre-existing valid inputs. In this work, we propose to adopt
+fine-tuned large language models (FuzzCoder) to learn patterns in the input
+files from successful attacks to guide future fuzzing explorations.
+Specifically, we develop a framework to leverage code LLMs to guide the
+mutation process of inputs in fuzzing. The mutation process is formulated as
+sequence-to-sequence modeling, where the LLM receives a sequence of bytes and
+then outputs the mutated byte sequence. FuzzCoder is fine-tuned on the created
+instruction dataset (Fuzz-Instruct), where the successful fuzzing history is
+collected from the heuristic fuzzing tool. FuzzCoder can predict mutation
+locations and strategies in input files to trigger abnormal behaviors of the
+program. Experimental results show that FuzzCoder based on AFL (American Fuzzy
+Lop) gains significant improvements in terms of effective proportion of
+mutation (EPM) and number of crashes (NC) for various input formats including
+ELF, JPG, MP3, and XML.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Towards Leveraging Large Language Models for Automated Medical Q&A + Evaluation + + +
+ This paper explores the potential of using Large Language Models (LLMs) to +automate the evaluation of responses in medical Question and Answer (Q\&A) +systems, a crucial form of Natural Language Processing. Traditionally, human +evaluation has been indispensable for assessing the quality of these responses. +However, manual evaluation by medical professionals is time-consuming and +costly. Our study examines whether LLMs can reliably replicate human +evaluations by using questions derived from patient data, thereby saving +valuable time for medical experts. While the findings suggest promising +results, further research is needed to address more specific or complex +questions that were beyond the scope of this initial investigation. + +
+
+ comment: 10 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ 3D-LEX v1.0: 3D Lexicons for American Sign Language and Sign Language of + the Netherlands + + +
+ In this work, we present an efficient approach for capturing sign language in +3D, introduce the 3D-LEX v1.0 dataset, and detail a method for semi-automatic +annotation of phonetic properties. Our procedure integrates three motion +capture techniques encompassing high-resolution 3D poses, 3D handshapes, and +depth-aware facial features, and attains an average sampling rate of one sign +every 10 seconds. This includes the time for presenting a sign example, +performing and recording the sign, and archiving the capture. The 3D-LEX +dataset includes 1,000 signs from American Sign Language and an additional +1,000 signs from the Sign Language of the Netherlands. We showcase the dataset +utility by presenting a simple method for generating handshape annotations +directly from 3D-LEX. We produce handshape labels for 1,000 signs from American +Sign Language and evaluate the labels in a sign recognition task. The labels +enhance gloss recognition accuracy by 5% over using no handshape annotations, +and by 1% over expert annotations. Our motion capture data supports in-depth +analysis of sign features and facilitates the generation of 2D projections from +any viewpoint. The 3D-LEX collection has been aligned with existing sign +language benchmarks and linguistic resources, to support studies in 3D-aware +sign language processing. + +
+
+
+
+
+ + ☆ What are the Essential Factors in Crafting Effective Long Context + Multi-Hop Instruction Datasets? Insights and Best Practices + + +
+ Recent advancements in large language models (LLMs) with extended context +windows have significantly improved tasks such as information extraction, +question answering, and complex planning scenarios. In order to achieve success +in long context tasks, a large amount of work has been done to enhance the long +context capabilities of the model through synthetic data. Existing methods +typically utilize the Self-Instruct framework to generate instruction tuning +data for better long context capability improvement. However, our preliminary +experiments indicate that less than 35% of generated samples are multi-hop, and +more than 40% exhibit poor quality, limiting comprehensive understanding and +further research. To improve the quality of synthetic data, we propose the +Multi-agent Interactive Multi-hop Generation (MIMG) framework, incorporating a +Quality Verification Agent, a Single-hop Question Generation Agent, a Multiple +Question Sampling Strategy, and a Multi-hop Question Merger Agent. This +framework improves the data quality, with the proportion of high-quality, +multi-hop, and diverse data exceeding 85%. Furthermore, we systematically +investigate strategies for document selection, question merging, and validation +techniques through extensive experiments across various models. Our findings +show that our synthetic high-quality long-context instruction data +significantly enhances model performance, even surpassing models trained on +larger amounts of human-annotated data. Our code is available at: +https://github.com/WowCZ/LongMIT. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Investigating Expert-in-the-Loop LLM Discourse Patterns for Ancient + Intertextual Analysis + + +
+ This study explores the potential of large language models (LLMs) for +identifying and examining intertextual relationships within biblical, Koine +Greek texts. By evaluating the performance of LLMs on various intertextuality +scenarios the study demonstrates that these models can detect direct +quotations, allusions, and echoes between texts. The LLM's ability to generate +novel intertextual observations and connections highlights its potential to +uncover new insights. However, the model also struggles with long query +passages and the inclusion of false intertextual dependences, emphasizing the +importance of expert evaluation. The expert-in-the-loop methodology presented +offers a scalable approach for intertextual research into the complex web of +intertextuality within and beyond the biblical corpus. + +
+
+
+
+
+ + ☆ The Role of Large Language Models in Musicology: Are We Ready to Trust + the Machines? + + +
+ In this work, we explore the use and reliability of Large Language Models
+(LLMs) in musicology. From a discussion with experts and students, we assess
+the current acceptance of, and concerns regarding, this now-ubiquitous
+technology. We aim to go one step further, proposing a semi-automatic method to
+create an initial benchmark using retrieval-augmented generation models and
+multiple-choice question generation, validated by human experts. Our evaluation
+on 400 human-validated questions shows that current vanilla LLMs are less
+reliable than retrieval-augmented generation from music dictionaries. This
+paper suggests that realizing the potential of LLMs in musicology requires
+musicology-driven research that can specialize LLMs by incorporating accurate
+and reliable domain knowledge.
+
+
+
+
+
+ + ☆ AgentRE: An Agent-Based Framework for Navigating Complex Information + Landscapes in Relation Extraction CIKM 2024 + + +
+ Relation extraction (RE) in complex scenarios faces challenges such as diverse
+relation types and ambiguous relations between entities within a single
+sentence, leading to the poor performance of pure "text-in, text-out" language
+models (LMs). To address these challenges, in this paper, we propose an
+agent-based RE framework, namely AgentRE, which fully leverages the potential
+of large language models (LLMs), including memory, retrieval, and reflection,
+to achieve RE in complex scenarios. Specifically, three major modules are built
+in AgentRE serving as the tools that help the agent acquire and process various
+useful information, thereby obtaining improved RE performance. Our extensive
+experimental results on two datasets in English and Chinese demonstrate
+AgentRE's superior performance, especially in low-resource scenarios.
+Additionally, the trajectories generated by AgentRE can be refined to construct
+a high-quality training dataset incorporating different reasoning methods,
+which can be used to fine-tune smaller models. Code is available at
+https://github.com/Lightblues/AgentRE.
+
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Towards Generative Class Prompt Learning for Few-shot Visual Recognition BMVC 2024 + + +
+ Although foundational vision-language models (VLMs) have proven to be very
+successful for various semantic discrimination tasks, they still struggle to
+perform faithfully for fine-grained categorization. Moreover, foundational
+models trained on one domain do not generalize well to a different domain
+without fine-tuning. We attribute these issues to the limitations of the VLM's
+semantic representations and attempt to improve their fine-grained visual
+awareness using generative modeling. Specifically, we propose two novel
+methods: Generative Class Prompt Learning (GCPL) and Contrastive Multi-class
+Prompt Learning (CoMPLe). Utilizing text-to-image diffusion models, GCPL
+significantly improves the visio-linguistic synergy in class embeddings by
+conditioning on few-shot exemplars with learnable class prompts. CoMPLe builds
+on this foundation by introducing a contrastive learning component that
+encourages inter-class separation during the generative optimization process.
+Our empirical results demonstrate that such a generative class prompt learning
+approach substantially outperforms existing methods, offering a better
+alternative for few-shot image recognition. The source code will be made
+available at: https://github.com/soumitri2001/GCPL.
+
+
+ comment: Accepted at BMVC 2024 +
+
+
+
+
+ + ☆ Dialogue You Can Trust: Human and AI Perspectives on Generated + Conversations ALT + + +
+ As dialogue systems and chatbots increasingly integrate into everyday +interactions, the need for efficient and accurate evaluation methods becomes +paramount. This study explores the comparative performance of human and AI +assessments across a range of dialogue scenarios, focusing on seven key +performance indicators (KPIs): Coherence, Innovation, Concreteness, Goal +Contribution, Commonsense Contradiction, Incorrect Fact, and Redundancy. +Utilizing the GPT-4o API, we generated a diverse dataset of conversations and +conducted a two-part experimental analysis. In Experiment 1, we evaluated +multi-party conversations on Coherence, Innovation, Concreteness, and Goal +Contribution, revealing that GPT models align closely with human judgments. +Notably, both human and AI evaluators exhibited a tendency towards binary +judgment rather than linear scaling, highlighting a shared challenge in these +assessments. Experiment 2 extended the work of Finch et al. (2023) by focusing +on dyadic dialogues and assessing Commonsense Contradiction, Incorrect Fact, +and Redundancy. The results indicate that while GPT-4o demonstrates strong +performance in maintaining factual accuracy and commonsense reasoning, it still +struggles with reducing redundancy and self-contradiction. Our findings +underscore the potential of GPT models to closely replicate human evaluation in +dialogue systems, while also pointing to areas for improvement. This research +offers valuable insights for advancing the development and implementation of +more refined dialogue evaluation methodologies, contributing to the evolution +of more effective and human-like AI communication tools. + +
+
+ comment: 17 pages, 15 figures, shorter version submitted to 22nd Annual + Workshop of the Australasian Language Technology Association (ALTA'24) +
+
+
+
+
+ + ☆ LASP: Surveying the State-of-the-Art in Large Language Model-Assisted AI + Planning + + +
+ Effective planning is essential for the success of any task, from organizing +a vacation to routing autonomous vehicles and developing corporate strategies. +It involves setting goals, formulating plans, and allocating resources to +achieve them. LLMs are particularly well-suited for automated planning due to +their strong capabilities in commonsense reasoning. They can deduce a sequence +of actions needed to achieve a goal from a given state and identify an +effective course of action. However, it is frequently observed that plans +generated through direct prompting often fail upon execution. Our survey aims +to highlight the existing challenges in planning with language models, focusing +on key areas such as embodied environments, optimal scheduling, competitive and +cooperative games, task decomposition, reasoning, and planning. Through this +study, we explore how LLMs transform AI planning and provide unique insights +into the future of LM-assisted planning. + +
+
+
+
+
+ + ☆ Training on the Benchmark Is Not All You Need + + +
+ The success of Large Language Models (LLMs) relies heavily on the vast amount
+of data learned during the pre-training phase. The opacity of the pre-training
+process and the training data causes the results of many benchmark tests to
+become unreliable. If any model has been trained on a benchmark test set, it
+can seriously hinder the health of the field. In order to automatically and
+efficiently test the capabilities of large language models, numerous mainstream
+benchmarks adopt a multiple-choice format. Since swapping the contents of
+multiple-choice options does not affect the meaning of the question itself, we
+propose a simple and effective data leakage detection method based on this
+property. Specifically, we shuffle the contents of the options in the data to
+generate the corresponding derived datasets, and then detect data leakage based
+on the model's log probability distribution over the derived datasets. If one
+ordering's log probability is both the maximum and an outlier within this set,
+it indicates that the data has been leaked. Our method is able to work under
+black-box conditions without access to model training data or weights,
+effectively identifying data leakage from benchmark test sets in model
+pre-training data, including both normal scenarios and complex scenarios where
+options may have been shuffled intentionally or unintentionally. Through
+experiments with two LLMs and several benchmark designs, we demonstrate the
+effectiveness of our method. In addition, we evaluate the degree of data
+leakage of 31 mainstream open-source LLMs on four benchmark datasets, give a
+ranking of the leaked LLMs for each benchmark, and find that the Qwen family of
+LLMs has the highest degree of data leakage.
+
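+ The detection rule can be stated compactly: score the original question and
+several option-shuffled variants with the model, and flag leakage when the
+original ordering's log probability is both the maximum and an outlier. The
+sketch below uses a plug-in `scorer` callable and a simple z-score as the
+outlier test, which is a simplification of the paper's criterion.
+
+    import itertools
+    import numpy as np
+
+    def detect_leakage(scorer, question, options, z_threshold=2.0, max_perms=24):
+        """Flag a multiple-choice item whose original option order is an outlier.
+
+        `scorer(text)` should return a model's log probability of `text`;
+        any LLM scoring function can be plugged in here.
+        """
+        perms = list(itertools.permutations(options))[:max_perms]
+        texts = [
+            question + "\n" + "\n".join(
+                f"{chr(65 + i)}. {o}" for i, o in enumerate(p)
+            )
+            for p in perms
+        ]
+        scores = np.array([scorer(t) for t in texts])
+        original = perms.index(tuple(options))
+        others = np.delete(scores, original)
+        z = (scores[original] - others.mean()) / (others.std() + 1e-8)
+        return bool(scores[original] == scores.max() and z > z_threshold)
+
+    # Toy demonstration with a fake scorer that "prefers" the original ordering.
+    options = ["Paris", "Rome", "Berlin", "Madrid"]
+    fake_scorer = lambda t: 0.0 if "A. Paris\nB. Rome\nC. Berlin" in t else -20.0
+    print(detect_leakage(fake_scorer, "Q: What is the capital of France?", options))
+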
+
+
+
+
+ + ☆ LLM-GAN: Construct Generative Adversarial Network Through Large Language + Models For Explainable Fake News Detection + + +
+ Explainable fake news detection predicts the authenticity of news items with
+annotated explanations. Today, Large Language Models (LLMs) are known for their
+powerful natural language understanding and explanation generation abilities.
+However, applying LLMs to explainable fake news detection still faces two main
+challenges. Firstly, fake news appears reasonable and could easily mislead
+LLMs, leaving them unable to understand the complex news-faking process.
+Secondly, utilizing LLMs for this task would generate both correct and
+incorrect explanations, which necessitates abundant human labor in the loop. In
+this paper, we propose LLM-GAN, a novel framework that utilizes prompting
+mechanisms to enable an LLM to act as both Generator and Detector for realistic
+fake news generation and detection. Our results demonstrate LLM-GAN's
+effectiveness in both prediction performance and explanation quality. We
+further showcase the integration of LLM-GAN into a cloud-native AI platform to
+provide a better fake news detection service in the cloud.
+
+
+
+
+
+ + ☆ State-of-the-art Advances of Deep-learning Linguistic Steganalysis + Research + + +
+ With the evolution of generative linguistic steganography techniques,
+conventional steganalysis falls short in robustly quantifying the alterations
+induced by steganography, thereby complicating detection. Consequently, the
+research paradigm has pivoted towards deep-learning-based linguistic
+steganalysis. This study offers a comprehensive review of existing
+contributions and evaluates prevailing developmental trajectories.
+Specifically, we first provide a formalized exposition of the general formulas
+for linguistic steganalysis, while comparing the differences between this field
+and the domain of text classification. Subsequently, we classify the existing
+work into two levels based on vector space mapping and feature extraction
+models, comparing the research motivations, model advantages, and other
+details. A comparative analysis of the experiments is conducted to assess their
+performance. Finally, the challenges faced by this field are discussed, and
+several directions for future development and key issues that urgently need to
+be addressed are proposed.
+
&#13;
+
+ comment: Accepted by 2023 International Conference on Data, Information and + Computing Science +
+
+
+
+
+ + ☆ FC-KAN: Function Combinations in Kolmogorov-Arnold Networks + + +
+ In this paper, we introduce FC-KAN, a Kolmogorov-Arnold Network (KAN) that
+leverages combinations of popular mathematical functions such as B-splines,
+wavelets, and radial basis functions on low-dimensional data through
+element-wise operations. We explore several methods for combining the outputs
+of these functions, including sum, element-wise product, the addition of sum
+and element-wise product, quadratic function representation, and concatenation.
+In our experiments, we compare FC-KAN with a multi-layer perceptron (MLP) and
+other existing KANs, such as BSRBF-KAN, EfficientKAN, FastKAN, and FasterKAN,
+on the MNIST and Fashion-MNIST datasets. A variant of FC-KAN, which uses a
+combination of outputs from B-splines and Difference of Gaussians (DoG) in the
+form of a quadratic function, outperforms all other models on average over 5
+independent training runs. We expect that FC-KAN can leverage function
+combinations to inform the design of future KANs. Our repository is publicly
+available at: https://github.com/hoangthangta/FC_KAN.
+
&#13;
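+ As an illustration of the element-wise combinations listed above, here is a
+minimal PyTorch sketch; the exact quadratic form used by FC-KAN is an
+assumption, not taken from the paper.
+
+import torch
+
+def combine(a: torch.Tensor, b: torch.Tensor, mode: str = "sum_product") -> torch.Tensor:
+    # a, b: outputs of two basis functions (e.g., B-splines and DoG), same shape
+    if mode == "sum":
+        return a + b
+    if mode == "product":
+        return a * b
+    if mode == "sum_product":
+        return a + b + a * b
+    if mode == "quadratic":
+        return a + b + a * b + a ** 2 + b ** 2  # assumed quadratic combination
+    if mode == "concat":
+        return torch.cat([a, b], dim=-1)
+    raise ValueError(f"unknown mode: {mode}")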
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ☆ Empirical evidence of Large Language Model's influence on human spoken + communication + + +
+ Artificial Intelligence (AI) agents now interact with billions of humans in +natural language, thanks to advances in Large Language Models (LLMs) like +ChatGPT. This raises the question of whether AI has the potential to shape a +fundamental aspect of human culture: the way we speak. Recent analyses revealed +that scientific publications already exhibit evidence of AI-specific language. +But this evidence is inconclusive, since scientists may simply be using AI to +copy-edit their writing. To explore whether AI has influenced human spoken +communication, we transcribed and analyzed about 280,000 English-language +videos of presentations, talks, and speeches from more than 20,000 YouTube +channels of academic institutions. We find a significant shift in the trend of +word usage specific to words distinctively associated with ChatGPT following +its release. These findings provide the first empirical evidence that humans +increasingly imitate LLMs in their spoken language. Our results raise societal +and policy-relevant concerns about the potential of AI to unintentionally +reduce linguistic diversity, or to be deliberately misused for mass +manipulation. They also highlight the need for further investigation into the +feedback loops between machine behavior and human culture. + +
+
+
+
+
+ + ☆ Taming CLIP for Fine-grained and Structured Visual Understanding of + Museum Exhibits ECCV 2024 + + +
+ CLIP is a powerful and widely used tool for understanding images in the +context of natural language descriptions to perform nuanced tasks. However, it +does not offer application-specific fine-grained and structured understanding, +due to its generic nature. In this work, we aim to adapt CLIP for fine-grained +and structured -- in the form of tabular data -- visual understanding of museum +exhibits. To facilitate such understanding we (a) collect, curate, and +benchmark a dataset of 200K+ image-table pairs, and (b) develop a method that +allows predicting tabular outputs for input images. Our dataset is the first of +its kind in the public domain. At the same time, the proposed method is novel +in leveraging CLIP's powerful representations for fine-grained and tabular +understanding. The proposed method (MUZE) learns to map CLIP's image embeddings +to the tabular structure by means of a proposed transformer-based parsing +network (parseNet). More specifically, parseNet enables prediction of missing +attribute values while integrating context from known attribute-value pairs for +an input image. We show that this leads to significant improvement in accuracy. +Through exhaustive experiments, we show the effectiveness of the proposed +method on fine-grained and structured understanding of museum exhibits, by +achieving encouraging results in a newly established benchmark. Our dataset and +source-code can be found at: https://github.com/insait-institute/MUZE + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ In Defense of RAG in the Era of Long-Context Language Models + + +
+ Overcoming the limited context windows of early-generation LLMs,
+retrieval-augmented generation (RAG) has been a reliable solution for
+context-based answer generation in the past. Recently, the emergence of
+long-context LLMs allows the models to incorporate much longer text sequences,
+making RAG less attractive. Recent studies show that long-context LLMs
+significantly outperform RAG in long-context applications. Unlike the existing
+works favoring the long-context LLM over RAG, we argue that the extremely long
+context in LLMs suffers from a diminished focus on relevant information and
+leads to potential degradation in answer quality. This paper revisits RAG for
+long-context answer generation. We propose an order-preserve
+retrieval-augmented generation (OP-RAG) mechanism, which significantly improves
+the performance of RAG for long-context question-answer applications. With
+OP-RAG, as the number of retrieved chunks increases, the answer quality
+initially rises and then declines, forming an inverted U-shaped curve. There
+exist sweet spots where OP-RAG achieves higher answer quality with far fewer
+tokens than a long-context LLM taking the whole context as input. Extensive
+experiments on public benchmarks demonstrate the superiority of our OP-RAG.
+
&#13;
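+ A minimal sketch of the order-preserving idea described above (interfaces are
+assumptions): chunks are selected by relevance score, but concatenated in their
+original document order rather than in relevance order.
+
+def op_rag_context(chunks: list[str], scores: list[float], k: int) -> str:
+    # chunks: document chunks in their original order; scores: per-chunk relevance
+    top = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)[:k]
+    # Preserve the original order of the selected chunks when building the context.
+    return "\n\n".join(chunks[i] for i in sorted(top))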
+
+
+
+
+ + ☆ Interpreting and Improving Large Language Models in Arithmetic + Calculation ICML 2024 + + +
+ Large language models (LLMs) have demonstrated remarkable potential across
+numerous applications and have shown an emergent ability to tackle complex
+reasoning tasks, such as mathematical computations. However, even for the
+simplest arithmetic calculations, the intrinsic mechanisms behind LLMs remain
+mysterious, making it challenging to ensure reliability. In this work, we delve
+into uncovering a specific mechanism by which LLMs execute calculations.
+Through comprehensive experiments, we find that LLMs rely on only a small
+fraction (< 5%) of attention heads, which play a pivotal role in focusing on
+operands and operators during calculation. Subsequently, the information from
+these operands is processed through multi-layer perceptrons (MLPs),
+progressively leading to the final solution. These pivotal heads/MLPs, though
+identified on a specific dataset, exhibit transferability across different
+datasets and even distinct tasks. This insight prompted us to investigate the
+potential benefits of selectively fine-tuning these essential heads/MLPs to
+boost the LLMs' computational performance. We empirically find that such
+precise tuning can yield notable enhancements in mathematical prowess, without
+compromising performance on non-mathematical tasks. Our work serves as a
+preliminary exploration of the arithmetic calculation abilities inherent in
+LLMs, laying a solid foundation for unraveling more intricate mathematical
+tasks.
+
&#13;
+
+ comment: Accepted by ICML 2024 (oral) +
+
+
+
+
+ + ☆ From Yes-Men to Truth-Tellers: Addressing Sycophancy in Large Language + Models with Pinpoint Tuning ICML 2024 + + +
+ Large Language Models (LLMs) tend to prioritize adherence to user prompts
+over providing veracious responses, leading to the sycophancy issue. When
+challenged by users, LLMs tend to admit mistakes and provide inaccurate
+responses even if they initially provided the correct answer. Recent works
+propose to employ supervised fine-tuning (SFT) to mitigate the sycophancy
+issue, but it typically leads to the degeneration of LLMs' general capability.
+To address the challenge, we propose a novel supervised pinpoint tuning (SPT),
+where the region-of-interest modules are tuned for a given objective.
+Specifically, SPT first reveals and verifies a small percentage (<5%) of the
+basic modules which significantly affect a particular behavior of LLMs, i.e.,
+sycophancy. Subsequently, SPT merely fine-tunes these identified modules while
+freezing the rest. To verify the effectiveness of the proposed SPT, we conduct
+comprehensive experiments, demonstrating that SPT significantly mitigates the
+sycophancy issue of LLMs (even better than SFT). Moreover, SPT introduces
+limited or even no side effects on the general capability of LLMs. Our results
+shed light on how to precisely, effectively, and efficiently explain and
+improve the targeted ability of LLMs.
+
&#13;
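+ The "fine-tune the identified modules, freeze the rest" step lends itself to a
+short PyTorch sketch; how the relevant modules are actually identified is not
+shown, and the name-matching interface is an assumption.
+
+def pinpoint_tune_setup(model, target_module_names):
+    # Enable gradients only for parameters belonging to the identified modules.
+    for name, param in model.named_parameters():
+        param.requires_grad = any(t in name for t in target_module_names)
+    # Return only the trainable parameters, e.g., to hand to an optimizer.
+    return [p for p in model.parameters() if p.requires_grad]
+
+# Example (hypothetical module names):
+# trainable = pinpoint_tune_setup(model, ["layers.12.self_attn", "layers.17.mlp"])
+# optimizer = torch.optim.AdamW(trainable, lr=1e-5)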
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
+ + ☆ CTG-KrEW: Generating Synthetic Structured Contextually Correlated + Content by Conditional Tabular GAN with K-Means Clustering and Efficient Word + Embedding + + +
+ Conditional Tabular Generative Adversarial Networks (CTGAN) and their various
+derivatives are attractive for their ability to efficiently and flexibly create
+synthetic tabular data, showcasing strong performance and adaptability.
+However, there are certain critical limitations to such models. The first is
+their inability to preserve the semantic integrity of contextually correlated
+words or phrases. For instance, the skillset in freelancer profiles is one such
+attribute where individual skills are semantically interconnected and
+indicative of specific domain interests or qualifications. The second challenge
+of traditional approaches is that, when applied to generate contextually
+correlated tabular content, besides generating semantically shallow content,
+they consume huge memory resources and CPU time during the training stage. To
+address these problems, we introduce a novel framework, CTGKrEW (Conditional
+Tabular GAN with KMeans Clustering and Word Embedding), which is adept at
+generating realistic synthetic tabular data where attributes are collections of
+semantically and contextually coherent words. CTGKrEW is trained and evaluated
+using a dataset from Upwork, a real-world freelancing platform. Comprehensive
+experiments were conducted to analyze the variability, contextual similarity,
+frequency distribution, and associativity of the generated data, along with
+testing the framework's system feasibility. CTGKrEW also requires around 99%
+less CPU time and a 33% smaller memory footprint than the conventional
+approach. Furthermore, we developed KrEW, a web application to facilitate the
+generation of realistic data containing skill-related information. This
+application, available at https://riyasamanta.github.io/krew.html, is freely
+accessible to both the general public and the research community.
+
&#13;
+
+
+
+
+
+ ☆ Booster: Tackling Harmful Fine-tuning for Large Language Models via
+ Attenuating Harmful Perturbation
+
+
&#13;
+ The harmful fine-tuning issue (Qi et al., 2023) poses serious safety concerns
+for large language models' fine-tuning-as-a-service. While existing defenses
+(Huang et al., 2024; Rosati et al., 2024) have been proposed to mitigate the
+issue, their performance is still far from satisfactory, and the root cause of
+the problem has not been fully uncovered. For the first time in the literature,
+we show in this paper that harmful perturbation of the model weights is the
+root cause of the broken alignment that results from harmful fine-tuning. To
+attenuate the negative impact of harmful perturbation, we propose an
+alignment-stage solution, dubbed Booster. Technically, along with the original
+alignment loss, we append a loss regularizer to the alignment stage's
+optimization. The regularizer ensures that the reduction of the model's harmful
+loss before/after a simulated harmful perturbation is attenuated, thereby
+mitigating the subsequent fine-tuning risk. Empirical results show that Booster
+can effectively reduce the harmful score of the fine-tuned models while
+maintaining the performance of downstream tasks. Our code is available at
+https://github.com/git-disl/Booster.
+
&#13;
+
+
+
+
+ + ☆ Towards Cross-Lingual Explanation of Artwork in Large-scale Vision + Language Models + + +
+ As the performance of Large-scale Vision Language Models (LVLMs) improves,
+they are increasingly capable of responding in multiple languages, and there is
+an expectation that the demand for explanations generated by LVLMs will grow.
+However, the pre-training of the vision encoder and the integrated training of
+the LLM with the vision encoder are mainly conducted using English training
+data, leaving it uncertain whether LVLMs can fully realize their potential when
+generating explanations in languages other than English. In addition,
+multilingual QA benchmarks that create datasets using machine translation carry
+cultural differences and biases, which remain issues for their use as
+evaluation tasks. To address these challenges, this study created an extended
+dataset in multiple languages without relying on machine translation. This
+dataset, which takes into account nuances and country-specific phrases, was
+then used to evaluate the explanation generation abilities of LVLMs.
+Furthermore, this study examined whether Instruction-Tuning in resource-rich
+English improves performance in other languages. Our findings indicate that
+LVLMs perform worse in languages other than English compared to English. In
+addition, it was observed that LVLMs struggle to effectively manage the
+knowledge learned from English data.
+
&#13;
+
+
+
+
+ + ☆ AdaComp: Extractive Context Compression with Adaptive Predictor for + Retrieval-Augmented Large Language Models + + +
+ Retrieved documents containing noise will hinder RAG from detecting answer
+clues and make the inference process slow and expensive. Therefore, context
+compression is necessary to enhance its accuracy and efficiency. Existing
+context compression methods use extractive or generative models to retain the
+most query-relevant sentences or apply the information bottleneck theory to
+preserve sufficient information. However, these methods may face issues such as
+over-compression or high computational costs. We observe that the retriever
+often ranks relevant documents at the top, but the exact number of documents
+needed to answer the query is uncertain due to the impact of query complexity
+and retrieval quality: complex queries like multi-hop questions may require
+retaining more documents than simpler queries, and a low-quality retrieval may
+need to rely on more documents to generate accurate outputs. Therefore,
+determining the minimum number of required documents (compression rate) is
+still a challenge for RAG. In this paper, we introduce AdaComp, a low-cost
+extractive context compression method that adaptively determines the
+compression rate based on both query complexity and retrieval quality.
+Specifically, we first annotate the minimum top-k documents necessary for the
+RAG system to answer the current query as the compression rate and then
+construct triplets of the query, retrieved documents, and the corresponding
+compression rate. Then, we use this triplet dataset to train a compression-rate
+predictor. Experiments on three QA datasets and one conversational multi-doc QA
+dataset show that AdaComp significantly reduces inference costs while
+maintaining performance nearly identical to uncompressed models, achieving a
+balance between efficiency and performance.
+
&#13;
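+ At inference time, the adaptive step above can be sketched in a few lines
+(the predictor interface and clamping are assumptions):
+
+def adaptive_compress(query: str, ranked_docs: list[str], predictor) -> list[str]:
+    # predictor maps (query, retriever-ranked docs) -> estimated number of docs needed
+    k = predictor.predict(query, ranked_docs)
+    k = max(1, min(int(k), len(ranked_docs)))  # keep the estimate in a valid range
+    return ranked_docs[:k]  # retain only the top-k retriever-ranked documents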
+
+ comment: 8 pages, 5 figures, code available at + https://anonymous.4open.science/r/AdaComp-8C0C/ +
+
+
+
+
+ + ☆ An Implementation of Werewolf Agent That does not Truly Trust LLMs + + +
+ Werewolf is an incomplete-information game that poses several challenges for
+building a computer agent as a player, given the agent's limited grasp of the
+situation and the individuality of utterances (e.g., computer agents are not
+capable of characterful utterances or situational lying). We propose a werewolf
+agent that solves some of those difficulties by combining a Large Language
+Model (LLM) and a rule-based algorithm. In particular, our agent uses a
+rule-based algorithm to select an output either from an LLM or a template
+prepared beforehand, based on the results of analyzing the conversation history
+using an LLM. This allows the agent to refute in specific situations, identify
+when to end the conversation, and behave with a persona. As a result, this
+approach mitigated conversational inconsistencies and facilitated logical
+utterances. We also conducted a qualitative evaluation, which resulted in our
+agent being perceived as more human-like compared to an unmodified LLM. The
+agent is freely available to help advance research on the Werewolf game.
+
&#13;
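+ A minimal sketch of the routing idea described above; every interface here
+(analyze, templates, the situation fields) is a hypothetical stand-in, not the
+authors' implementation.
+
+def respond(history: list[str], llm, templates: dict, analyze) -> str:
+    # analyze() asks the LLM to summarize the situation from the conversation
+    # history, e.g., {"accused": True, "phase": "day"}.
+    situation = analyze(llm, history)
+    if situation.get("accused"):
+        # Deterministic, persona-consistent refutation from a prepared template.
+        return templates["refute"].format(**situation)
+    if situation.get("phase") == "vote":
+        return templates["vote"].format(**situation)
+    # Otherwise fall back to free-form LLM generation.
+    return llm.generate(history)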
+
+
+
+
+ + ☆ Benchmarking Cognitive Domains for LLMs: Insights from Taiwanese Hakka + Culture + + +
+ This study introduces a comprehensive benchmark designed to evaluate the +performance of large language models (LLMs) in understanding and processing +cultural knowledge, with a specific focus on Hakka culture as a case study. +Leveraging Bloom's Taxonomy, the study develops a multi-dimensional framework +that systematically assesses LLMs across six cognitive domains: Remembering, +Understanding, Applying, Analyzing, Evaluating, and Creating. This benchmark +extends beyond traditional single-dimensional evaluations by providing a deeper +analysis of LLMs' abilities to handle culturally specific content, ranging from +basic recall of facts to higher-order cognitive tasks such as creative +synthesis. Additionally, the study integrates Retrieval-Augmented Generation +(RAG) technology to address the challenges of minority cultural knowledge +representation in LLMs, demonstrating how RAG enhances the models' performance +by dynamically incorporating relevant external information. The results +highlight the effectiveness of RAG in improving accuracy across all cognitive +domains, particularly in tasks requiring precise retrieval and application of +cultural knowledge. However, the findings also reveal the limitations of RAG in +creative tasks, underscoring the need for further optimization. This benchmark +provides a robust tool for evaluating and comparing LLMs in culturally diverse +contexts, offering valuable insights for future research and development in +AI-driven cultural knowledge preservation and dissemination. + +
+
+ comment: Submitted to O-COCOSDA 2024 +
+
+
+
+
+ + ☆ Self-Instructed Derived Prompt Generation Meets In-Context Learning: + Unlocking New Potential of Black-Box LLMs + + +
+ Large language models (LLMs) have shown success in generating high-quality
+responses. To better align LLMs with human preferences, various works have been
+proposed that rely on specific optimization processes, which, however, are not
+applicable to black-box LLMs like GPT-4, whose parameters are inaccessible. For
+black-box LLMs, performance is highly dependent on the quality of the provided
+prompts. Existing methods to enhance response quality often involve a prompt
+refinement model, yet these approaches potentially suffer from semantic
+inconsistencies between the refined and original prompts, and typically
+overlook the relationship between them. To address these challenges, we
+introduce a self-instructed in-context learning framework that empowers LLMs to
+deliver more effective responses by generating reliable derived prompts to
+construct informative contextual environments. Our approach incorporates a
+self-instructed reinforcement learning mechanism, enabling direct interaction
+with the response model during derived prompt generation for better alignment.
+We then formulate querying as an in-context learning task, using responses from
+LLMs combined with the derived prompts to establish a contextual demonstration
+for the original prompt. This strategy ensures alignment with the original
+query, reduces discrepancies from refined prompts, and maximizes the LLMs'
+in-context learning capability. Extensive experiments demonstrate that the
+proposed method not only generates more reliable derived prompts but also
+significantly enhances LLMs' ability to deliver more effective responses,
+including black-box models such as GPT-4.
+
&#13;
+
+
+
+
+ + ☆ VoxHakka: A Dialectally Diverse Multi-speaker Text-to-Speech System for + Taiwanese Hakka + + +
+ This paper introduces VoxHakka, a text-to-speech (TTS) system designed for +Taiwanese Hakka, a critically under-resourced language spoken in Taiwan. +Leveraging the YourTTS framework, VoxHakka achieves high naturalness and +accuracy and low real-time factor in speech synthesis while supporting six +distinct Hakka dialects. This is achieved by training the model with +dialect-specific data, allowing for the generation of speaker-aware Hakka +speech. To address the scarcity of publicly available Hakka speech corpora, we +employed a cost-effective approach utilizing a web scraping pipeline coupled +with automatic speech recognition (ASR)-based data cleaning techniques. This +process ensured the acquisition of a high-quality, multi-speaker, multi-dialect +dataset suitable for TTS training. Subjective listening tests conducted using +comparative mean opinion scores (CMOS) demonstrate that VoxHakka significantly +outperforms existing publicly available Hakka TTS systems in terms of +pronunciation accuracy, tone correctness, and overall naturalness. This work +represents a significant advancement in Hakka language technology and provides +a valuable resource for language preservation and revitalization efforts. + +
+
+ comment: Submitted to O-COCOSDA 2024 +
+
+
+
+
+ + ☆ Effective Noise-aware Data Simulation for Domain-adaptive Speech + Enhancement Leveraging Dynamic Stochastic Perturbation + + +
+ Cross-domain speech enhancement (SE) is often faced with severe challenges +due to the scarcity of noise and background information in an unseen target +domain, leading to a mismatch between training and test conditions. This study +puts forward a novel data simulation method to address this issue, leveraging +noise-extractive techniques and generative adversarial networks (GANs) with +only limited target noisy speech data. Notably, our method employs a noise +encoder to extract noise embeddings from target-domain data. These embeddings +aptly guide the generator to synthesize utterances acoustically fitted to the +target domain while authentically preserving the phonetic content of the input +clean speech. Furthermore, we introduce the notion of dynamic stochastic +perturbation, which can inject controlled perturbations into the noise +embeddings during inference, thereby enabling the model to generalize well to +unseen noise conditions. Experiments on the VoiceBank-DEMAND benchmark dataset +demonstrate that our domain-adaptive SE method outperforms an existing strong +baseline based on data simulation. + +
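+ The dynamic stochastic perturbation step above amounts to adding controlled
+noise to the noise embedding at inference time; the Gaussian form and scale
+parameter below are assumptions for illustration.
+
+import torch
+
+def perturb_noise_embedding(noise_emb: torch.Tensor, scale: float = 0.1) -> torch.Tensor:
+    # Inject a controlled random perturbation so the generator sees slightly
+    # varied noise conditions, encouraging generalization to unseen noise.
+    return noise_emb + scale * torch.randn_like(noise_emb)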
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ It is Time to Develop an Auditing Framework to Promote Value Aware + Chatbots + + +
+ The launch of ChatGPT in November 2022 marked the beginning of a new era in
+AI, the availability of generative AI tools for everyone to use. ChatGPT and
+other similar chatbots boast a wide range of capabilities, from answering
+student homework questions to creating music and art. Given the large amounts
+of human data chatbots are built on, it is inevitable that they will inherit
+human errors and biases. These biases have the potential to inflict significant
+harm on, or increase inequity across, different subpopulations. Because
+chatbots do not have an inherent understanding of societal values, they may
+create new content that is contrary to established norms. Examples of
+concerning generated content include child pornography, inaccurate facts, and
+discriminatory posts. In this position paper, we argue that the speed of
+advancement of this technology requires us, as computer and data scientists, to
+mobilize and develop a values-based auditing framework containing a
+community-established standard set of measurements to monitor the health of
+different chatbots and LLMs. To support our argument, we use a simple audit
+template to share the results of basic audits we conduct that are focused on
+measuring potential bias in search engine style tasks, code generation, and
+story generation. We identify responses from GPT 3.5 and GPT 4 that are both
+consistent and inconsistent with values derived from existing law. While the
+findings come as no surprise, they do underscore the urgency of developing a
+robust auditing framework for openly sharing results in a consistent way so
+that mitigation strategies can be developed by the academic community,
+government agencies, and companies when our values are not being adhered to. We
+conclude this paper with recommendations for value-based strategies for
+improving the technologies.
+
&#13;
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2306.07500 +
+
+
+
+
+ + ☆ S$^3$c-Math: Spontaneous Step-level Self-correction Makes Large Language + Models Better Mathematical Reasoners + + +
+ Self-correction is a novel method that can stimulate the potential reasoning
+abilities of large language models (LLMs). It involves detecting and correcting
+errors during the inference process when LLMs solve reasoning problems.
+However, recent works do not regard self-correction as a spontaneous and
+intrinsic capability of LLMs. Instead, such correction is achieved through
+post-hoc generation, external knowledge introduction, multi-model
+collaboration, and similar techniques. In this paper, we propose a series of
+mathematical LLMs called S$^3$c-Math, which are able to perform Spontaneous
+Step-level Self-correction for Mathematical reasoning. This capability helps
+LLMs to recognize whether their ongoing inference tends to contain errors and
+simultaneously correct these errors to produce a more reliable response. We
+propose a method that employs a step-level sampling approach to construct
+step-wise self-correction data for achieving this ability. Additionally, we
+implement a training strategy that uses the constructed data to equip LLMs with
+spontaneous step-level self-correction capabilities. Our data and methods have
+been demonstrated to be effective across various foundation LLMs, consistently
+showing significant progress in evaluations on GSM8K, MATH, and other
+mathematical benchmarks. To the best of our knowledge, we are the first to
+introduce the spontaneous step-level self-correction ability of LLMs in
+mathematical reasoning.
+
&#13;
+
+
+
+
+ + ♻ ☆ Improving Rare Word Translation With Dictionaries and Attention Masking + + +
+ In machine translation, rare words continue to be a problem for the dominant +encoder-decoder architecture, especially in low-resource and out-of-domain +translation settings. Human translators solve this problem with monolingual or +bilingual dictionaries. In this paper, we propose appending definitions from a +bilingual dictionary to source sentences and using attention masking to link +together rare words with their definitions. We find that including definitions +for rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1. + +
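+ A minimal sketch of the data-side idea above: append bilingual-dictionary
+definitions for rare source words to the source sentence (the <sep> marker,
+the rare-word test, and the cap on definitions are assumptions; the
+attention-masking step that ties each rare word to its definition is not
+shown).
+
+def augment_source(tokens: list[str], dictionary: dict, train_vocab: set,
+                   max_defs: int = 3) -> str:
+    # Rare words: not seen (often enough) in training but covered by the dictionary.
+    rare = [t for t in tokens if t not in train_vocab and t in dictionary]
+    defs = [f"{t} : {dictionary[t]}" for t in rare[:max_defs]]
+    src = " ".join(tokens)
+    return src + (" <sep> " + " <sep> ".join(defs) if defs else "")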
+
+ comment: 11 pages, 3 figures, 3 tables. Accepted at AMTA 2024 +
+
+
+
+
+ + ♻ ☆ Low-Rank Quantization-Aware Training for LLMs + + +
+ Large language models (LLMs) are omnipresent; however, their practical
+deployment is challenging due to their ever-increasing computational and memory
+demands. Quantization is one of the most effective ways to make them more
+compute- and memory-efficient. Quantization-aware training (QAT) methods
+generally produce the best quantized performance, but this comes at the cost of
+potentially long training times and excessive memory usage, making QAT
+impractical to apply to LLMs. Inspired by the parameter-efficient fine-tuning
+(PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a
+lightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several
+components to save memory without sacrificing predictive performance: (a)
+low-rank auxiliary weights that are aware of the quantization grid; (b) a
+downcasting operator using fixed-point or double-packed integers and (c)
+checkpointing. Unlike most related work, our method (i) is inference-efficient,
+leading to no additional overhead compared to traditional PTQ; (ii) can be seen
+as a general extended pretraining framework, meaning that the resulting model
+can still be utilized for any downstream task afterwards; (iii) can be applied
+across a wide range of quantization settings, such as different choices of
+quantization granularity and activation quantization, and can be seamlessly
+combined with many PTQ techniques. We apply LR-QAT to the LLaMA-1/2/3 and
+Mistral model families and validate its effectiveness on several downstream
+tasks. Our method outperforms common post-training quantization (PTQ)
+approaches and reaches the same model performance as full-model QAT at a
+fraction of its memory usage. Specifically, we can train a 7B LLM on a single
+consumer-grade GPU with 24GB of memory. Our source code is available at
+https://github.com/qualcomm-ai-research/LR-QAT
+
&#13;
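+ One way to read "low-rank auxiliary weights that are aware of the quantization
+grid" is to place the low-rank term inside the quantizer; the sketch below is
+an assumed placement for illustration, not the paper's exact formulation.
+
+import torch
+
+def lr_qat_weight(w0: torch.Tensor, A: torch.Tensor, B: torch.Tensor,
+                  scale: float, n_bits: int = 4) -> torch.Tensor:
+    # w0: frozen pretrained weight; A @ B: trainable low-rank auxiliary term
+    qmin, qmax = -(2 ** (n_bits - 1)), 2 ** (n_bits - 1) - 1
+    x = w0 / scale + A @ B                    # low-rank term lives on the integer grid
+    x = x + (x.round() - x).detach()          # straight-through estimator for rounding
+    return scale * x.clamp(qmin, qmax)        # (de)quantized weight for the forward pass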
+
+
+
+
+ + ♻ ☆ Foundation Models for Music: A Survey + + +
+ In recent years, foundation models (FMs) such as large language models (LLMs) +and latent diffusion models (LDMs) have profoundly impacted diverse sectors, +including music. This comprehensive review examines state-of-the-art (SOTA) +pre-trained models and foundation models in music, spanning from representation +learning, generative learning and multimodal learning. We first contextualise +the significance of music in various industries and trace the evolution of AI +in music. By delineating the modalities targeted by foundation models, we +discover many of the music representations are underexplored in FM development. +Then, emphasis is placed on the lack of versatility of previous methods on +diverse music applications, along with the potential of FMs in music +understanding, generation and medical application. By comprehensively exploring +the details of the model pre-training paradigm, architectural choices, +tokenisation, finetuning methodologies and controllability, we emphasise the +important topics that should have been well explored, like instruction tuning +and in-context learning, scaling law and emergent ability, as well as +long-sequence modelling etc. A dedicated section presents insights into music +agents, accompanied by a thorough analysis of datasets and evaluations +essential for pre-training and downstream tasks. Finally, by underscoring the +vital importance of ethical considerations, we advocate that following research +on FM for music should focus more on such issues as interpretability, +transparency, human responsibility, and copyright issues. The paper offers +insights into future challenges and trends on FMs for music, aiming to shape +the trajectory of human-AI collaboration in the music realm. + +
+
+
+
+
+ + ♻ ☆ InkubaLM: A small language model for low-resource African languages + + +
+ High-resource language models often fall short in the African context, where +there is a critical need for models that are efficient, accessible, and locally +relevant, even amidst significant computing and data constraints. This paper +introduces InkubaLM, a small language model with 0.4 billion parameters, which +achieves performance comparable to models with significantly larger parameter +counts and more extensive training data on tasks such as machine translation, +question-answering, AfriMMLU, and the AfriXnli task. Notably, InkubaLM +outperforms many larger models in sentiment analysis and demonstrates +remarkable consistency across multiple languages. This work represents a +pivotal advancement in challenging the conventional paradigm that effective +language models must rely on substantial resources. Our model and datasets are +publicly available at https://huggingface.co/lelapa to encourage research and +development on low-resource languages. + +
+
+
+
+
+ + ♻ ☆ OceanGPT: A Large Language Model for Ocean Science Tasks ACL2024 + + +
+ Ocean science, which delves into the oceans that are reservoirs of life and
+biodiversity, is of great significance given that oceans cover over 70% of our
+planet's surface. Recently, advances in Large Language Models (LLMs) have
+transformed the paradigm in science. Despite the success in other domains,
+current LLMs often fall short in catering to the needs of domain experts like
+oceanographers, and the potential of LLMs for ocean science is under-explored.
+The intrinsic reasons are the immense and intricate nature of ocean data as
+well as the necessity for higher granularity and richness in knowledge. To
+alleviate these issues, we introduce OceanGPT, the first-ever large language
+model in the ocean domain, which is an expert in various ocean science tasks.
+We also propose a novel framework to automatically obtain a large volume of
+ocean-domain instruction data, which generates instructions based on
+multi-agent collaboration. Additionally, we construct the first oceanography
+benchmark, OceanBench, to evaluate the capabilities of LLMs in the ocean
+domain. Through comprehensive experiments, OceanGPT not only shows a higher
+level of knowledge expertise for ocean science tasks but also gains preliminary
+embodied intelligence capabilities in ocean technology.
+
&#13;
+
+ comment: ACL2024. Project Website: http://oceangpt.zjukg.cn/ +
+
+
+
+
+ + ♻ ☆ A Survey on Stability of Learning with Limited Labelled Data and its + Sensitivity to the Effects of Randomness + + +
+ Learning with limited labelled data, such as prompting, in-context learning, +fine-tuning, meta-learning or few-shot learning, aims to effectively train a +model using only a small amount of labelled samples. However, these approaches +have been observed to be excessively sensitive to the effects of uncontrolled +randomness caused by non-determinism in the training process. The randomness +negatively affects the stability of the models, leading to large variances in +results across training runs. When such sensitivity is disregarded, it can +unintentionally, but unfortunately also intentionally, create an imaginary +perception of research progress. Recently, this area started to attract +research attention and the number of relevant studies is continuously growing. +In this survey, we provide a comprehensive overview of 415 papers addressing +the effects of randomness on the stability of learning with limited labelled +data. We distinguish between four main tasks addressed in the papers +(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness +effects), providing findings for each one. Furthermore, we identify and discuss +seven challenges and open problems together with possible directions to +facilitate further research. The ultimate goal of this survey is to emphasise +the importance of this growing research area, which so far has not received an +appropriate level of attention, and reveal impactful directions for future +research. + +
+
+ comment: Accepted to ACM Comput. Surv. 2024 +
+
+
+
+
+ + ♻ ☆ Towards Scalable Automated Alignment of LLMs: A Survey + + +
+ Alignment is the most critical step in building large language models (LLMs) +that meet human needs. With the rapid development of LLMs gradually surpassing +human capabilities, traditional alignment methods based on human-annotation are +increasingly unable to meet the scalability demands. Therefore, there is an +urgent need to explore new sources of automated alignment signals and technical +approaches. In this paper, we systematically review the recently emerging +methods of automated alignment, attempting to explore how to achieve effective, +scalable, automated alignment once the capabilities of LLMs exceed those of +humans. Specifically, we categorize existing automated alignment methods into 4 +major categories based on the sources of alignment signals and discuss the +current status and potential development of each category. Additionally, we +explore the underlying mechanisms that enable automated alignment and discuss +the essential factors that make automated alignment technologies feasible and +effective from the fundamental role of alignment. + +
+
+ comment: Paper List: https://github.com/cascip/awesome-auto-alignment +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes and formatting improvements. V3: improvements from + journal revisions +
+
+
+
+
+ + ♻ ☆ A Fundamental Trade-off in Aligned Language Models and its Relation to + Sampling Adaptors + + +
+ The relationship between the quality of a string, as judged by a human
+reader, and its probability, $p(\boldsymbol{y})$, under a language model
+undergirds the development of better language models. For example, many popular
+algorithms for sampling from a language model have been conceived with the goal
+of manipulating $p(\boldsymbol{y})$ to place higher probability on strings that
+humans deem of high quality. In this article, we examine the
+probability--quality relationship in language models explicitly aligned to
+human preferences, e.g., through reinforcement learning from human feedback. We
+show that, when sampling corpora from an aligned language model, there exists a
+trade-off between the strings' average reward and average log-likelihood under
+the prior language model, i.e., the same model before alignment with human
+preferences. We provide a formal treatment of this phenomenon and demonstrate
+how a choice of sampling adaptor allows for a selection of how much likelihood
+we exchange for the reward.
+
&#13;
+
+
+
+
+ + ♻ ☆ Correcting misinformation on social media with a large language model + + +
+ Real-world misinformation, often multimodal, can be partially or fully +factual but misleading using diverse tactics like conflating correlation with +causation. Such misinformation is severely understudied, challenging to +address, and harms various social domains, particularly on social media, where +it can spread rapidly. High-quality and timely correction of misinformation +that identifies and explains its (in)accuracies effectively reduces false +beliefs. Despite the wide acceptance of manual correction, it is difficult to +be timely and scalable. While LLMs have versatile capabilities that could +accelerate misinformation correction, they struggle due to a lack of recent +information, a tendency to produce false content, and limitations in addressing +multimodal information. We propose MUSE, an LLM augmented with access to and +credibility evaluation of up-to-date information. By retrieving evidence as +refutations or supporting context, MUSE identifies and explains content +(in)accuracies with references. It conducts multimodal retrieval and interprets +visual content to verify and correct multimodal content. Given the absence of a +comprehensive evaluation approach, we propose 13 dimensions of misinformation +correction quality. Then, fact-checking experts evaluate responses to social +media content that are not presupposed to be misinformation but broadly include +(partially) incorrect and correct posts that may (not) be misleading. Results +demonstrate MUSE's ability to write high-quality responses to potential +misinformation--across modalities, tactics, domains, political leanings, and +for information that has not previously been fact-checked online--within +minutes of its appearance on social media. Overall, MUSE outperforms GPT-4 by +37% and even high-quality responses from laypeople by 29%. Our work provides a +general methodological and evaluative framework to correct misinformation at +scale. + +
+
+ comment: 50 pages +
+
+
+
+
+ + ♻ ☆ NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment + + +
+ Aligning Large Language Models (LLMs) with human values and preferences is +essential for making them helpful and safe. However, building efficient tools +to perform alignment can be challenging, especially for the largest and most +competent LLMs which often contain tens or hundreds of billions of parameters. +We create NeMo-Aligner, a toolkit for model alignment that can efficiently +scale to a thousand GPUs for training the largest open-source LLMs such as +Nemotron 4 340B and Llama 3.1 405B. NeMo-Aligner comes with highly optimized +and scalable implementations for major paradigms of model alignment such as: +Reinforcement Learning from Human Feedback (RLHF), Direct Preference +Optimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally, +our toolkit supports running most of the alignment techniques in a Parameter +Efficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for +extensibility, allowing support for other alignment techniques with minimal +effort. It is open-sourced with Apache 2.0 License and we invite community +contributions at https://github.com/NVIDIA/NeMo-Aligner + +
+
+ comment: 16 pages, 4 figures, Accepted to COLM 2024 +
+
+
+
+
+ + ♻ ☆ Squid: Long Context as a New Modality for Energy-Efficient On-Device + Language Models + + +
+ This paper presents Dolphin, a novel decoder-decoder architecture for +energy-efficient processing of long contexts in language models. Our approach +addresses the significant energy consumption and latency challenges inherent in +on-device models. Dolphin employs a compact 0.5B parameter decoder to distill +extensive contextual information into a memory embedding, substantially +reducing the input length for the primary 7B parameter decoder model. Inspired +by vision-language models, we repurpose the image embedding projector to encode +long textual contexts, effectively treating extended context as a distinct +modality. This innovative method enables processing of substantially longer +contexts without the typical computational overhead associated with extended +input sequences. Empirical evaluations demonstrate a 10-fold improvement in +energy efficiency and a 5-fold reduction in latency compared to conventional +full-length context processing methods without losing quality of the response. +Our work contributes to the development of more sustainable and scalable +language models for on-device applications, addressing the critical need for +energy-efficient and responsive AI technologies in resource-constrained +environments while maintaining the accuracy to understand long contexts. This +research has implications for the broader field of natural language processing, +particularly in the domain of efficient model design for resource-limited +settings. By enabling more sophisticated AI capabilities on edge devices, +Dolphin paves the way for advanced language processing in a wide range of +applications where computational resources are at a premium. The Dolphin model +is publicly available at https://huggingface.co/NexaAIDev/Dolphin. + +
+
+
+
+
+ + ♻ ☆ OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step + + +
+ Despite significant advancements in text generation and reasoning, Large +Language Models (LLMs) still face challenges in accurately performing complex +arithmetic operations. Language model systems often enable LLMs to generate +code for arithmetic operations to achieve accurate calculations. However, this +approach compromises speed and security, and fine-tuning risks the language +model losing prior capabilities. We propose a framework that enables exact +arithmetic in a single autoregressive step, providing faster, more secure, and +more interpretable LLM systems with arithmetic capabilities. We use the hidden +states of a LLM to control a symbolic architecture that performs arithmetic. +Our implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama) +achieves 100\% accuracy on single arithmetic operations +($+,-,\times,\div,\sin{},\cos{},\log{},\exp{},\sqrt{}$), outperforming GPT 4o +with and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o +with and without a code interpreter on average across a range of mathematical +problem solving benchmarks, demonstrating that OccamLLMs can excel in +arithmetic tasks, even surpassing much larger models. We will make our code +public shortly. + +
+
+
+
+
+ + ♻ ☆ Flood of Techniques and Drought of Theories: Emotion Mining in Disasters + + +
+ Emotion mining has become a crucial tool for understanding human emotions +during disasters, leveraging the extensive data generated on social media +platforms. This paper aims to summarize existing research on emotion mining +within disaster contexts, highlighting both significant discoveries and +persistent issues. On the one hand, emotion mining techniques have achieved +acceptable accuracy enabling applications such as rapid damage assessment and +mental health surveillance. On the other hand, with many studies adopting +data-driven approaches, several methodological issues remain. These include +arbitrary emotion classification, ignoring biases inherent in data collection +from social media, such as the overrepresentation of individuals from higher +socioeconomic status on Twitter, and the lack of application of theoretical +frameworks like cross-cultural comparisons. These problems can be summarized as +a notable lack of theory-driven research and ignoring insights from social and +behavioral sciences. This paper underscores the need for interdisciplinary +collaboration between computer scientists and social scientists to develop more +robust and theoretically grounded approaches in emotion mining. By addressing +these gaps, we aim to enhance the effectiveness and reliability of emotion +mining methodologies, ultimately contributing to improved disaster +preparedness, response, and recovery. + Keywords: emotion mining, sentiment analysis, natural disasters, psychology, +technological disasters + +
+
+
+
+
+ + ♻ ☆ How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming + Ability in Multi-Agent Environments + + +
+ Decision-making, a complicated task requiring various types of abilities, +presents an excellent framework for assessing Large Language Models (LLMs). Our +research investigates decision-making capabilities of LLMs through the lens of +Game Theory. We focus specifically on games that support the simultaneous +participation of more than two agents. We introduce GAMA($\gamma$)-Bench, which +evaluates LLMs' Gaming Ability in Multi-Agent environments. $\gamma$-Bench +includes eight classical multi-agent games and a scoring scheme specially +designed to quantitatively assess LLMs' performance. Leveraging $\gamma$-Bench, +we investigate LLMs' robustness, generalizability, and strategies for +enhancement. Results reveal that while GPT-3.5 shows satisfying robustness, its +generalizability is relatively limited. However, its performance can be +improved through approaches such as Chain-of-Thought. Additionally, we evaluate +twelve versions from six models, including GPT-3.5, GPT-4, Gemini, LLaMA-3.1, +Mixtral, and Qwen-2. We find that Gemini-1.5-Pro outperforms other models with +a score of $63.8$ out of $100$, followed by LLaMA-3.1-70B and GPT-4 with scores +of $60.9$ and $60.5$, respectively. The code and experimental results are made +publicly available via https://github.com/CUHK-ARISE/GAMABench. + +
+
+ comment: 11 pages of main text. 20 pages of appendices. 12 figures, 9 tables. + Added models: Gemini-1.5-Pro, LLaMA-3.1-{7, 70, 405}B, Mixtral-8x{7, 22}B, + Qwen-2-72B +
+
+
+
+
+ + ♻ ☆ The Responsible Foundation Model Development Cheatsheet: A Review of + Tools & Resources + + +
+ Foundation model development attracts a rapidly expanding body of +contributors, scientists, and applications. To help shape responsible +development practices, we introduce the Foundation Model Development +Cheatsheet: a growing collection of 250+ tools and resources spanning text, +vision, and speech modalities. We draw on a large body of prior work to survey +resources (e.g. software, documentation, frameworks, guides, and practical +tools) that support informed data selection, processing, and understanding, +precise and limitation-aware artifact documentation, efficient model training, +advance awareness of the environmental impact from training, careful model +evaluation of capabilities, risks, and claims, as well as responsible model +release, licensing and deployment practices. We hope this curated collection of +resources helps guide more responsible development. The process of curating +this list, enabled us to review the AI development ecosystem, revealing what +tools are critically missing, misused, or over-used in existing practices. We +find that (i) tools for data sourcing, model evaluation, and monitoring are +critically under-serving ethical and real-world needs, (ii) evaluations for +model safety, capabilities, and environmental impact all lack reproducibility +and transparency, (iii) text and particularly English-centric analyses continue +to dominate over multilingual and multi-modal analyses, and (iv) evaluation of +systems, rather than just models, is needed so that capabilities and impact are +assessed in context. + +
+
+
+
+
+ + ♻ ☆ COFFEE: A Contrastive Oracle-Free Framework for Event Extraction ATC + + +
+ Event extraction is a complex information extraction task that involves +extracting events from unstructured text. Prior classification-based methods +require comprehensive entity annotations for joint training, while newer +generation-based methods rely on heuristic templates containing oracle +information such as event type, which is often unavailable in real-world +scenarios. In this study, we consider a more realistic setting of this task, +namely the Oracle-Free Event Extraction (OFEE) task, where only the input +context is given without any oracle information, including event type, event +ontology and trigger word. To solve this task, we propose a new framework, +called COFFEE, which extracts the events solely based on the document context +without referring to any oracle information. In particular, a contrastive +selection model is introduced in COFFEE to rectify the generated triggers and +handle multi-event instances. The proposed COFFEE outperforms state-of-the-art +approaches under the oracle-free setting of the event extraction task, as +evaluated on a public event extraction benchmark ACE05. + +
+
+ comment: Accepted to MATCHING Workshop at ACL 2023 +
+
+
+
+
+ + ♻ ☆ Persian Slang Text Conversion to Formal and Deep Learning of Persian + Short Texts on Social Media for Sentiment Classification + + +
+ The lack of a suitable tool for the analysis of conversational texts in the
+Persian language has made various analyses of these texts, including Sentiment
+Analysis, difficult. In this research, we try to make the understanding of
+these texts easier for the machine by providing PSC, the Persian Slang
+Converter, a tool for converting conversational texts into formal ones, and by
+using the most up-to-date and best deep learning methods along with PSC, the
+sentiment learning of short Persian-language texts is improved for the machine.
+More than 10 million unlabeled texts from various social networks and movie
+subtitles (as conversational texts) and about 10 million news texts (as formal
+texts) have been used for training unsupervised models and the formal
+implementation of the tool. 60,000 texts from the comments of Instagram social
+network users with positive, negative, and neutral labels are considered
+supervised data for training the emotion classification model of short texts.
+Using the formal tool, 57% of the words of the conversational corpus were
+converted. Finally, by using the formalizer, FastText model, and a deep LSTM
+network, an accuracy of 81.91 was obtained on the test data.
+
&#13;
+
+ comment: 16 pages, 4 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Preference Learning Algorithms Do Not Learn Preference Rankings + + +
+ Preference learning algorithms (e.g., RLHF and DPO) are frequently used to +steer LLMs to produce generations that are more preferred by humans, but our +understanding of their inner workings is still limited. In this work, we study +the conventional wisdom that preference learning trains models to assign higher +likelihoods to more preferred outputs than less preferred outputs, measured via +$\textit{ranking accuracy}$. Surprisingly, we find that most state-of-the-art +preference-tuned models achieve a ranking accuracy of less than 60% on common +preference datasets. We furthermore derive the $\textit{idealized ranking +accuracy}$ that a preference-tuned LLM would achieve if it optimized the DPO or +RLHF objective perfectly. We demonstrate that existing models exhibit a +significant $\textit{alignment gap}$ -- $\textit{i.e.}$, a gap between the +observed and idealized ranking accuracies. We attribute this discrepancy to the +DPO objective, which is empirically and theoretically ill-suited to fix even +mild ranking errors in the reference model, and derive a simple and efficient +formula for quantifying the difficulty of learning a given preference +datapoint. Finally, we demonstrate that ranking accuracy strongly correlates +with the empirically popular win rate metric when the model is close to the +reference model used in the objective, shedding further light on the +differences between on-policy (e.g., RLHF) and off-policy (e.g., DPO) +preference learning algorithms. + +
+
+
+
+
+ + ♻ ☆ A Voter-Based Stochastic Rejection-Method Framework for Asymptotically + Safe Language Model Outputs + + +
+ This paper proposes a new method for preventing unsafe or otherwise low +quality large language model (LLM) outputs, by leveraging the stochasticity of +LLMs. We propose a system whereby LLM checkers vote on the acceptability of a +generated output, regenerating it if a threshold of disapproval is reached, +until sufficient checkers approve. We further propose estimators for cost and +failure rate, and based on those estimators and experimental data tailored to +the application, we propose an algorithm that achieves a desired failure rate +at the least possible cost. We demonstrate that, under these models, failure +rate decreases exponentially as a function of cost when voter count and +threshold are chosen according to the algorithm, and that the models reasonably +estimate the actual performance of such a system in action, even with limited +data. + +
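A rough simulation of the voting-and-regeneration loop described above can make the cost/failure trade-off concrete; the checker flag probabilities, voter count, and threshold below are illustrative assumptions rather than the paper's fitted estimators.

    import random

    def passes_vote(is_safe, n_voters=5, threshold=2, p_flag_bad=0.8, p_flag_good=0.05):
        # each checker independently flags the output; too many disapprovals means reject
        p = p_flag_good if is_safe else p_flag_bad
        disapprovals = sum(random.random() < p for _ in range(n_voters))
        return disapprovals < threshold

    def generate_with_voting(sample_is_safe, n_voters=5, threshold=2, max_tries=10):
        calls = 0
        for _ in range(max_tries):
            is_safe = sample_is_safe()   # one LLM generation (True if it happens to be safe)
            calls += 1 + n_voters        # generation plus one call per checker
            if passes_vote(is_safe, n_voters, threshold):
                return is_safe, calls
        return None, calls

    # toy run: the base model produces a safe output 70% of the time
    print(generate_with_voting(lambda: random.random() < 0.7))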
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ ARN: Analogical Reasoning on Narratives + + +
+ As a core cognitive skill that enables the transferability of information across domains, analogical reasoning has been extensively studied for both humans and computational models. However, while cognitive theories of analogy often focus on narratives and study the distinction between surface, relational, and system similarities, existing work in natural language processing has a narrower focus, mostly on relational analogies between word pairs. This gap raises a natural question: can state-of-the-art large language models (LLMs) detect system analogies between narratives? To gain insight into this question and extend word-based relational analogies to relational system analogies, we devise a comprehensive computational framework that operationalizes dominant theories of analogy, using narrative elements to create surface and system mappings. Leveraging the interplay between these mappings, we create a binary task and benchmark for Analogical Reasoning on Narratives (ARN), covering four categories of far (cross-domain)/near (within-domain) analogies and disanalogies. We show that while all LLMs can largely recognize near analogies, even the largest ones struggle with far analogies in a zero-shot setting, with GPT4.0 scoring below random. Guiding the models through solved examples and chain-of-thought reasoning enhances their analogical reasoning ability. Yet, since even in the few-shot setting the best model only performs halfway between random and humans, ARN opens exciting directions for computational analogical reasoners.
+
+
+
+
+ + ♻ ☆ Zyda: A 1.3T Dataset for Open Language Modeling + + +
+ The size of large language models (LLMs) has scaled dramatically in recent +years and their computational and data requirements have surged +correspondingly. State-of-the-art language models, even at relatively smaller +sizes, typically require training on at least a trillion tokens. This rapid +advancement has eclipsed the growth of open-source datasets available for +large-scale LLM pretraining. In this paper, we introduce Zyda (Zyphra Dataset), +a dataset under a permissive license comprising 1.3 trillion tokens, assembled +by integrating several major respected open-source datasets into a single, +high-quality corpus. We apply rigorous filtering and deduplication processes, +both within and across datasets, to maintain and enhance the quality derived +from the original datasets. Our evaluations show that Zyda not only competes +favorably with other open datasets like Dolma, FineWeb, and RefinedWeb, but +also substantially improves the performance of comparable models from the +Pythia suite. Our rigorous data processing methods significantly enhance Zyda's +effectiveness, outperforming even the best of its constituent datasets when +used independently. + +
+
+
+
+
+ + ♻ ☆ Correction with Backtracking Reduces Hallucination in Summarization + + +
+ Abstractive summarization aims at generating natural language summaries of a +source document that are succinct while preserving the important elements. +Despite recent advances, neural text summarization models are known to be +susceptible to hallucinating (or more correctly confabulating), that is to +produce summaries with details that are not grounded in the source document. In +this paper, we introduce a simple yet efficient technique, CoBa, to reduce +hallucination in abstractive summarization. The approach is based on two steps: +hallucination detection and mitigation. We show that the former can be achieved +through measuring simple statistics about conditional word probabilities and +distance to context words. Further, we demonstrate that straight-forward +backtracking is surprisingly effective at mitigation. We thoroughly evaluate +the proposed method with prior art on three benchmark datasets for text +summarization. The results show that CoBa is effective and efficient in +reducing hallucination, and offers great adaptability and flexibility. Code can +be found at https://github.com/zhenzhel/CoBa. + +
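The detect-then-backtrack idea can be sketched generically as below; this is a simplified stand-in for CoBa (only a conditional-probability threshold is used for detection, and propose is an assumed helper returning candidate next tokens with their probabilities).

    def decode_with_backtracking(propose, max_len=50, min_prob=0.05, max_backtracks=20):
        tokens, backtracks = [], 0
        banned = {}  # position -> tokens already rejected at that position
        while len(tokens) < max_len:
            pos = len(tokens)
            candidates = [(t, p) for t, p in propose(tokens) if t not in banned.get(pos, set())]
            if not candidates:
                break
            token, prob = max(candidates, key=lambda tp: tp[1])
            if prob < min_prob and tokens and backtracks < max_backtracks:
                # low-confidence continuation: treat as a potential hallucination,
                # ban it at this position and back up one step before resampling
                banned.setdefault(pos, set()).add(token)
                tokens.pop()
                backtracks += 1
                continue
            tokens.append(token)
            if token == "<eos>":
                break
        return tokens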
+
+
+
+
+ + ♻ ☆ Investigating the Robustness of LLMs on Math Word Problems + + +
+ Large Language Models (LLMs) excel at various tasks, including solving math +word problems (MWPs), but struggle with real-world problems containing +irrelevant information. To address this, we propose a prompting framework that +generates adversarial variants of MWPs by adding irrelevant variables. We +introduce a dataset, ProbleMATHIC, containing both adversarial and +non-adversarial MWPs. Our experiments reveal that LLMs are susceptible to +distraction by numerical noise, resulting in an average relative performance +drop of ~26% on adversarial MWPs. To mitigate this, we fine-tune LLMs (Llama-2, +Mistral) on the adversarial samples from our dataset. Fine-tuning on +adversarial training instances improves performance on adversarial MWPs by ~8%, +indicating increased robustness to noise and better ability to identify +relevant data for reasoning. Finally, to assess the generalizability of our +prompting framework, we introduce GSM-8K-Adv, an adversarial variant of the +GSM-8K benchmark. LLMs continue to struggle when faced with adversarial +information, reducing performance by up to ~6%. + +
+
+
+
+
+ + ♻ ☆ A Survey on Responsible Generative AI: What to Generate and What Not + + +
+ In recent years, generative AI (GenAI), like large language models and text-to-image models, has received significant attention across various domains. However, ensuring the responsible generation of content by these models is crucial for their real-world applicability. This raises an interesting question: What should responsible GenAI generate, and what should it not? To answer the question, this paper investigates the practical responsible requirements of both textual and visual generative models, outlining five key considerations: generating truthful content, avoiding toxic content, refusing harmful instructions, not leaking training-data-related content, and ensuring that generated content is identifiable. Specifically, we review recent advancements and challenges in addressing these requirements. In addition, we discuss and emphasize the importance of responsible GenAI across the healthcare, education, finance, and artificial general intelligence domains. Through a unified perspective on both textual and visual generative models, this paper aims to provide insights into practical safety-related issues and further benefit the community in building responsible GenAI.
+
+ comment: 77 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ GPT has become financially literate: Insights from financial literacy + tests of GPT and a preliminary test of how people use it as a source of + advice + + +
+ We assess the ability of GPT -- a large language model -- to serve as a +financial robo-advisor for the masses, by using a financial literacy test. +Davinci and ChatGPT based on GPT-3.5 score 66% and 65% on the financial +literacy test, respectively, compared to a baseline of 33%. However, ChatGPT +based on GPT-4 achieves a near-perfect 99% score, pointing to financial +literacy becoming an emergent ability of state-of-the-art models. We use the +Judge-Advisor System and a savings dilemma to illustrate how researchers might +assess advice-utilization from large language models. We also present a number +of directions for future research. + +
+
+ comment: 43 pages, 2 figures and 2 tables in main text; in V2 added + information that this is the Author Accepted Manuscript version +
+
+
+
+
+ + ♻ ☆ Interpretation of Intracardiac Electrograms Through Textual + Representations + + +
+ Understanding the irregular electrical activity of atrial fibrillation (AFib) has been a key challenge in electrocardiography. For serious cases of AFib, catheter ablations are performed to collect intracardiac electrograms (EGMs). EGMs offer intricately detailed and localized electrical activity of the heart and are an ideal modality for interpretable cardiac studies. Recent advancements in artificial intelligence (AI) have allowed some works to utilize deep learning frameworks to interpret EGMs during AFib. Additionally, language models (LMs) have shown exceptional performance in being able to generalize to unseen domains, especially in healthcare. In this study, we are the first to leverage pretrained LMs for finetuning of EGM interpolation and AFib classification via masked language modeling. We formulate the EGM as a textual sequence and present competitive performance on AFib classification compared against other representations. Lastly, we provide a comprehensive interpretability study to provide a multi-perspective intuition of the model's behavior, which could greatly benefit clinical use.
+
+ comment: 17 pages, 7 figures; Accepted to CHIL 2024 +
+
+
+
+
+ + ♻ ☆ RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with + AI Feedback ICML 2024 + + +
+ Reinforcement learning from human feedback (RLHF) has proven effective in +aligning large language models (LLMs) with human preferences, but gathering +high-quality preference labels is expensive. RL from AI Feedback (RLAIF), +introduced in Bai et al., offers a promising alternative that trains the reward +model (RM) on preferences generated by an off-the-shelf LLM. Across the tasks +of summarization, helpful dialogue generation, and harmless dialogue +generation, we show that RLAIF achieves comparable performance to RLHF. +Furthermore, we take a step towards "self-improvement" by demonstrating that +RLAIF can outperform a supervised fine-tuned baseline even when the AI labeler +is the same size as the policy, or even the exact same checkpoint as the +initial policy. Finally, we introduce direct-RLAIF (d-RLAIF) - a technique that +circumvents RM training by obtaining rewards directly from an off-the-shelf LLM +during RL, which achieves superior performance to canonical RLAIF. Our results +suggest that RLAIF can achieve performance on-par with using human feedback, +offering a potential solution to the scalability limitations of RLHF. + +
+
+ comment: Presented at ICML 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 67 + +
+
+
+ + ☆ Coaching a Robotic Sonographer: Learning Robotic Ultrasound with Sparse + Expert's Feedback + + +
+ Ultrasound is widely employed for clinical intervention and diagnosis, due to its advantages of offering non-invasive, radiation-free, and real-time imaging. However, the accessibility of this dexterous procedure is limited due to the substantial training and expertise required of operators. Robotic ultrasound (RUS) offers a viable solution to address this limitation; nonetheless, achieving human-level proficiency remains challenging. Learning from demonstrations (LfD) methods have been explored in RUS; they learn a policy prior from a dataset of offline demonstrations to encode the mental model of the expert sonographer. However, active engagement of experts, i.e., coaching, during the training of RUS has not been explored thus far. Coaching is known for enhancing efficiency and performance in human training. This paper proposes a coaching framework for RUS to amplify its performance. The framework combines DRL (self-supervised practice) with sparse expert feedback through coaching. The DRL component employs an off-policy Soft Actor-Critic (SAC) network, with a reward based on image quality rating. The coaching by experts is modeled as a Partially Observable Markov Decision Process (POMDP), which updates the policy parameters based on the expert's corrections. The validation study on phantoms showed that coaching increases the learning rate by $25\%$ and the number of high-quality image acquisitions by $74.5\%$.
+
+ comment: Accepted in IEEE Transactions on Medical Robotics and Bionics (TMRB) + 2024 +
+
+
+
+
+ + ☆ What Do You See in Common? Learning Hierarchical Prototypes over + Tree-of-Life to Discover Evolutionary Traits + + +
+ A grand challenge in biology is to discover evolutionary traits - features of +organisms common to a group of species with a shared ancestor in the tree of +life (also referred to as phylogenetic tree). With the growing availability of +image repositories in biology, there is a tremendous opportunity to discover +evolutionary traits directly from images in the form of a hierarchy of +prototypes. However, current prototype-based methods are mostly designed to +operate over a flat structure of classes and face several challenges in +discovering hierarchical prototypes, including the issue of learning +over-specific features at internal nodes. To overcome these challenges, we +introduce the framework of Hierarchy aligned Commonality through Prototypical +Networks (HComP-Net). We empirically show that HComP-Net learns prototypes that +are accurate, semantically consistent, and generalizable to unseen species in +comparison to baselines on birds, butterflies, and fishes datasets. The code +and datasets are available at https://github.com/Imageomics/HComPNet. + +
+
+ comment: 34 pages, 27 figures +
+
+
+
+
+ + ☆ YoloTag: Vision-based Robust UAV Navigation with Fiducial Markers + + +
+ By harnessing fiducial markers as visual landmarks in the environment, +Unmanned Aerial Vehicles (UAVs) can rapidly build precise maps and navigate +spaces safely and efficiently, unlocking their potential for fluent +collaboration and coexistence with humans. Existing fiducial marker methods +rely on handcrafted feature extraction, which sacrifices accuracy. On the other +hand, deep learning pipelines for marker detection fail to meet real-time +runtime constraints crucial for navigation applications. In this work, we +propose YoloTag \textemdash a real-time fiducial marker-based localization +system. YoloTag uses a lightweight YOLO v8 object detector to accurately detect +fiducial markers in images while meeting the runtime constraints needed for +navigation. The detected markers are then used by an efficient +perspective-n-point algorithm to estimate UAV states. However, this +localization system introduces noise, causing instability in trajectory +tracking. To suppress noise, we design a higher-order Butterworth filter that +effectively eliminates noise through frequency domain analysis. We evaluate our +algorithm through real-robot experiments in an indoor environment, comparing +the trajectory tracking performance of our method against other approaches in +terms of several distance metrics. + +
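The smoothing step can be illustrated with SciPy's standard Butterworth utilities; the filter order, cutoff, and sampling rate below are illustrative assumptions, not the paper's tuned values.

    import numpy as np
    from scipy.signal import butter, filtfilt

    def smooth_trajectory(positions, fs=30.0, cutoff_hz=2.0, order=4):
        # positions: (T, 3) array of estimated UAV positions sampled at fs Hz
        b, a = butter(order, cutoff_hz / (0.5 * fs), btype="low")
        return filtfilt(b, a, positions, axis=0)   # zero-phase low-pass filtering

    noisy = np.cumsum(0.01 * np.random.randn(300, 3), axis=0) + 0.05 * np.random.randn(300, 3)
    smooth = smooth_trajectory(noisy)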
+
+
+
+
+ + ☆ Visual Servoing for Robotic On-Orbit Servicing: A Survey + + +
+ On-orbit servicing (OOS) activities will power the next big step for +sustainable exploration and commercialization of space. Developing robotic +capabilities for autonomous OOS operations is a priority for the space +industry. Visual Servoing (VS) enables robots to achieve the precise manoeuvres +needed for critical OOS missions by utilizing visual information for motion +control. This article presents an overview of existing VS approaches for +autonomous OOS operations with space manipulator systems (SMS). We divide the +approaches according to their contribution to the typical phases of a robotic +OOS mission: a) Recognition, b) Approach, and c) Contact. We also present a +discussion on the reviewed VS approaches, identifying current trends. Finally, +we highlight the challenges and areas for future research on VS techniques for +robotic OOS. + +
+
+ comment: Accepted for publication at the 2024 International Conference on + Space Robotics (iSpaRo) +
+
+
+
+
+ + ☆ Geometry-aware Feature Matching for Large-Scale Structure from Motion + + +
+ Establishing consistent and dense correspondences across multiple images is +crucial for Structure from Motion (SfM) systems. Significant view changes, such +as air-to-ground with very sparse view overlap, pose an even greater challenge +to the correspondence solvers. We present a novel optimization-based approach +that significantly enhances existing feature matching methods by introducing +geometry cues in addition to color cues. This helps fill gaps when there is +less overlap in large-scale scenarios. Our method formulates geometric +verification as an optimization problem, guiding feature matching within +detector-free methods and using sparse correspondences from detector-based +methods as anchor points. By enforcing geometric constraints via the Sampson +Distance, our approach ensures that the denser correspondences from +detector-free methods are geometrically consistent and more accurate. This +hybrid strategy significantly improves correspondence density and accuracy, +mitigates multi-view inconsistencies, and leads to notable advancements in +camera pose accuracy and point cloud density. It outperforms state-of-the-art +feature matching methods on benchmark datasets and enables feature matching in +challenging extreme large-scale settings. + +
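For reference, the Sampson distance used as the geometric-consistency measure can be computed as in the short sketch below; the fundamental matrix F and the homogeneous points are assumed to be given, and this is the textbook first-order approximation, not the authors' full optimization.

    import numpy as np

    def sampson_distance(F, x1, x2):
        # F: (3, 3) fundamental matrix; x1, x2: matched points as homogeneous 3-vectors
        Fx1 = F @ x1
        Ftx2 = F.T @ x2
        num = float(x2 @ F @ x1) ** 2
        den = Fx1[0] ** 2 + Fx1[1] ** 2 + Ftx2[0] ** 2 + Ftx2[1] ** 2
        return num / den

    F = np.random.randn(3, 3)   # stand-in fundamental matrix
    d = sampson_distance(F, np.array([100.0, 50.0, 1.0]), np.array([102.0, 48.0, 1.0]))
    # matches whose distance exceeds a chosen threshold would be rejected as inconsistent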
+
+
+
+
+ + ☆ QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of + DWI Data MICCAI 2024 + + +
+ We propose an image-conditioned diffusion model to estimate high angular +resolution diffusion weighted imaging (DWI) from a low angular resolution +acquisition. Our model, which we call QID$^2$, takes as input a set of low +angular resolution DWI data and uses this information to estimate the DWI data +associated with a target gradient direction. We leverage a U-Net architecture +with cross-attention to preserve the positional information of the reference +images, further guiding the target image generation. We train and evaluate +QID$^2$ on single-shell DWI samples curated from the Human Connectome Project +(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to +produce low angular resolution DWI data and train QID$^2$ to reconstruct the +missing high angular resolution samples. We compare QID$^2$ with two +state-of-the-art GAN models. Our results demonstrate that QID$^2$ not only +achieves higher-quality generated images, but it consistently outperforms the +GAN models in downstream tensor estimation across multiple metrics. Taken +together, this study highlights the potential of diffusion models, and QID$^2$ +in particular, for q-space up-sampling, thus offering a promising toolkit for +clinical and research applications. + +
+
+ comment: Accepted at MICCAI 2024 International Workshop on Computational + Diffusion MRI. Zijian Chen and Jueqi Wang contributed equally to this work +
+
+
+
+
+ + ☆ Unsupervised Welding Defect Detection Using Audio And Video + + +
+ In this work we explore the application of AI to robotic welding. Robotic +welding is a widely used technology in many industries, but robots currently do +not have the capability to detect welding defects which get introduced due to +various reasons in the welding process. We describe how deep-learning methods +can be applied to detect weld defects in real-time by recording the welding +process with microphones and a camera. Our findings are based on a large +database with more than 4000 welding samples we collected which covers +different weld types, materials and various defect categories. All deep +learning models are trained in an unsupervised fashion because the space of +possible defects is large and the defects in our data may contain biases. We +demonstrate that a reliable real-time detection of most categories of weld +defects is feasible both from audio and video, with improvements achieved by +combining both modalities. Specifically, the multi-modal approach achieves an +average Area-under-ROC-Curve (AUC) of 0.92 over all eleven defect types in our +data. We conclude the paper with an analysis of the results by defect type and +a discussion of future work. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Biochemical Prostate Cancer Recurrence Prediction: Thinking Fast & Slow + + +
+ Time to biochemical recurrence in prostate cancer is essential for prognostic +monitoring of the progression of patients after prostatectomy, which assesses +the efficacy of the surgery. In this work, we proposed to leverage multiple +instance learning through a two-stage ``thinking fast \& slow'' strategy for +the time to recurrence (TTR) prediction. The first (``thinking fast'') stage +finds the most relevant WSI area for biochemical recurrence and the second +(``thinking slow'') stage leverages higher resolution patches to predict TTR. +Our approach reveals a mean C-index ($Ci$) of 0.733 ($\theta=0.059$) on our +internal validation and $Ci=0.603$ on the LEOPARD challenge validation set. +Post hoc attention visualization shows that the most attentive area contributes +to the TTR prediction. + +
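For readers unfamiliar with the C-index reported above, a minimal sketch of its computation (the standard definition with right-censoring, not the authors' evaluation code) is:

    def c_index(times, events, risks):
        # times: observed times; events: 1 if recurrence observed, 0 if censored;
        # risks: model risk scores (higher = earlier predicted recurrence)
        concordant, comparable = 0.0, 0
        n = len(times)
        for i in range(n):
            for j in range(n):
                if times[i] < times[j] and events[i] == 1:   # pair is comparable
                    comparable += 1
                    if risks[i] > risks[j]:
                        concordant += 1
                    elif risks[i] == risks[j]:
                        concordant += 0.5
        return concordant / max(comparable, 1)

    print(c_index([2, 5, 7], [1, 1, 0], [0.9, 0.4, 0.1]))  # 1.0: perfectly ordered pairs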
+
+ comment: 8 pages, 3 figures, methodology paper for LEOPARD Challenge
+
+
+
+
+ + ☆ K-Origins: Better Colour Quantification for Neural Networks + + +
+ K-Origins is a neural network layer designed to improve image-based network +performances when learning colour, or intensities, is beneficial. Over 250 +encoder-decoder convolutional networks are trained and tested on 16-bit +synthetic data, demonstrating that K-Origins improves semantic segmentation +accuracy in two scenarios: object detection with low signal-to-noise ratios, +and segmenting multiple objects that are identical in shape but vary in colour. +K-Origins generates output features from the input features, $\textbf{X}$, by +the equation $\textbf{Y}_k = \textbf{X}-\textbf{J}\cdot w_k$ for each trainable +parameter $w_k$, where $\textbf{J}$ is a matrix of ones. Additionally, networks +with varying receptive fields were trained to determine optimal network depths +based on the dimensions of target classes, suggesting that receptive field +lengths should exceed object sizes. By ensuring a sufficient receptive field +length and incorporating K-Origins, we can achieve better semantic network +performance. + +
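Since the layer is fully specified by the equation above, a minimal PyTorch sketch is easy to write; the module name, the number of origins, and the choice to stack outputs along the channel dimension are illustrative assumptions.

    # Minimal sketch of the stated operation Y_k = X - J * w_k, with J an all-ones
    # tensor shaped like X and each w_k trainable.
    import torch
    import torch.nn as nn

    class KOrigins(nn.Module):
        def __init__(self, num_origins: int):
            super().__init__()
            self.w = nn.Parameter(torch.zeros(num_origins))  # one learnable origin per output

        def forward(self, x):  # x: (B, C, H, W)
            # subtract each learned origin from the input and stack along channels
            outs = [x - torch.ones_like(x) * wk for wk in self.w]
            return torch.cat(outs, dim=1)  # (B, C * num_origins, H, W)

    layer = KOrigins(num_origins=4)
    y = layer(torch.rand(1, 1, 8, 8))
    print(y.shape)  # torch.Size([1, 4, 8, 8])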
+
+ comment: 16 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Evaluation and Comparison of Visual Language Models for Transportation + Engineering Problems + + +
+ Recent developments in vision language models (VLMs) have shown great potential for diverse applications related to image understanding. In this study, we have explored state-of-the-art VLM models for vision-based transportation engineering tasks such as image classification and object detection. The image classification task involves congestion detection and crack identification, whereas, for object detection, helmet violations were identified. We have applied open-source models such as CLIP, BLIP, OWL-ViT, and Llava-Next, and the closed-source GPT-4o, to evaluate the performance of these state-of-the-art VLM models and harness the capabilities of language understanding for vision-based transportation tasks. These tasks were performed by applying zero-shot prompting to the VLM models, as zero-shot prompting involves performing tasks without any training on those tasks. It eliminates the need for annotated datasets or fine-tuning for specific tasks. Although these models gave results comparable to benchmark Convolutional Neural Network (CNN) models on the image classification tasks, they still need improvement on object localization tasks. Therefore, this study provides a comprehensive evaluation of the state-of-the-art VLM models, highlighting the advantages and limitations of the models, which can be taken as a baseline for future improvement and wide-scale implementation.
+
+
+
+
+ + ☆ ADHD diagnosis based on action characteristics recorded in videos using + machine learning + + +
+ Demand for ADHD diagnosis and treatment is increasing significantly and the +existing services are unable to meet the demand in a timely manner. In this +work, we introduce a novel action recognition method for ADHD diagnosis by +identifying and analysing raw video recordings. Our main contributions include +1) designing and implementing a test focusing on the attention and +hyperactivity/impulsivity of participants, recorded through three cameras; 2) +implementing a novel machine learning ADHD diagnosis system based on action +recognition neural networks for the first time; 3) proposing classification +criteria to provide diagnosis results and analysis of ADHD action +characteristics. + +
+
+ comment: Neuroscience Applied +
+
+
+
+
+ + ☆ Action-Based ADHD Diagnosis in Video + + +
+ Attention Deficit Hyperactivity Disorder (ADHD) causes significant impairment in various domains. Early diagnosis of ADHD and treatment could significantly improve quality of life and functioning. Recently, machine learning methods have improved the accuracy and efficiency of the ADHD diagnosis process. However, the cost of the equipment and trained staff required by the existing methods is generally high. Therefore, we introduce the video-based frame-level action recognition network to ADHD diagnosis for the first time. We also record a real multi-modal ADHD dataset and extract three action classes from the video modality for ADHD diagnosis. The whole-process data have been reported to the CNTW-NHS Foundation Trust; they will be reviewed by medical consultants/professionals and made public in due course.
+
+ comment: 31st European Symposium on Artificial Neural Networks +
+
+
+
+
+ + ☆ Optimal L-Systems for Stochastic L-system Inference Problems + + +
+ This paper presents two novel theorems that address two open problems in stochastic Lindenmayer-system (L-system) inference, specifically focusing on the construction of an optimal stochastic L-system capable of generating a given sequence of strings. The first theorem delineates a method for crafting a stochastic L-system that maximizes the likelihood of producing a given sequence of words through a singular derivation. Furthermore, the second theorem determines the stochastic L-systems with the highest probability of producing a given sequence of words with multiple possible derivations. From these, we introduce an algorithm to infer an optimal stochastic L-system from a given sequence. This algorithm incorporates sophisticated optimization techniques, such as interior point methods, ensuring production of a stochastically optimal stochastic L-system suitable for generating the given sequence. This allows stochastic L-systems to be used as models for machine learning trained only on positive data.
+
+
+
+
+ + ☆ How to Determine the Preferred Image Distribution of a Black-Box + Vision-Language Model? + + +
+ Large foundation models have revolutionized the field, yet challenges remain +in optimizing multi-modal models for specialized visual tasks. We propose a +novel, generalizable methodology to identify preferred image distributions for +black-box Vision-Language Models (VLMs) by measuring output consistency across +varied input prompts. Applying this to different rendering types of 3D objects, +we demonstrate its efficacy across various domains requiring precise +interpretation of complex structures, with a focus on Computer-Aided Design +(CAD) as an exemplar field. We further refine VLM outputs using in-context +learning with human feedback, significantly enhancing explanation quality. To +address the lack of benchmarks in specialized domains, we introduce CAD-VQA, a +new dataset for evaluating VLMs on CAD-related visual question answering tasks. +Our evaluation of state-of-the-art VLMs on CAD-VQA establishes baseline +performance levels, providing a framework for advancing VLM capabilities in +complex visual reasoning tasks across various fields requiring expert-level +visual interpretation. We release the dataset and evaluation codes at +\url{https://github.com/asgsaeid/cad_vqa}. + +
+
+
+
+
+ + ☆ NoiseAttack: An Evasive Sample-Specific Multi-Targeted Backdoor Attack + Through White Gaussian Noise + + +
+ Backdoor attacks pose a significant threat when using third-party data for deep learning development. In these attacks, data can be manipulated to cause a trained model to behave improperly when a specific trigger pattern is applied, providing the adversary with unauthorized advantages. While most existing works focus on designing visible or invisible trigger patterns to poison the victim class, they typically result in a single targeted class upon the success of the backdoor attack, meaning that the victim class can only be converted to another class based on the adversary's predefined value. In this paper, we address this issue by introducing a novel sample-specific multi-targeted backdoor attack, namely NoiseAttack. Specifically, we adopt White Gaussian Noise (WGN) with various Power Spectral Densities (PSD) as our underlying triggers, coupled with a unique training strategy to execute the backdoor attack. This work is the first of its kind to launch a vision backdoor attack with the intent to generate multiple targeted classes with minimal input configuration. Furthermore, our extensive experimental results demonstrate that NoiseAttack can achieve a high attack success rate against popular network architectures and datasets, as well as bypass state-of-the-art backdoor detection methods. Our source code and experiments are available at https://github.com/SiSL-URI/NoiseAttack/tree/main.
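A generic poisoning sketch in this spirit is shown below: a White Gaussian Noise trigger whose standard deviation stands in for the noise power, with different powers mapping to different target labels. This illustrates the idea only; it is not the NoiseAttack training pipeline.

    import numpy as np

    def add_wgn_trigger(image, std=0.05, seed=0):
        # image: float array in [0, 1]; std controls the trigger's noise power
        rng = np.random.default_rng(seed)   # fixed seed so the trigger is reproducible
        noise = rng.normal(0.0, std, size=image.shape)
        return np.clip(image + noise, 0.0, 1.0)

    clean = np.random.rand(32, 32, 3)
    poisoned_a = add_wgn_trigger(clean, std=0.02)  # e.g. relabeled to target class A
    poisoned_b = add_wgn_trigger(clean, std=0.08)  # e.g. relabeled to target class B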
+
+
+
+
+ + ☆ A Novel Audio-Visual Information Fusion System for Mental Disorders + Detection + + +
+ Mental disorders are among the foremost contributors to the global healthcare challenge. Research indicates that timely diagnosis and intervention are vital in treating various mental disorders. However, the early somatization symptoms of certain mental disorders may not be immediately evident, often resulting in their oversight and misdiagnosis. Additionally, traditional diagnostic methods incur high time and monetary costs. Deep learning methods based on fMRI and EEG have improved the efficiency of the mental disorder detection process. However, the cost of the equipment and trained staff is generally high. Moreover, most systems are only trained for a specific mental disorder and are not general-purpose. Recently, physiological studies have shown that there are some speech- and facial-related symptoms in a few mental disorders (e.g., depression and ADHD). In this paper, we focus on the emotional expression features of mental disorders and introduce a multimodal mental disorder diagnosis system based on audio-visual information input. Our proposed system is based on spatial-temporal attention networks and innovatively uses a less computationally intensive pre-trained audio recognition network to fine-tune the video recognition module for better results. We also apply the unified system to multiple mental disorders (ADHD and depression) for the first time. The proposed system achieves over 80\% accuracy on the real multimodal ADHD dataset and achieves state-of-the-art results on the depression dataset AVEC 2014.
+
+ comment: 27th International Conference on Information (FUSION) +
+
+
+
+
+ + ☆ What makes a face looks like a hat: Decoupling low-level and high-level + Visual Properties with Image Triplets ECCV2024 + + +
+ In visual decision making, high-level features, such as object categories, +have a strong influence on choice. However, the impact of low-level features on +behavior is less understood partly due to the high correlation between high- +and low-level features in the stimuli presented (e.g., objects of the same +category are more likely to share low-level features). To disentangle these +effects, we propose a method that de-correlates low- and high-level visual +properties in a novel set of stimuli. Our method uses two Convolutional Neural +Networks (CNNs) as candidate models of the ventral visual stream: the CORnet-S +that has high neural predictivity in high-level, IT-like responses and the +VGG-16 that has high neural predictivity in low-level responses. Triplets +(root, image1, image2) of stimuli are parametrized by the level of low- and +high-level similarity of images extracted from the different layers. These +stimuli are then used in a decision-making task where participants are tasked +to choose the most similar-to-the-root image. We found that different networks +show differing abilities to predict the effects of low-versus-high-level +similarity: while CORnet-S outperforms VGG-16 in explaining human choices based +on high-level similarity, VGG-16 outperforms CORnet-S in explaining human +choices based on low-level similarity. Using Brain-Score, we observed that the +behavioral prediction abilities of different layers of these networks +qualitatively corresponded to their ability to explain neural activity at +different levels of the visual hierarchy. In summary, our algorithm for +stimulus set generation enables the study of how different representations in +the visual stream affect high-level cognitive behaviors. + +
+
+ comment: Accepted at Workshop on Human-inspired Computer Vision @ ECCV2024 +
+
+
+
+
+ + ☆ EgoPressure: A Dataset for Hand Pressure and Pose Estimation in + Egocentric Vision + + +
+ Estimating touch contact and pressure in egocentric vision is a central task for downstream applications in Augmented Reality, Virtual Reality, as well as many robotic applications, because it provides precise physical insights into hand-object interaction and object manipulation. However, existing contact pressure datasets lack egocentric views and hand poses, which are essential for accurate estimation during in-situ operation, both for AR/VR interaction and robotic manipulation. In this paper, we introduce EgoPressure, a novel dataset of touch contact and pressure interaction from an egocentric perspective, complemented with hand pose meshes and fine-grained pressure intensities for each contact. The hand poses in our dataset are optimized using our proposed multi-view sequence-based method that processes footage from our capture rig of 8 accurately calibrated RGBD cameras. EgoPressure comprises 5.0 hours of touch contact and pressure interaction from 21 participants captured by a moving egocentric camera and 7 stationary Kinect cameras, which provided RGB images and depth maps at 30 Hz. In addition, we provide baselines for estimating pressure with different modalities, which will enable future developments and benchmarking on the dataset. Overall, we demonstrate that pressure and hand poses are complementary, which supports our intention to better facilitate the physical understanding of hand-object interactions in AR/VR and robotics research.
+
+
+
+
+ + ☆ Visually Grounded Speech Models for Low-resource Languages and Cognitive + Modelling + + +
+ This dissertation examines visually grounded speech (VGS) models that learn +from unlabelled speech paired with images. It focuses on applications for +low-resource languages and understanding human language acquisition. We +introduce a task called visually prompted keyword localisation to detect and +localise keywords in speech using images. We demonstrate the effectiveness of +VGS models in few-shot learning scenarios for low-resource languages like +Yoruba. Additionally, we examine the mutual exclusivity bias in VGS models. Our +monolingual VGS model exhibits this bias, but we found that multilingualism +does not affect the bias in this VGS model similarly to what is observed in +children. + +
+
+ comment: PhD Dissertation +
+
+
+
+
+ + ☆ Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection, + Removal, and Generation in the Era of Deep Learning + + +
+ Shadows are formed when light encounters obstacles, leading to areas of +diminished illumination. In computer vision, shadow detection, removal, and +generation are crucial for enhancing scene understanding, refining image +quality, ensuring visual consistency in video editing, and improving virtual +environments. This paper presents a comprehensive survey of shadow detection, +removal, and generation in images and videos within the deep learning landscape +over the past decade, covering tasks, deep models, datasets, and evaluation +metrics. Our key contributions include a comprehensive survey of shadow +analysis, standardization of experimental comparisons, exploration of the +relationships among model size, speed, and performance, a cross-dataset +generalization study, identification of open issues and future directions, and +provision of publicly available resources to support further research. + +
+
+ comment: Publicly available results, trained models, and evaluation metrics at + https://github.com/xw-hu/Unveiling-Deep-Shadows +
+
+
+
+
+ + ☆ DynOMo: Online Point Tracking by Dynamic Online Monocular Gaussian + Reconstruction + + +
+ Reconstructing scenes and tracking motion are two sides of the same coin. Tracking points allows for geometric reconstruction [14], while geometric reconstruction of (dynamic) scenes allows for 3D tracking of points over time [24, 39]. The latter was recently also exploited for 2D point tracking to overcome occlusion ambiguities by lifting tracking directly into 3D [38]. However, the above approaches either require offline processing or multi-view camera setups, both of which are unrealistic for real-world applications like robot navigation or mixed reality. We target the challenge of online 2D and 3D point tracking from unposed monocular camera input, introducing Dynamic Online Monocular Reconstruction (DynOMo). We leverage 3D Gaussian splatting to reconstruct dynamic scenes in an online fashion. Our approach extends 3D Gaussians to capture new content and object motions while estimating camera movements from a single RGB frame. DynOMo stands out by enabling emergence of point trajectories through robust image feature reconstruction and a novel similarity-enhanced regularization term, without requiring any correspondence-level supervision. It sets the first baseline for online point tracking with monocular unposed cameras, achieving performance on par with existing methods. We aim to inspire the community to advance online point tracking and reconstruction, expanding the applicability to diverse real-world scenarios.
+
+
+
+
+ + ☆ Towards Real-World Adverse Weather Image Restoration: Enhancing + Clearness and Semantics with Vision-Language Models ECCV 2024 + + +
+ This paper addresses the limitations of adverse weather image restoration +approaches trained on synthetic data when applied to real-world scenarios. We +formulate a semi-supervised learning framework employing vision-language models +to enhance restoration performance across diverse adverse weather conditions in +real-world settings. Our approach involves assessing image clearness and +providing semantics using vision-language models on real data, serving as +supervision signals for training restoration models. For clearness enhancement, +we use real-world data, utilizing a dual-step strategy with pseudo-labels +assessed by vision-language models and weather prompt learning. For semantic +enhancement, we integrate real-world data by adjusting weather conditions in +vision-language model descriptions while preserving semantic meaning. +Additionally, we introduce an effective training strategy to bootstrap +restoration performance. Our approach achieves superior results in real-world +adverse weather image restoration, demonstrated through qualitative and +quantitative comparisons with state-of-the-art works. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ LinFusion: 1 GPU, 1 Minute, 16K Image + + +
+ Modern diffusion models, particularly those utilizing a Transformer-based +UNet for denoising, rely heavily on self-attention operations to manage complex +spatial relationships, thus achieving impressive generation performance. +However, this existing paradigm faces significant challenges in generating +high-resolution visual content due to its quadratic time and memory complexity +with respect to the number of spatial tokens. To address this limitation, we +aim at a novel linear attention mechanism as an alternative in this paper. +Specifically, we begin our exploration from recently introduced models with +linear complexity, e.g., Mamba, Mamba2, and Gated Linear Attention, and +identify two key features-attention normalization and non-causal inference-that +enhance high-resolution visual generation performance. Building on these +insights, we introduce a generalized linear attention paradigm, which serves as +a low-rank approximation of a wide spectrum of popular linear token mixers. To +save the training cost and better leverage pre-trained models, we initialize +our models and distill the knowledge from pre-trained StableDiffusion (SD). We +find that the distilled model, termed LinFusion, achieves performance on par +with or superior to the original SD after only modest training, while +significantly reducing time and memory complexity. Extensive experiments on +SD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory +zero-shot cross-resolution generation performance, generating high-resolution +images like 16K resolution. Moreover, it is highly compatible with pre-trained +SD components, such as ControlNet and IP-Adapter, requiring no adaptation +efforts. Codes are available at https://github.com/Huage001/LinFusion. + +
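A generic normalized, non-causal linear attention operator, in the spirit of the linear token mixers discussed above, can be sketched as follows; the elu(x)+1 feature map is an assumption, and this is not LinFusion's exact layer.

    import torch

    def linear_attention(q, k, v, eps=1e-6):
        # q, k: (B, N, d); v: (B, N, e); cost is linear in the number of tokens N
        q = torch.nn.functional.elu(q) + 1.0
        k = torch.nn.functional.elu(k) + 1.0
        kv = torch.einsum("bnd,bne->bde", k, v)                         # sum over all tokens (non-causal)
        z = 1.0 / (torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + eps)   # attention normalization
        return torch.einsum("bnd,bde,bn->bne", q, kv, z)

    out = linear_attention(torch.randn(2, 1024, 64), torch.randn(2, 1024, 64), torch.randn(2, 1024, 64))
    print(out.shape)  # torch.Size([2, 1024, 64])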
+
+ comment: Work in Progress. Codes are available at + https://github.com/Huage001/LinFusion +
+
+
+
+
+ + ☆ DepthCrafter: Generating Consistent Long Depth Sequences for Open-world + Videos + + +
+ Despite significant advancements in monocular depth estimation for static +images, estimating video depth in the open world remains challenging, since +open-world videos are extremely diverse in content, motion, camera movement, +and length. We present DepthCrafter, an innovative method for generating +temporally consistent long depth sequences with intricate details for +open-world videos, without requiring any supplementary information such as +camera poses or optical flow. DepthCrafter achieves generalization ability to +open-world videos by training a video-to-depth model from a pre-trained +image-to-video diffusion model, through our meticulously designed three-stage +training strategy with the compiled paired video-depth datasets. Our training +approach enables the model to generate depth sequences with variable lengths at +one time, up to 110 frames, and harvest both precise depth details and rich +content diversity from realistic and synthetic datasets. We also propose an +inference strategy that processes extremely long videos through segment-wise +estimation and seamless stitching. Comprehensive evaluations on multiple +datasets reveal that DepthCrafter achieves state-of-the-art performance in +open-world video depth estimation under zero-shot settings. Furthermore, +DepthCrafter facilitates various downstream applications, including depth-based +visual effects and conditional video generation. + +
+
+ comment: Project webpage: https://depthcrafter.github.io +
+
+
+
+
+ + ☆ GraspSplats: Efficient Manipulation with 3D Feature Splatting + + +
+ The ability for robots to perform efficient and zero-shot grasping of object +parts is crucial for practical applications and is becoming prevalent with +recent advances in Vision-Language Models (VLMs). To bridge the 2D-to-3D gap +for representations to support such a capability, existing methods rely on +neural fields (NeRFs) via differentiable rendering or point-based projection +methods. However, we demonstrate that NeRFs are inappropriate for scene changes +due to their implicitness and point-based methods are inaccurate for part +localization without rendering-based optimization. To amend these issues, we +propose GraspSplats. Using depth supervision and a novel reference feature +computation method, GraspSplats generates high-quality scene representations in +under 60 seconds. We further validate the advantages of Gaussian-based +representation by showing that the explicit and optimized geometry in +GraspSplats is sufficient to natively support (1) real-time grasp sampling and +(2) dynamic and articulated object manipulation with point trackers. With +extensive experiments on a Franka robot, we demonstrate that GraspSplats +significantly outperforms existing methods under diverse task settings. In +particular, GraspSplats outperforms NeRF-based methods like F3RM and LERF-TOGO, +and 2D detection methods. + +
+
+ comment: Project webpage: https://graspsplats.github.io/ +
+
+
+
+
+ + ☆ Physical Rule-Guided Convolutional Neural Network + + +
+ The black-box nature of Convolutional Neural Networks (CNNs) and their +reliance on large datasets limit their use in complex domains with limited +labeled data. Physics-Guided Neural Networks (PGNNs) have emerged to address +these limitations by integrating scientific principles and real-world +knowledge, enhancing model interpretability and efficiency. This paper proposes +a novel Physics-Guided CNN (PGCNN) architecture that incorporates dynamic, +trainable, and automated LLM-generated, widely recognized rules integrated into +the model as custom layers to address challenges like limited data and low +confidence scores. The PGCNN is evaluated on multiple datasets, demonstrating +superior performance compared to a baseline CNN model. Key improvements include +a significant reduction in false positives and enhanced confidence scores for +true detection. The results highlight the potential of PGCNNs to improve CNN +performance for broader application areas. + +
+
+
+
+
+ + ♻ ☆ Open-vocabulary Temporal Action Localization using VLMs + + +
+ Video action localization aims to find the timing of a specific action in a long video. Although existing learning-based approaches have been successful, they require annotating videos, which comes with a considerable labor cost. This paper proposes a learning-free, open-vocabulary approach based on emerging off-the-shelf vision-language models (VLMs). The challenge stems from the fact that VLMs are neither designed to process long videos nor tailored for finding actions. We overcome these problems by extending an iterative visual prompting technique. Specifically, we sample video frames into a concatenated image with frame index labels, making a VLM guess the frame that is considered to be closest to the start/end of the action. Iterating this process while narrowing the sampling time window eventually pins down the specific start and end frames of an action. We demonstrate that this sampling technique yields reasonable results, illustrating a practical extension of VLMs for understanding videos. A sample code is available at https://microsoft.github.io/VLM-Video-Action-Localization/.
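The window-narrowing loop can be sketched as below; vlm_pick is an assumed helper that tiles the sampled frames with index labels, queries the VLM, and returns the frame index the model judges closest to the action start. The sample counts and stopping window are illustrative.

    def localize_start(total_frames, vlm_pick, n_samples=8, min_window=8):
        lo, hi = 0, total_frames - 1
        while hi - lo > min_window:
            step = max((hi - lo) // (n_samples - 1), 1)
            candidates = list(range(lo, hi + 1, step))
            picked = vlm_pick(candidates)            # one VLM query per iteration
            i = candidates.index(picked)
            lo = candidates[max(i - 1, 0)]           # keep a window around the pick
            hi = candidates[min(i + 1, len(candidates) - 1)]
        return (lo + hi) // 2

    # toy usage: pretend the VLM always picks the sampled frame nearest to frame 413
    print(localize_start(3000, lambda c: min(c, key=lambda f: abs(f - 413))))  # ~413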
+
+ comment: 7 pages, 5 figures, 4 tables. Last updated on September 3rd, 2024 +
+
+
+
+
+ + ♻ ☆ A Multiscale Gradient Fusion Method for Edge Detection in Color Images + Utilizing the CBM3D Filter + + +
+ In this paper, a color edge detection strategy based on collaborative filtering combined with multiscale gradient fusion is proposed. The block-matching and 3D (BM3D) filter is used to enhance the sparse representation in the transform domain and achieve a denoising effect, whereas the multiscale gradient fusion makes up for the loss of details in single-scale edge detection and improves the edge detection resolution and quality. First, the RGB images in the dataset are converted to XYZ color space images through mathematical operations. Second, the colored block-matching and 3D (CBM3D) filter is applied to the sparse images to remove noise interference. Then, the vector gradients of the color image and the anisotropic Gaussian directional derivatives at two scale parameters are calculated and averaged pixel-by-pixel to obtain a new edge strength map. Finally, the edge features are enhanced by image normalization and non-maximum suppression, and on that basis, the edge contour is obtained by double threshold selection and a new morphological refinement method. Through an experimental analysis on the edge detection dataset, the proposed method shows good noise robustness and high edge quality, outperforming Color Sobel, Color Canny, SE, and Color AGDD as shown by the PR curve, AUC, PSNR, MSE, and FOM indicators.
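A heavily simplified, grayscale sketch of the multiscale gradient-fusion step is given below; the CBM3D denoising, color vector gradients, anisotropic Gaussian directional derivatives, and the final hysteresis/morphological refinement are omitted, and the two sigmas are illustrative.

    import numpy as np
    from scipy.ndimage import gaussian_gradient_magnitude

    def fused_edge_strength(image, sigmas=(1.0, 2.0)):
        # image: 2D float array; average the per-scale gradient magnitudes pixel-by-pixel
        maps = [gaussian_gradient_magnitude(image, sigma=s) for s in sigmas]
        fused = np.mean(maps, axis=0)
        return (fused - fused.min()) / (fused.max() - fused.min() + 1e-8)  # normalize to [0, 1]

    edges = fused_edge_strength(np.random.rand(64, 64))
    # non-maximum suppression and double thresholding would follow on this strength map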
+
+ comment: 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ IBO: Inpainting-Based Occlusion to Enhance Explainable Artificial + Intelligence Evaluation in Histopathology + + +
+ Histopathological image analysis is crucial for accurate cancer diagnosis and +treatment planning. While deep learning models, especially convolutional neural +networks, have advanced this field, their "black-box" nature raises concerns +about interpretability and trustworthiness. Explainable Artificial Intelligence +(XAI) techniques aim to address these concerns, but evaluating their +effectiveness remains challenging. A significant issue with current +occlusion-based XAI methods is that they often generate Out-of-Distribution +(OoD) samples, leading to inaccurate evaluations. In this paper, we introduce +Inpainting-Based Occlusion (IBO), a novel occlusion strategy that utilizes a +Denoising Diffusion Probabilistic Model to inpaint occluded regions in +histopathological images. By replacing cancerous areas with realistic, +non-cancerous tissue, IBO minimizes OoD artifacts and preserves data integrity. +We evaluate our method on the CAMELYON16 dataset through two phases: first, by +assessing perceptual similarity using the Learned Perceptual Image Patch +Similarity (LPIPS) metric, and second, by quantifying the impact on model +predictions through Area Under the Curve (AUC) analysis. Our results +demonstrate that IBO significantly improves perceptual fidelity, achieving +nearly twice the improvement in LPIPS scores compared to the best existing +occlusion strategy. Additionally, IBO increased the precision of XAI +performance prediction from 42% to 71% compared to traditional methods. These +results demonstrate IBO's potential to provide more reliable evaluations of XAI +techniques, benefiting histopathology and other applications. The source code +for this study is available at https://github.com/a-fsh-r/IBO. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Learning a Generalized Physical Face Model From Data + + +
+ Physically-based simulation is a powerful approach for 3D facial animation as +the resulting deformations are governed by physical constraints, allowing to +easily resolve self-collisions, respond to external forces and perform +realistic anatomy edits. Today's methods are data-driven, where the actuations +for finite elements are inferred from captured skin geometry. Unfortunately, +these approaches have not been widely adopted due to the complexity of +initializing the material space and learning the deformation model for each +character separately, which often requires a skilled artist followed by lengthy +network training. In this work, we aim to make physics-based facial animation +more accessible by proposing a generalized physical face model that we learn +from a large 3D face dataset. Once trained, our model can be quickly fit to any +unseen identity and produce a ready-to-animate physical face model +automatically. Fitting is as easy as providing a single 3D face scan, or even a +single face image. After fitting, we offer intuitive animation controls, as +well as the ability to retarget animations across characters. All the while, +the resulting animations allow for physical effects like collision avoidance, +gravity, paralysis, bone reshaping and more. + +
+
+
+
+
+ + ♻ ☆ $OC^4-ReID$: Occluded Cloth-Changing Person Re-Identification + + +
+ The study of Cloth-Changing Person Re-identification (CC-ReID) focuses on retrieving specific pedestrians when their clothing has changed, typically under the assumption that the entire pedestrian images are visible. Pedestrian images in real-world scenarios, however, are often partially obscured by obstacles, presenting a significant challenge to existing CC-ReID systems. In this paper, we introduce a more challenging task termed Occluded Cloth-Changing Person Re-Identification ($OC^4-ReID$), which simultaneously addresses the two challenges of clothing changes and occlusion. Concretely, we construct two new datasets, Occ-LTCC and Occ-PRCC, based on original CC-ReID datasets to include random occlusions of key pedestrian components (e.g., head, torso). Moreover, a novel benchmark is proposed for $OC^4-ReID$, incorporating a Train-Test Micro Granularity Screening ($T^2MGS$) module to mitigate the influence of occlusion and a Part-Robust Triplet (PRT) loss for partial feature learning. Comprehensive experiments on the proposed datasets, as well as on two CC-ReID benchmark datasets, demonstrate the superior performance of the proposed method against other state-of-the-art methods. The codes and datasets are available at: https://github.com/1024AILab/OC4-ReID.
+
+
+
+
+ + ♻ ☆ Restorer: Removing Multi-Degradation with All-Axis Attention and Prompt + Guidance + + +
+ There are many excellent solutions in image restoration. However, most methods require training separate models to restore images with different types of degradation. Although existing all-in-one models effectively address multiple types of degradation simultaneously, their performance in real-world scenarios is still constrained by the task confusion problem. In this work, we attempt to address this issue by introducing \textbf{Restorer}, a novel Transformer-based all-in-one image restoration model. To effectively address the complex degradation present in real-world images, we propose All-Axis Attention (AAA), a mechanism that simultaneously models long-range dependencies across both spatial and channel dimensions, capturing potential correlations along all axes. Additionally, we introduce textual prompts in Restorer to incorporate explicit task priors, enabling the removal of specific degradation types based on user instructions. By iterating over these prompts, Restorer can handle composite degradation in real-world scenarios without requiring additional training. Based on these designs, Restorer with one set of parameters demonstrates state-of-the-art performance in multiple image restoration tasks compared to existing all-in-one and even single-task models. Additionally, Restorer is efficient during inference, suggesting its potential in real-world applications.
+
+
+
+
+ + ♻ ☆ On the Federated Learning Framework for Cooperative Perception + + +
+ Cooperative perception (CP) is essential to enhance the efficiency and safety of future transportation systems, requiring extensive data sharing among vehicles on the road, which raises significant privacy concerns. Federated learning offers a promising solution by enabling data privacy-preserving collaborative enhancements in perception, decision-making, and planning among connected and autonomous vehicles (CAVs). However, federated learning is impeded by significant challenges arising from data heterogeneity across diverse clients, potentially diminishing model accuracy and prolonging convergence periods. This study introduces a specialized federated learning framework for CP, termed the federated dynamic weighted aggregation (FedDWA) algorithm, facilitated by a dynamic adjusting loss (DALoss) function. This framework employs dynamic client weighting to direct model convergence and integrates a novel loss function that utilizes Kullback-Leibler divergence (KLD) to counteract the detrimental effects of non-independently and identically distributed (Non-IID) and unbalanced data. Utilizing the BEV transformer as the primary model, our rigorous testing on the OpenV2V dataset, augmented with FedBEVT data, demonstrates significant improvements in the average intersection over union (IoU). These results highlight the substantial potential of our federated learning framework to address data heterogeneity challenges in CP, thereby enhancing the accuracy of environmental perception models and facilitating more robust and efficient collaborative learning solutions in the transportation sector.
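As a loose illustration of Non-IID-aware weighting with a KL term (a speculative simplification, not the actual FedDWA/DALoss formulation), one could weight client updates by how far each client's label distribution drifts from the global one:

    import numpy as np

    def kl_divergence(p, q, eps=1e-8):
        p = np.asarray(p, dtype=float) + eps
        q = np.asarray(q, dtype=float) + eps
        p, q = p / p.sum(), q / q.sum()
        return float(np.sum(p * np.log(p / q)))

    def aggregation_weights(client_label_dists, global_dist):
        # clients closer to the global distribution receive larger aggregation weights
        scores = np.array([1.0 / (1.0 + kl_divergence(d, global_dist)) for d in client_label_dists])
        return scores / scores.sum()

    print(aggregation_weights([[0.5, 0.5], [0.9, 0.1]], [0.5, 0.5]))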
+
+ comment: accepted by IEEE RA-L +
+
+
+
+
+ + ♻ ☆ 3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods + + +
+ We present a work-in-progress survey on 3D Gaussian Splatting compression +methods, focusing on their statistical performance across various benchmarks. +This survey aims to facilitate comparability by summarizing key statistics of +different compression approaches in a tabulated format. The datasets evaluated +include TanksAndTemples, MipNeRF360, DeepBlending, and SyntheticNeRF. For each +method, we report the Peak Signal-to-Noise Ratio (PSNR), Structural Similarity +Index (SSIM), Learned Perceptual Image Patch Similarity (LPIPS), and the +resultant size in megabytes (MB), as provided by the respective authors. This +is an ongoing, open project, and we invite contributions from the research +community as GitHub issues or pull requests. Please visit +http://w-m.github.io/3dgs-compression-survey/ for more information and a +sortable version of the table. + +
+
+ comment: 3D Gaussian Splatting compression survey; 3DGS compression; new + approaches added +
+
+
+
+
+ + ♻ ☆ SUMix: Mixup with Semantic and Uncertain Information ECCV2024 + + +
+ Mixup data augmentation approaches have been applied to various deep learning
+tasks to improve the generalization ability of deep neural networks. Some
+existing approaches, e.g., CutMix and SaliencyMix, randomly replace a patch in
+one image with patches from another to generate the mixed image. Similarly, the
+corresponding labels are linearly combined with a fixed ratio $\lambda$. The
+objects in the two images may overlap during the mixing process, so some
+semantic information is corrupted in the mixed samples. In this case, the mixed
+image does not match the mixed label information. Besides, such a label may
+mislead the deep learning model training, which results in poor performance. To
+solve this problem, we propose a novel approach named SUMix to learn the mixing
+ratio as well as the uncertainty for the mixed samples during the training
+process. First, we design a learnable similarity function to compute an
+accurate mix ratio. Second, an approach is investigated as a regularized term
+to model the uncertainty of the mixed samples. We conduct experiments on five
+image benchmarks, and extensive experimental results imply that our method is
+capable of improving the performance of classifiers with different
+cutting-based mixup approaches. The source code is available at
+https://github.com/JinXins/SUMix.
+ 
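+ For orientation, the sketch below shows the fixed-ratio label mixing that SUMix
+ revisits, plus an illustrative similarity-based re-estimation of the mix ratio;
+ the learnable similarity function and uncertainty term in SUMix are not
+ reproduced here, and all names are assumptions.
+
+     import torch
+
+     def fixed_ratio_mixup(x1, y1, x2, y2, alpha=1.0):
+         """Baseline behaviour: images and one-hot labels are combined with a
+         fixed ratio lambda drawn from a Beta distribution."""
+         lam = torch.distributions.Beta(alpha, alpha).sample()
+         x_mix = lam * x1 + (1.0 - lam) * x2
+         y_mix = lam * y1 + (1.0 - lam) * y2
+         return x_mix, y_mix
+
+     def similarity_based_ratio(feat_mix, feat1, feat2):
+         """Illustrative stand-in for a learned similarity: re-estimate the
+         effective mix ratio from feature similarity instead of the fixed lambda."""
+         s1 = torch.nn.functional.cosine_similarity(feat_mix, feat1, dim=-1)
+         s2 = torch.nn.functional.cosine_similarity(feat_mix, feat2, dim=-1)
+         return s1 / (s1 + s2 + 1e-8)
+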
+
+ comment: Accepted by ECCV2024 [Camera Ready] (19 pages, 7 figures) with the + source code at https://github.com/JinXins/SUMix +
+
+
+
+
+ + ♻ ☆ SPIdepth: Strengthened Pose Information for Self-supervised Monocular + Depth Estimation + + +
+ Self-supervised monocular depth estimation has garnered considerable +attention for its applications in autonomous driving and robotics. While recent +methods have made strides in leveraging techniques like the Self Query Layer +(SQL) to infer depth from motion, they often overlook the potential of +strengthening pose information. In this paper, we introduce SPIdepth, a novel +approach that prioritizes enhancing the pose network for improved depth +estimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the +importance of pose information in capturing fine-grained scene structures. By +enhancing the pose network's capabilities, SPIdepth achieves remarkable +advancements in scene understanding and depth estimation. Experimental results +on benchmark datasets such as KITTI, Cityscapes, and Make3D showcase SPIdepth's +state-of-the-art performance, surpassing previous methods by significant +margins. Specifically, SPIdepth tops the self-supervised KITTI benchmark. +Additionally, SPIdepth achieves the lowest AbsRel (0.029), SqRel (0.069), and +RMSE (1.394) on KITTI, establishing new state-of-the-art results. On +Cityscapes, SPIdepth shows improvements over SQLdepth of 21.7% in AbsRel, 36.8% +in SqRel, and 16.5% in RMSE, even without using motion masks. On Make3D, +SPIdepth in zero-shot outperforms all other models. Remarkably, SPIdepth +achieves these results using only a single image for inference, surpassing even +methods that utilize video sequences for inference, thus demonstrating its +efficacy and efficiency in real-world applications. Our approach represents a +significant leap forward in self-supervised monocular depth estimation, +underscoring the importance of strengthening pose information for advancing +scene understanding in real-world applications. The code and pre-trained models +are publicly available at https://github.com/Lavreniuk/SPIdepth. + +
+
+
+
+
+ + ♻ ☆ Realigned Softmax Warping for Deep Metric Learning + + +
+ Deep Metric Learning (DML) loss functions traditionally aim to control the
+forces of separability and compactness within an embedding space so that the
+same class data points are pulled together and different class ones are pushed
+apart. Within the context of DML, a softmax operation will typically normalize
+distances into a probability for optimization, thus coupling all the push/pull
+forces together. This paper proposes a potential new class of loss functions
+that operate within a Euclidean domain and aim to take full advantage of the
+coupled forces governing embedding space formation under a softmax. These
+forces of compactness and separability can be boosted or mitigated within
+controlled locations at will by using a warping function. In this work, we
+provide a simple example of a warping function and use it to achieve
+competitive, state-of-the-art results on various metric learning benchmarks.
+ 
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Progressive Domain Adaptation for Thermal Infrared Object Tracking + + +
+ Due to the lack of large-scale labeled Thermal InfraRed (TIR) training
+datasets, most existing TIR trackers are trained directly on RGB datasets.
+However, tracking methods trained on RGB datasets suffer a significant
+performance drop on TIR data due to the domain shift issue. To this end, in
+this work, we propose a Progressive Domain Adaptation framework for TIR
+Tracking (PDAT), which transfers useful knowledge learned from RGB tracking to
+TIR tracking. The framework makes full use of large-scale labeled RGB datasets
+without requiring time-consuming and labor-intensive labeling of large-scale
+TIR data. Specifically, we first propose an adversarial-based global domain
+adaptation module to coarsely reduce the domain gap at the feature level.
+Second, we design a clustering-based subdomain adaptation method to further
+finely align the feature distributions of the RGB and TIR datasets. These two
+domain adaptation modules gradually eliminate the discrepancy between the two
+domains, and thus learn domain-invariant fine-grained features through
+progressive training. Additionally, we collect a large-scale TIR dataset with
+over 1.48 million unlabeled TIR images for training the proposed domain
+adaptation framework. Experimental results on five TIR tracking benchmarks show
+that the proposed method achieves a nearly 6% gain in success rate,
+demonstrating its effectiveness.
+ 
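+ Adversarial global domain adaptation of this kind is typically built around a
+ gradient reversal layer feeding a domain discriminator; the sketch below shows
+ that generic building block (it is the standard construction, not necessarily
+ the exact module used in PDAT).
+
+     import torch
+     from torch import nn
+
+     class GradReverse(torch.autograd.Function):
+         """Identity in the forward pass, gradient scaled by -lambda in backward."""
+         @staticmethod
+         def forward(ctx, x, lambd):
+             ctx.lambd = lambd
+             return x.view_as(x)
+
+         @staticmethod
+         def backward(ctx, grad_output):
+             return -ctx.lambd * grad_output, None
+
+     class DomainDiscriminator(nn.Module):
+         def __init__(self, dim, lambd=1.0):
+             super().__init__()
+             self.lambd = lambd
+             self.head = nn.Sequential(nn.Linear(dim, 256), nn.ReLU(), nn.Linear(256, 2))
+
+         def forward(self, feats):
+             # Features from RGB and TIR images; the discriminator tries to tell the
+             # domains apart while reversed gradients push the backbone to align them.
+             return self.head(GradReverse.apply(feats, self.lambd))
+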
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ An Efficient Instance Segmentation Framework Using Segmentation + Foundation Models with Oriented Bounding Box Prompts + + +
+ Instance segmentation in unmanned aerial vehicle measurement is a
+long-standing challenge. Since horizontal bounding boxes introduce many
+interference objects, oriented bounding boxes (OBBs) are usually used for
+instance identification. However, based on the ``segmentation within bounding
+box'' paradigm, current instance segmentation methods using OBBs are overly
+dependent on bounding box detection performance. To tackle this problem, this
+paper proposes OBSeg, an efficient instance segmentation framework using OBBs.
+OBSeg is based on box prompt-based segmentation foundation models (BSMs), e.g.,
+Segment Anything Model. Specifically, OBSeg first detects OBBs to distinguish
+instances and provide coarse localization information. Then, it predicts OBB
+prompt-related masks for fine segmentation. Since OBBs only serve as prompts,
+OBSeg alleviates the over-dependence on bounding box detection performance of
+current instance segmentation methods using OBBs. In addition, to enable BSMs
+to handle OBB prompts, we propose a novel OBB prompt encoder. To make OBSeg
+more lightweight and further improve the performance of lightweight distilled
+BSMs, a Gaussian smoothing-based knowledge distillation method is introduced.
+Experiments demonstrate that OBSeg outperforms current instance segmentation
+methods on multiple public datasets. The code is available at
+https://github.com/zhen6618/OBBInstanceSegmentation.
+ 
+
+
+
+
+ + ♻ ☆ Collaborative Group: Composed Image Retrieval via Consensus Learning + from Noisy Annotations + + +
+ Composed image retrieval extends content-based image retrieval systems by
+enabling users to search using reference images and captions that describe
+their intention. Despite great progress in developing image-text compositors to
+extract discriminative visual-linguistic features, we identify a hitherto
+overlooked issue, triplet ambiguity, which impedes robust feature extraction.
+Triplet ambiguity refers to a type of semantic ambiguity that arises between
+the reference image, the relative caption, and the target image. It is mainly
+due to the limited representation of the annotated text, resulting in many
+noisy triplets where multiple visually dissimilar candidate images can be
+matched to an identical reference pair (i.e., a reference image + a relative
+caption). To address this challenge, we propose the Consensus Network
+(Css-Net), inspired by the psychological concept that groups outperform
+individuals. Css-Net comprises two core components: (1) a consensus module with
+four diverse compositors, each generating distinct image-text embeddings,
+fostering complementary feature extraction and mitigating dependence on any
+single, potentially biased compositor; (2) a Kullback-Leibler divergence loss
+that encourages learning of inter-compositor interactions to promote consensual
+outputs. During evaluation, the decisions of the four compositors are combined
+through a weighting scheme, enhancing overall agreement. On benchmark datasets,
+particularly FashionIQ, Css-Net demonstrates marked improvements. Notably, it
+achieves significant recall gains, with a 2.77% increase in R@10 and a 6.67%
+boost in R@50, underscoring its competitiveness in addressing the fundamental
+limitations of existing methods.
+ 
+
+ comment: Accepted by Knowledge-Based Systems (KBS) +
+
+
+
+
+ + ♻ ☆ CAST: Cross-Attention in Space and Time for Video Action Recognition NeurIPS 2023 + + +
+ Recognizing human actions in videos requires spatial and temporal +understanding. Most existing action recognition models lack a balanced +spatio-temporal understanding of videos. In this work, we propose a novel +two-stream architecture, called Cross-Attention in Space and Time (CAST), that +achieves a balanced spatio-temporal understanding of videos using only RGB +input. Our proposed bottleneck cross-attention mechanism enables the spatial +and temporal expert models to exchange information and make synergistic +predictions, leading to improved performance. We validate the proposed method +with extensive experiments on public benchmarks with different characteristics: +EPIC-KITCHENS-100, Something-Something-V2, and Kinetics-400. Our method +consistently shows favorable performance across these datasets, while the +performance of existing methods fluctuates depending on the dataset +characteristics. + +
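+ A minimal sketch of a cross-attention exchange between a spatial and a temporal
+ token stream is shown below, assuming both experts share the embedding width;
+ the abstract does not specify the bottleneck design, so this is only an
+ illustration of the general mechanism.
+
+     import torch
+     from torch import nn
+
+     class CrossAttentionExchange(nn.Module):
+         """Each expert queries the other expert's tokens and adds the result back."""
+         def __init__(self, dim, num_heads=8):
+             super().__init__()
+             self.s_from_t = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+             self.t_from_s = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+
+         def forward(self, spatial_tokens, temporal_tokens):
+             s_upd, _ = self.s_from_t(spatial_tokens, temporal_tokens, temporal_tokens)
+             t_upd, _ = self.t_from_s(temporal_tokens, spatial_tokens, spatial_tokens)
+             return spatial_tokens + s_upd, temporal_tokens + t_upd
+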
+
+ comment: Accepted at NeurIPS 2023. Project webpage is available at
+ https://jong980812.github.io/CAST.github.io/ Code is available at
+ https://github.com/KHU-VLL/CAST
+
+
+
+
+ + ♻ ☆ TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot + Image Classification ICASSP 2024 + + +
+ Few-shot image classification aims to classify images from unseen novel +classes with few samples. Recent works demonstrate that deep local descriptors +exhibit enhanced representational capabilities compared to image-level +features. However, most existing methods solely rely on either employing all +local descriptors or directly utilizing partial descriptors, potentially +resulting in the loss of crucial information. Moreover, these methods primarily +emphasize the selection of query descriptors while overlooking support +descriptors. In this paper, we propose a novel Task-Aware Adaptive Local +Descriptors Selection Network (TALDS-Net), which exhibits the capacity for +adaptive selection of task-aware support descriptors and query descriptors. +Specifically, we compare the similarity of each local support descriptor with +other local support descriptors to obtain the optimal support descriptor subset +and then compare the query descriptors with the optimal support subset to +obtain discriminative query descriptors. Extensive experiments demonstrate that +our TALDS-Net outperforms state-of-the-art methods on both general and +fine-grained datasets. + +
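+ The selection step, comparing each local descriptor with the others and keeping
+ the most representative subset, can be sketched with a simple cosine-similarity
+ top-k rule; the actual selection in TALDS-Net is learned and task-aware, so
+ treat this only as an illustration.
+
+     import torch
+
+     def select_descriptors(descriptors, k):
+         """descriptors: (N, D) local descriptors of one support class.
+         Keep the k descriptors most similar on average to the rest."""
+         d = torch.nn.functional.normalize(descriptors, dim=-1)
+         sim = d @ d.t()                                    # (N, N) cosine similarities
+         score = (sim.sum(dim=1) - 1.0) / (d.size(0) - 1)   # average, excluding self
+         idx = score.topk(k).indices
+         return descriptors[idx]
+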
+
+ comment: 4 pages, 1 figures, is accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Asynchronous Blob Tracker for Event Cameras + + +
+ Event-based cameras are popular for tracking fast-moving objects due to their
+high temporal resolution, low latency, and high dynamic range. In this paper,
+we propose a novel algorithm for tracking event blobs using raw events
+asynchronously in real time. We introduce the concept of an event blob as a
+spatio-temporal likelihood of event occurrence where the conditional spatial
+likelihood is blob-like. Many real-world objects such as car headlights or any
+quickly moving foreground objects generate event blob data. The proposed
+algorithm uses a nearest neighbour classifier with a dynamic threshold
+criterion for data association, coupled with an extended Kalman filter to track
+the event blob state. Our algorithm achieves highly accurate blob tracking,
+velocity estimation, and shape estimation even under challenging lighting
+conditions and high-speed motions (> 11000 pixels/s). The microsecond time
+resolution achieved means that the filter output can be used to derive
+secondary information such as time-to-contact or range estimates, enabling
+applications to real-world problems such as collision avoidance in autonomous
+driving.
+ 
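+ As background for the filtering step, a constant-velocity (linear) Kalman
+ update for a blob centroid is sketched below; the paper's filter is an extended
+ Kalman filter that also tracks blob shape, which is omitted here.
+
+     import numpy as np
+
+     def kalman_step(x, P, z, dt, q=1e-2, r=1.0):
+         """x: state [px, py, vx, vy]; P: covariance; z: measured blob centroid [px, py]."""
+         F = np.array([[1, 0, dt, 0], [0, 1, 0, dt], [0, 0, 1, 0], [0, 0, 0, 1]])
+         H = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
+         Q, R = q * np.eye(4), r * np.eye(2)
+         # Predict with the constant-velocity motion model
+         x = F @ x
+         P = F @ P @ F.T + Q
+         # Update with the associated event-blob measurement
+         y = z - H @ x
+         S = H @ P @ H.T + R
+         K = P @ H.T @ np.linalg.inv(S)
+         x = x + K @ y
+         P = (np.eye(4) - K @ H) @ P
+         return x, P
+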
+
+ comment: 18 pages, 16 figures, Manuscript was accepted on August 7, 2024, by + IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ Rethinking Barely-Supervised Volumetric Medical Image Segmentation from + an Unsupervised Domain Adaptation Perspective + + +
+ This paper investigates an extremely challenging problem: barely-supervised +volumetric medical image segmentation (BSS). A BSS training dataset consists of +two parts: 1) a barely-annotated labeled set, where each labeled image contains +only a single-slice annotation, and 2) an unlabeled set comprising numerous +unlabeled volumetric images. State-of-the-art BSS methods employ a +registration-based paradigm, which uses inter-slice image registration to +propagate single-slice annotations into volumetric pseudo labels, constructing +a completely annotated labeled set, to which a semi-supervised segmentation +scheme can be applied. However, the paradigm has a critical limitation: the +pseudo-labels generated by image registration are unreliable and noisy. +Motivated by this, we propose a new perspective: instead of solving BSS within +a semi-supervised learning scheme, this work formulates BSS as an unsupervised +domain adaptation problem. To this end, we propose a novel BSS framework, +\textbf{B}arely-supervised learning \textbf{via} unsupervised domain +\textbf{A}daptation (BvA), as an alternative to the dominant registration +paradigm. Specifically, we first design a novel noise-free labeled data +construction algorithm (NFC) for slice-to-volume labeled data synthesis. Then, +we introduce a frequency and spatial Mix-Up strategy (FSX) to mitigate the +domain shifts. Extensive experiments demonstrate that our method provides a +promising alternative for BSS. Remarkably, the proposed method, trained on the +left atrial segmentation dataset with \textbf{only one} barely-labeled image, +achieves a Dice score of 81.20%, outperforming the state-of-the-art by 61.71%. +The code is available at +\href{https://github.com/Senyh/BvA}{\textit{\texttt{https://github.com/Senyh/BvA}}}. + +
+
+
+
+
+ + ♻ ☆ Planning and Rendering: Towards Product Poster Generation with Diffusion + Models + + +
+ Product poster generation significantly optimizes design efficiency and
+reduces production costs. Prevailing methods predominantly rely on
+image-inpainting methods to generate clean background images for given
+products. Subsequently, poster layout generation methods are employed to
+produce corresponding layout results. However, the background images may not be
+suitable for accommodating textual content due to their complexity, and the
+fixed location of products limits the diversity of layout results. To alleviate
+these issues, we propose a novel product poster generation framework based on
+diffusion models named P\&R. P\&R draws inspiration from the workflow of
+designers in creating posters, which consists of two stages: Planning and
+Rendering. At the planning stage, we propose a PlanNet to generate the layout
+of the product and other visual components considering both the appearance
+features of the product and semantic features of the text, which improves the
+diversity and rationality of the layouts. At the rendering stage, we propose a
+RenderNet to generate the background for the product while considering the
+generated layout, where a spatial fusion module is introduced to fuse the
+layout of different visual components. To foster the advancement of this field,
+we propose the first product poster generation dataset, PPG30k, comprising 30k
+exquisite product poster images along with comprehensive image and text
+annotations. Our method outperforms the state-of-the-art product poster
+generation methods on PPG30k. The PPG30k will be released soon.
+ 
+
+
+
+
+ + ♻ ☆ Unveiling the Human-like Similarities of Automatic Facial Expression + Recognition: An Empirical Exploration through Explainable AI + + +
+ Facial expression recognition is vital for human behavior analysis, and deep
+learning has enabled models that can outperform humans. However, it is unclear
+how closely they mimic human processing. This study aims to explore the
+similarity between deep neural networks and human perception by comparing
+twelve different networks, including both general object classifiers and
+FER-specific models. We employ an innovative global explainable AI method to
+generate heatmaps, revealing crucial facial regions for the twelve networks
+trained on six facial expressions. We assess these results both quantitatively
+and qualitatively, comparing them both to ground-truth masks based on Friesen
+and Ekman's descriptions and to one another. We use Intersection over Union
+(IoU) and normalized correlation coefficients for comparisons. We generate 72
+heatmaps to highlight critical regions for each expression and architecture.
+Qualitatively, models with pre-trained weights show more similarity in heatmaps
+compared to those without pre-training. Specifically, eye and nose areas
+influence certain facial expressions, while the mouth is consistently important
+across all models and expressions. Quantitatively, we find low average IoU
+values (avg. 0.2702) across all expressions and architectures. The
+best-performing architecture averages 0.3269, while the worst-performing one
+averages 0.2066. Dendrograms, built with the normalized correlation
+coefficient, reveal two main clusters for most expressions: models with
+pre-training and models without pre-training. Findings suggest limited
+alignment between human and AI facial expression recognition, with network
+architectures influencing the similarity, as similar architectures prioritize
+similar facial regions.
+ 
+
+ comment: Multimed Tools Appl (2024) +
+
+
+
+
+ + ♻ ☆ Learning Exposure Correction in Dynamic Scenes + + +
+ Exposure correction aims to enhance visual data suffering from improper +exposures, which can greatly improve satisfactory visual effects. However, +previous methods mainly focus on the image modality, and the video counterpart +is less explored in the literature. Directly applying prior image-based methods +to videos results in temporal incoherence with low visual quality. Through +thorough investigation, we find that the development of relevant communities is +limited by the absence of a benchmark dataset. Therefore, in this paper, we +construct the first real-world paired video dataset, including both +underexposure and overexposure dynamic scenes. To achieve spatial alignment, we +utilize two DSLR cameras and a beam splitter to simultaneously capture improper +and normal exposure videos. Additionally, we propose an end-to-end video +exposure correction network, in which a dual-stream module is designed to deal +with both underexposure and overexposure factors, enhancing the illumination +based on Retinex theory. The extensive experiments based on various metrics and +user studies demonstrate the significance of our dataset and the effectiveness +of our method. The code and dataset are available at +https://github.com/kravrolens/VECNet. + +
+
+ comment: To be published at ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Computer Vision based Activity Recognition and Fall + Detection of the Elderly: a Systematic Review + + +
+ As the percentage of elderly people in developed countries increases
+worldwide, the healthcare of this population, and in particular the
+preservation of their autonomy, is a growing concern. In this direction, many
+studies are being published on Ambient Assisted Living (AAL) systems, which
+help to reduce the concerns raised by the independent living of the elderly. In
+this study, a systematic review of the literature is presented on fall
+detection and Human Activity Recognition (HAR) for the elderly, as the two main
+tasks to solve to guarantee the safety of elderly people living alone. To
+address the current tendency to perform these two tasks, the review focuses on
+the use of Deep Learning (DL) based approaches on computer vision data. In
+addition, different collections of data like DL models, datasets or hardware
+(e.g. depth or thermal cameras) are gathered from the reviewed studies and
+provided for reference in future studies. Strengths and weaknesses of existing
+approaches are also discussed and, based on them, our recommendations for
+future works are provided.
+ 
+
+
+
+
+ + ♻ ☆ RefSAM: Efficiently Adapting Segmenting Anything Model for Referring + Video Object Segmentation + + +
+ The Segment Anything Model (SAM) has gained significant attention for its +impressive performance in image segmentation. However, it lacks proficiency in +referring video object segmentation (RVOS) due to the need for precise +user-interactive prompts and a limited understanding of different modalities, +such as language and vision. This paper presents the RefSAM model, which +explores the potential of SAM for RVOS by incorporating multi-view information +from diverse modalities and successive frames at different timestamps in an +online manner. Our proposed approach adapts the original SAM model to enhance +cross-modality learning by employing a lightweight Cross-Modal MLP that +projects the text embedding of the referring expression into sparse and dense +embeddings, serving as user-interactive prompts. Additionally, we have +introduced the hierarchical dense attention module to fuse hierarchical visual +semantic information with sparse embeddings to obtain fine-grained dense +embeddings, and an implicit tracking module to generate a tracking token and +provide historical information for the mask decoder. Furthermore, we employ a +parameter-efficient tuning strategy to align and fuse the language and vision +features effectively. Through comprehensive ablation studies, we demonstrate +our model's practical and effective design choices. Extensive experiments +conducted on Refer-Youtube-VOS, Ref-DAVIS17, and three referring image +segmentation datasets validate the superiority and effectiveness of our RefSAM +model over existing methods. + +
+
+
+
+
+ + ♻ ☆ PointRWKV: Efficient RWKV-Like Model for Hierarchical Point Cloud + Learning + + +
+ Transformers have revolutionized point cloud learning, but their quadratic
+complexity hinders extension to long sequences and burdens limited
+computational resources. The recent advent of RWKV, a fresh breed of deep
+sequence models, has shown immense potential for sequence modeling in NLP
+tasks. In this paper, we present PointRWKV, a model of linear complexity
+derived from the RWKV model in the NLP field with necessary modifications for
+point cloud learning tasks. Specifically, taking the embedded point patches as
+input, we first propose to explore the global processing capabilities within
+PointRWKV blocks using modified multi-headed matrix-valued states and a dynamic
+attention recurrence mechanism. To extract local geometric features
+simultaneously, we design a parallel branch to encode the point cloud
+efficiently in a fixed radius near-neighbors graph with a graph stabilizer.
+Furthermore, we design PointRWKV as a multi-scale framework for hierarchical
+feature learning of 3D point clouds, facilitating various downstream tasks.
+Extensive experiments on different point cloud learning tasks show our proposed
+PointRWKV outperforms the transformer- and mamba-based counterparts, while
+significantly saving about 42\% FLOPs, demonstrating the potential option for
+constructing foundational 3D models.
+ 
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
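+ The sparse rate reduction objective builds on the lossy coding rate of a token
+ matrix; a minimal sketch of that quantity, in the form used in the
+ rate-reduction literature, is given below (the exact objective and constants in
+ the paper may differ).
+
+     import torch
+
+     def coding_rate(Z, eps=0.5):
+         """Z: (n, d) matrix of token representations.
+         Lossy coding rate R(Z) = 1/2 * logdet(I + d / (n * eps^2) * Z^T Z)."""
+         n, d = Z.shape
+         I = torch.eye(d, device=Z.device, dtype=Z.dtype)
+         return 0.5 * torch.logdet(I + (d / (n * eps ** 2)) * Z.t() @ Z)
+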
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes and formatting improvements. V3: improvements from + journal revisions +
+
+
+
+
+ + ♻ ☆ Correlation-Embedded Transformer Tracking: A Single-Branch Framework + + +
+ Developing robust and discriminative appearance models has been a
+long-standing research challenge in visual object tracking. In the prevalent
+Siamese-based paradigm, the features extracted by the Siamese-like networks are
+often insufficient to model the tracked targets and distractor objects, thereby
+hindering them from being robust and discriminative simultaneously. While most
+Siamese trackers focus on designing robust correlation operations, we propose a
+novel single-branch tracking framework inspired by the transformer. Unlike the
+Siamese-like feature extraction, our tracker deeply embeds cross-image feature
+correlation in multiple layers of the feature network. By extensively matching
+the features of the two images through multiple layers, it can suppress
+non-target features, resulting in target-aware feature extraction. The output
+features can be directly used for predicting target locations without
+additional correlation steps. Thus, we reformulate the two-branch Siamese
+tracking as a conceptually simple, fully transformer-based Single-Branch
+Tracking pipeline, dubbed SBT. After conducting an in-depth analysis of the SBT
+baseline, we summarize many effective design principles and propose an improved
+tracker dubbed SuperSBT. SuperSBT adopts a hierarchical architecture with a
+local modeling layer to enhance shallow-level features. A unified relation
+modeling is proposed to remove complex handcrafted layer pattern designs.
+SuperSBT is further improved by masked image modeling pre-training, integrating
+temporal modeling, and equipping with dedicated prediction heads. Thus,
+SuperSBT outperforms the SBT baseline by 4.7%, 3.0%, and 4.5% AUC scores in
+LaSOT, TrackingNet, and GOT-10K. Notably, SuperSBT greatly raises the speed of
+SBT from 37 FPS to 81 FPS. Extensive experiments show that our method achieves
+superior results on eight VOT benchmarks.
+ 
+
+ comment: Extension of SBT paper, accepted by TPAMI +
+
+
+
+
+ + ♻ ☆ Learning from the Web: Language Drives Weakly-Supervised Incremental + Learning for Semantic Segmentation ECCV 2024 + + +
+ Current weakly-supervised incremental learning for semantic segmentation
+(WILSS) approaches only consider replacing pixel-level annotations with
+image-level labels, while the training images are still from well-designed
+datasets. In this work, we argue that widely available web images can also be
+considered for the learning of new classes. To achieve this, first, we
+introduce a strategy to select web images which are similar to previously seen
+examples in the latent space using a Fourier-based domain discriminator. Then,
+an effective caption-driven rehearsal strategy is proposed to preserve
+previously learnt classes. To our knowledge, this is the first work to rely
+solely on web images for both the learning of new concepts and the preservation
+of the already learned ones in WILSS. Experimental results show that the
+proposed approach can reach state-of-the-art performances without using
+manually selected and annotated data in the incremental steps.
+ 
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Image-Based Virtual Try-On: A Survey + + +
+ Image-based virtual try-on aims to synthesize a naturally dressed person
+image with a clothing image, which revolutionizes online shopping and inspires
+related topics within image generation, showing both research significance and
+commercial potential. However, there is a gap between current research progress
+and commercial applications, and an absence of a comprehensive overview of this
+field to accelerate the development. In this survey, we provide a comprehensive
+analysis of the state-of-the-art techniques and methodologies in aspects of
+pipeline architecture, person representation and key modules such as try-on
+indication, clothing warping and try-on stage. We additionally apply CLIP to
+assess the semantic alignment of try-on results, and evaluate representative
+methods with uniformly implemented evaluation metrics on the same dataset. In
+addition to quantitative and qualitative evaluation of current open-source
+methods, unresolved issues are highlighted and future research directions are
+prospected to identify key trends and inspire further exploration. The
+uniformly implemented evaluation metrics, dataset and collected methods will be
+made publicly available at
+https://github.com/little-misfit/Survey-Of-Virtual-Try-On.
+ 
+
+ comment: 30 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ DocKylin: A Large Multimodal Model for Visual Document Understanding + with Efficient Visual Slimming + + +
+ Current multimodal large language models (MLLMs) face significant challenges
+in visual document understanding (VDU) tasks due to the high resolution, dense
+text, and complex layouts typical of document images. These characteristics
+demand a high level of detail perception ability from MLLMs. While increasing
+input resolution improves detail perception capability, it also leads to longer
+sequences of visual tokens, increasing computational costs and straining the
+models' ability to handle long contexts. To address these challenges, we
+introduce DocKylin, a document-centric MLLM that performs visual content
+slimming at both the pixel and token levels, thereby reducing token sequence
+length in VDU scenarios. We introduce an Adaptive Pixel Slimming (APS)
+preprocessing module to perform pixel-level slimming, increasing the proportion
+of informative pixels. Moreover, we propose a novel Dynamic Token Slimming
+(DTS) module to conduct token-level slimming, retaining essential tokens and
+removing others to adaptively create a more compact visual sequence.
+Experiments demonstrate DocKylin's promising performance across various VDU
+benchmarks and the effectiveness of each component.
+ 
+
+
+
+
+ + ♻ ☆ Towards reliable respiratory disease diagnosis based on cough sounds and + vision transformers + + +
+ Recent advancements in deep learning techniques have sparked performance
+boosts in various real-world applications including disease diagnosis based on
+multi-modal medical data. Cough sound data-based respiratory disease (e.g.,
+COVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also
+attracted much attention. However, existing works usually utilise traditional
+machine learning or deep models of moderate scales. In addition, the developed
+approaches are trained and evaluated on small-scale data due to the difficulty
+of curating and annotating clinical data at scale. To address these issues in
+prior works, we create a unified framework to evaluate various deep models from
+lightweight Convolutional Neural Networks (e.g., ResNet18) to modern vision
+transformers and compare their performance in respiratory disease
+classification. Based on the observations from such an extensive empirical
+study, we propose a novel approach to cough-based disease classification based
+on both self-supervised and supervised learning on a large-scale cough data
+set. Experimental results demonstrate our proposed approach outperforms prior
+arts consistently on two benchmark datasets for COVID-19 diagnosis and a
+proprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%.
+ 
+
+
+
+
+ + ♻ ☆ Learn Suspected Anomalies from Event Prompts for Video Anomaly Detection + + +
+ Most models for weakly supervised video anomaly detection (WS-VAD) rely on
+multiple instance learning, aiming to distinguish normal and abnormal snippets
+without specifying the type of anomaly. However, the ambiguous nature of
+anomaly definitions across contexts may introduce inaccuracy in discriminating
+abnormal and normal events. To show the model what is anomalous, a novel
+framework is proposed to guide the learning of suspected anomalies from event
+prompts. Given a textual prompt dictionary of potential anomaly events and the
+captions generated from anomaly videos, the semantic anomaly similarity between
+them could be calculated to identify the suspected events for each video
+snippet. It enables a new multi-prompt learning process to constrain the
+visual-semantic features across all videos, as well as provides a new way to
+label pseudo anomalies for self-training. To demonstrate its effectiveness,
+comprehensive experiments and detailed ablation studies are conducted on four
+datasets, namely XD-Violence, UCF-Crime, TAD, and ShanghaiTech. Our proposed
+model outperforms most state-of-the-art methods in terms of AP or AUC (86.5\%,
+90.4\%, 94.4\%, and 97.4\%). Furthermore, it shows promising performance
+in open-set and cross-dataset cases. The data, code, and models can be found
+at: \url{https://github.com/shiwoaz/lap}.
+ 
+
+
+
+
+ + ♻ ☆ TagCLIP: Improving Discrimination Ability of Open-Vocabulary Semantic + Segmentation + + +
+ Contrastive Language-Image Pre-training (CLIP) has recently shown great
+promise in pixel-level zero-shot learning tasks. However, existing approaches
+utilizing CLIP's text and patch embeddings to generate semantic masks often
+misidentify input pixels from unseen classes, leading to confusion between
+novel classes and semantically similar ones. In this work, we propose a novel
+approach, TagCLIP (Trusty-aware guided CLIP), to address this issue. We
+disentangle the ill-posed optimization problem into two parallel processes:
+semantic matching performed individually and reliability judgment for improving
+discrimination ability. Building on the idea of special tokens in language
+modeling representing sentence-level embeddings, we introduce a trusty token
+that enables distinguishing novel classes from known ones in prediction. To
+evaluate our approach, we conduct experiments on three benchmark datasets:
+PASCAL VOC 2012, COCO-Stuff 164K, and PASCAL Context. Our results show that
+TagCLIP improves the Intersection over Union (IoU) of unseen classes by 7.4%,
+1.7% and 2.1%, respectively, with negligible overheads. The code is available
+at https://github.com/dvlab-research/TagCLIP.
+ 
+
+ comment: TPAMI2024 +
+
+
+
+
+ + ♻ ☆ Cross-Platform Video Person ReID: A New Benchmark Dataset and Adaptation + Approach ECCV 2024 + + +
+ In this paper, we construct a large-scale benchmark dataset for +Ground-to-Aerial Video-based person Re-Identification, named G2A-VReID, which +comprises 185,907 images and 5,576 tracklets, featuring 2,788 distinct +identities. To our knowledge, this is the first dataset for video ReID under +Ground-to-Aerial scenarios. G2A-VReID dataset has the following +characteristics: 1) Drastic view changes; 2) Large number of annotated +identities; 3) Rich outdoor scenarios; 4) Huge difference in resolution. +Additionally, we propose a new benchmark approach for cross-platform ReID by +transforming the cross-platform visual alignment problem into visual-semantic +alignment through vision-language model (i.e., CLIP) and applying a +parameter-efficient Video Set-Level-Adapter module to adapt image-based +foundation model to video ReID tasks, termed VSLA-CLIP. Besides, to further +reduce the great discrepancy across the platforms, we also devise the +platform-bridge prompts for efficient visual feature alignment. Extensive +experiments demonstrate the superiority of the proposed method on all existing +video ReID datasets and our proposed G2A-VReID dataset. + +
+
+ comment: Published at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ The Impact of Print-Scanning in Heterogeneous Morph Evaluation Scenarios + + +
+ Face morphing attacks pose an increasing threat to face recognition (FR) +systems. A morphed photo contains biometric information from two different +subjects to take advantage of vulnerabilities in FRs. These systems are +particularly susceptible to attacks when the morphs are subjected to +print-scanning to mask the artifacts generated during the morphing process. We +investigate the impact of print-scanning on morphing attack detection through a +series of evaluations on heterogeneous morphing attack scenarios. Our +experiments show that we can increase the Mated Morph Presentation Match Rate +(MMPMR) by up to 8.48%. Furthermore, when a Single-image Morphing Attack +Detection (S-MAD) algorithm is not trained to detect print-scanned morphs the +Morphing Attack Classification Error Rate (MACER) can increase by up to 96.12%, +indicating significant vulnerability. + +
+
+ comment: Accepted as a special sessions paper at IJCB 2024 +
+
+
+
+
+ + ♻ ☆ AIGCs Confuse AI Too: Investigating and Explaining Synthetic + Image-induced Hallucinations in Large Vision-Language Models + + +
+ The evolution of Artificial Intelligence Generated Contents (AIGCs) is
+advancing towards higher quality. The growing interactions with AIGCs present a
+new challenge to the data-driven AI community: While AI-generated contents have
+played a crucial role in a wide range of AI models, the potential hidden risks
+they introduce have not been thoroughly examined. Beyond human-oriented forgery
+detection, AI-generated content poses potential issues for AI models originally
+designed to process natural data. In this study, we underscore the exacerbated
+hallucination phenomena in Large Vision-Language Models (LVLMs) caused by
+AI-synthetic images. Remarkably, our findings shed light on a consistent AIGC
+\textbf{hallucination bias}: the object hallucinations induced by synthetic
+images are characterized by a greater quantity and a more uniform position
+distribution, even though these synthetic images do not manifest unrealistic or
+additional relevant visual features compared to natural images. Moreover, our
+investigations on Q-former and Linear projector reveal that synthetic images
+may present token deviations after visual projection, thereby amplifying the
+hallucination bias.
+ 
+
+
+
+
+ + ♻ ☆ Enhancing Representation in Radiography-Reports Foundation Model: A + Granular Alignment Algorithm Using Masked Contrastive Learning + + +
+ Recently, multi-modal vision-language foundation models have gained +significant attention in the medical field. While these models offer great +opportunities, they still face crucial challenges, such as the requirement for +fine-grained knowledge understanding in computer-aided diagnosis and the +capability of utilizing very limited or even no task-specific labeled data in +real-world clinical applications. In this study, we present MaCo, a masked +contrastive chest X-ray foundation model that tackles these challenges. MaCo +explores masked contrastive learning to simultaneously achieve fine-grained +image understanding and zero-shot learning for a variety of medical imaging +tasks. It designs a correlation weighting mechanism to adjust the correlation +between masked chest X-ray image patches and their corresponding reports, +thereby enhancing the model's representation learning capabilities. To evaluate +the performance of MaCo, we conducted extensive experiments using 6 well-known +open-source X-ray datasets. The experimental results demonstrate the +superiority of MaCo over 10 state-of-the-art approaches across tasks such as +classification, segmentation, detection, and phrase grounding. These findings +highlight the significant potential of MaCo in advancing a wide range of +medical image analysis tasks. + +
+
+
+
+
+ + ♻ ☆ BrainVis: Exploring the Bridge between Brain and Visual Signals via + Image Reconstruction + + +
+ Analyzing and reconstructing visual stimuli from brain signals effectively
+advances the understanding of the human visual system. However, the EEG signals
+are complex and contain significant noise. This leads to substantial
+limitations in existing works of visual stimuli reconstruction from EEG, such
+as difficulties in aligning EEG embeddings with the fine-grained semantic
+information and a heavy reliance on an additional large self-collected dataset
+for training. To address these challenges, we propose a novel approach called
+BrainVis. Firstly, we divide the EEG signals into various units and apply a
+self-supervised approach on them to obtain EEG time-domain features, in an
+attempt to ease the training difficulty. Additionally, we also propose to
+utilize the frequency-domain features to enhance the EEG representations. Then,
+we simultaneously align EEG time-frequency embeddings with the interpolation of
+the coarse and fine-grained semantics in the CLIP space, to highlight the
+primary visual components and reduce the cross-modal alignment difficulty.
+Finally, we adopt the cascaded diffusion models to reconstruct images. Using
+only 10\% training data of the previous work, our proposed BrainVis outperforms
+the state of the art in both semantic fidelity reconstruction and generation
+quality. The code is available at https://github.com/RomGai/BrainVis.
+ 
+
+
+
+
+ + ♻ ☆ GISR: Geometric Initialization and Silhouette-based Refinement for + Single-View Robot Pose and Configuration Estimation + + +
+ In autonomous robotics, measurement of the robot's internal state and +perception of its environment, including interaction with other agents such as +collaborative robots, are essential. Estimating the pose of the robot arm from +a single view has the potential to replace classical eye-to-hand calibration +approaches and is particularly attractive for online estimation and dynamic +environments. In addition to its pose, recovering the robot configuration +provides a complete spatial understanding of the observed robot that can be +used to anticipate the actions of other agents in advanced robotics use cases. +Furthermore, this additional redundancy enables the planning and execution of +recovery protocols in case of sensor failures or external disturbances. We +introduce GISR - a deep configuration and robot-to-camera pose estimation +method that prioritizes execution in real-time. GISR consists of two modules: +(i) a geometric initialization module that efficiently computes an approximate +robot pose and configuration, and (ii) a deep iterative silhouette-based +refinement module that arrives at a final solution in just a few iterations. We +evaluate GISR on publicly available data and show that it outperforms existing +methods of the same class in terms of both speed and accuracy, and can compete +with approaches that rely on ground-truth proprioception and recover only the +pose. + +
+
+ comment: IEEE Robotics and Automation Letters (under revision), code available + at http://github.com/iwhitey/GISR-robot +
+
+
+
+
+ + ♻ ☆ IDNet: A Novel Dataset for Identity Document Analysis and Fraud + Detection + + +
+ Effective fraud detection and analysis of government-issued identity
+documents, such as passports, driver's licenses, and identity cards, are
+essential in thwarting identity theft and bolstering security on online
+platforms. The training of accurate fraud detection and analysis tools depends
+on the availability of extensive identity document datasets. However, current
+publicly available benchmark datasets for identity document analysis, including
+MIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a
+limited number of samples, cover insufficient varieties of fraud patterns, and
+seldom include alterations in critical personal identifying fields like
+portrait images, limiting their utility in training models capable of detecting
+realistic frauds while preserving privacy.
+ In response to these shortcomings, our research introduces a new benchmark
+dataset, IDNet, designed to advance privacy-preserving fraud detection efforts.
+The IDNet dataset comprises 837,060 images of synthetically generated identity
+documents, totaling approximately 490 gigabytes, categorized into 20 types from
+10 U.S. states and 10 European countries. We evaluate the utility and present
+use cases of the dataset, illustrating how it can aid in training
+privacy-preserving fraud detection methods, facilitating the generation of
+camera and video capturing of identity documents, and testing schema
+unification and other identity document management functionalities.
+ 
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Projected Stochastic Gradient Descent with Quantum Annealed Binary + Gradients + + +
+ We present QP-SBGD, a novel layer-wise stochastic optimiser tailored towards
+training neural networks with binary weights, known as binary neural networks
+(BNNs), on quantum hardware. BNNs reduce the computational requirements and
+energy consumption of deep learning models with minimal loss in accuracy.
+However, training them in practice remains an open challenge. Most known
+BNN-optimisers either rely on projected updates or binarise weights
+post-training. Instead, QP-SBGD approximately maps the gradient onto binary
+variables, by solving a quadratic constrained binary optimisation. Under
+practically reasonable assumptions, we show that this update rule converges
+with a rate of $\mathcal{O}(1 / \sqrt{T})$. Moreover, we show how the
+$\mathcal{NP}$-hard projection can be effectively executed on an adiabatic
+quantum annealer, harnessing recent advancements in quantum computation. We
+also introduce a projected version of this update rule and prove that if a
+fixed point exists in the binary variable space, the modified updates will
+converge to it. Last but not least, our algorithm is implemented layer-wise,
+making it suitable to train larger networks on resource-limited quantum
+hardware. Through extensive evaluations, we show that QP-SBGD outperforms or is
+on par with competitive and well-established baselines such as BinaryConnect,
+signSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as
+well as binary graph neural networks.
+ 
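+ For context on the projected baselines mentioned above (e.g., BinaryConnect), a
+ minimal sketch of a classical projected update for binary weights is shown
+ below; QP-SBGD itself replaces the sign projection with a quadratic binary
+ optimisation solved on a quantum annealer, which is not reproduced here.
+
+     import torch
+
+     def projected_binary_step(w_real, grad, lr=0.01):
+         """BinaryConnect-style update: keep latent real-valued weights, take an
+         SGD step on them, and project to {-1, +1} for the forward pass."""
+         w_real = (w_real - lr * grad).clamp_(-1.0, 1.0)   # latent update
+         w_bin = torch.sign(w_real)                        # projection used at inference
+         w_bin[w_bin == 0] = 1.0
+         return w_real, w_bin
+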
+
+
+
+
+ + ♻ ☆ Co-synthesis of Histopathology Nuclei Image-Label Pairs using a + Context-Conditioned Joint Diffusion Model ECCV 2024 + + +
+ In multi-class histopathology nuclei analysis tasks, the lack of training +data becomes a main bottleneck for the performance of learning-based methods. +To tackle this challenge, previous methods have utilized generative models to +increase data by generating synthetic samples. However, existing methods often +overlook the importance of considering the context of biological tissues (e.g., +shape, spatial layout, and tissue type) in the synthetic data. Moreover, while +generative models have shown superior performance in synthesizing realistic +histopathology images, none of the existing methods are capable of producing +image-label pairs at the same time. In this paper, we introduce a novel +framework for co-synthesizing histopathology nuclei images and paired semantic +labels using a context-conditioned joint diffusion model. We propose +conditioning of a diffusion model using nucleus centroid layouts with +structure-related text prompts to incorporate spatial and structural context +information into the generation targets. Moreover, we enhance the granularity +of our synthesized semantic labels by generating instance-wise nuclei labels +using distance maps synthesized concurrently in conjunction with the images and +semantic labels. We demonstrate the effectiveness of our framework in +generating high-quality samples on multi-institutional, multi-organ, and +multi-modality datasets. Our synthetic data consistently outperforms existing +augmentation methods in the downstream tasks of nuclei segmentation and +classification. + +
+
+ comment: ECCV 2024 accepted +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ SpannerLib: Embedding Declarative Information Extraction in an + Imperative Workflow + + +
+ Document spanners have been proposed as a formal framework for declarative
+Information Extraction (IE) from text, following IE products from the industry
+and academia. Over the past decade, the framework has been studied thoroughly
+in terms of expressive power, complexity, and the ability to naturally combine
+text analysis with relational querying. This demonstration presents SpannerLib,
+a library for embedding document spanners in Python code. SpannerLib
+facilitates the development of IE programs by providing an implementation of
+Spannerlog (Datalog-based document spanners) that interacts with the Python
+code in two directions: rules can be embedded inside Python, and they can
+invoke custom Python code (e.g., calls to ML-based NLP models) via user-defined
+functions. The demonstration scenarios showcase IE programs, with increasing
+levels of complexity, within Jupyter Notebook.
+ 
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Laser: Parameter-Efficient LLM Bi-Tuning for Sequential Recommendation + with Collaborative Information + + +
+ Sequential recommender systems are essential for discerning user preferences
+from historical interactions and facilitating targeted recommendations. Recent
+innovations employing Large Language Models (LLMs) have advanced the field by
+encoding item semantics, yet they often necessitate substantial parameter
+tuning and are resource-demanding. Moreover, these works fail to consider the
+diverse characteristics of different types of users, which diminishes the
+recommendation accuracy. In this paper, we propose a parameter-efficient Large
+Language Model Bi-Tuning framework for sequential recommendation with
+collaborative information (Laser). Specifically, Bi-Tuning works by inserting
+trainable virtual tokens at both the prefix and suffix of the input sequence
+and freezing the LLM parameters, thus optimizing the LLM for the sequential
+recommendation task. In our Laser, the prefix is utilized to incorporate
+user-item collaborative information and adapt the LLM to the recommendation
+task, while the suffix converts the output embeddings of the LLM from the
+language space to the recommendation space for the follow-up item
+recommendation. Furthermore, to capture the characteristics of different types
+of users when integrating the collaborative information via the prefix, we
+introduce M-Former, a lightweight MoE-based querying transformer that uses a
+set of query experts to integrate diverse user-specific collaborative
+information encoded by frozen ID-based sequential recommender systems,
+significantly improving the accuracy of recommendations. Extensive experiments
+on real-world datasets demonstrate that Laser can parameter-efficiently adapt
+LLMs to effective recommender systems, significantly outperforming
+state-of-the-art methods.
+ 
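+ A minimal sketch of the prefix/suffix virtual-token idea, with the LLM frozen
+ and only the inserted embeddings trained, is shown below; the M-Former routing
+ and the projection to the recommendation space are not reproduced, and all
+ module names (and the HF-style inputs_embeds call) are assumptions.
+
+     import torch
+     from torch import nn
+
+     class BiTuningWrapper(nn.Module):
+         """Prepend trainable prefix embeddings and append trainable suffix
+         embeddings around the token embeddings of a frozen language model."""
+         def __init__(self, llm, hidden_dim, n_prefix=8, n_suffix=8):
+             super().__init__()
+             self.llm = llm.eval()
+             for p in self.llm.parameters():
+                 p.requires_grad = False                    # the LLM stays frozen
+             self.prefix = nn.Parameter(torch.randn(n_prefix, hidden_dim) * 0.02)
+             self.suffix = nn.Parameter(torch.randn(n_suffix, hidden_dim) * 0.02)
+
+         def forward(self, token_embeds):                   # (B, L, H)
+             b = token_embeds.size(0)
+             prefix = self.prefix.unsqueeze(0).expand(b, -1, -1)
+             suffix = self.suffix.unsqueeze(0).expand(b, -1, -1)
+             inputs = torch.cat([prefix, token_embeds, suffix], dim=1)
+             return self.llm(inputs_embeds=inputs)          # assumes an HF-style API
+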
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Blockchain-based Federated Recommendation with Incentive Mechanism + + +
+ Nowadays, federated recommendation technology is rapidly evolving to help
+multiple organisations share data and train models while meeting user privacy,
+data security and government regulatory requirements. However, federated
+recommendation increases customer system costs such as power, computational and
+communication resources. Besides, federated recommendation systems are also
+susceptible to model attacks and data poisoning by participating malicious
+clients. Therefore, most customers are unwilling to participate in federated
+recommendation without any incentive. To address these problems, we propose a
+blockchain-based federated recommendation system with an incentive mechanism to
+promote a more trustworthy, secure, and efficient federated recommendation
+service. First, we construct a federated recommendation system based on NeuMF
+and FedAvg. Then we introduce a reverse auction mechanism to select optimal
+clients that can maximize the social surplus. Finally, we employ blockchain for
+on-chain evidence storage of models to ensure the safety of the federated
+recommendation system. The experimental results show that our proposed
+incentive mechanism can attract clients with superior training data to engage
+in federated recommendation at a lower cost, which can increase the economic
+benefit of federated recommendation by 54.9\% while improving the
+recommendation performance. Thus, our work provides theoretical and
+technological support for the construction of a harmonious and healthy
+ecological environment for the application of federated recommendation.
+ 
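+ A minimal sketch of two non-blockchain ingredients described above, a reverse
+ auction that selects the cheapest clients within a budget and a FedAvg
+ aggregation over the winners, is shown below; the greedy selection rule is an
+ illustrative assumption, not the paper's surplus-maximising mechanism.
+
+     import torch
+
+     def reverse_auction_select(bids, budget):
+         """bids: {client_id: asking_price}. Greedily accept the cheapest clients
+         until the budget is exhausted (illustrative stand-in only)."""
+         winners, spent = [], 0.0
+         for cid, price in sorted(bids.items(), key=lambda kv: kv[1]):
+             if spent + price > budget:
+                 break
+             winners.append(cid)
+             spent += price
+         return winners
+
+     def fedavg(client_states, client_sizes):
+         """Standard FedAvg: average parameters weighted by local dataset size."""
+         total = float(sum(client_sizes))
+         avg = {}
+         for key in client_states[0]:
+             avg[key] = sum(n / total * s[key].float()
+                            for n, s in zip(client_sizes, client_states))
+         return avg
+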
+
+ comment: This paper has been accepted on 2024 Blockchain and Web3 Technology + Innovation and Application Exchange Conference (BWTAC 2024) +
+
+
+
+
+ + ♻ ☆ rerankers: A Lightweight Python Library to Unify Ranking Methods + + +
+ This paper presents rerankers, a Python library which provides an easy-to-use
+interface to the most commonly used re-ranking approaches. Re-ranking is an
+integral component of many retrieval pipelines; however, there exist numerous
+approaches to it, relying on different implementation methods. rerankers
+unifies these methods into a single user-friendly interface, allowing
+practitioners and researchers alike to explore different methods while only
+changing a single line of Python code. Moreover, rerankers ensures that its
+implementations are done with the fewest dependencies possible, and re-uses the
+original implementation whenever possible, guaranteeing that our simplified
+interface results in no performance degradation compared to more complex ones.
+The full source code and list of supported models are updated regularly and
+available at https://github.com/answerdotai/rerankers.
+ 
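+ A usage sketch in the spirit of the described interface is shown below; the
+ model name passed to Reranker and the exact keyword arguments are assumptions
+ based on the library's documentation, so consult the repository for the current
+ API.
+
+     from rerankers import Reranker
+
+     # Swapping re-ranking approaches is meant to be a one-line change.
+     ranker = Reranker("cross-encoder")   # assumed default cross-encoder setting
+
+     results = ranker.rank(
+         query="What is the capital of France?",
+         docs=["Paris is the capital of France.", "Berlin is a large city."],
+     )
+     print(results)
+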
+
+
+
+
+ + ♻ ☆ Impedance vs. Power Side-channel Vulnerabilities: A Comparative Study + + +
+ In recent times, impedance side-channel analysis has emerged as a potent +strategy for adversaries seeking to extract sensitive information from +computing systems. It leverages variations in the intrinsic impedance of a +chip's internal structure across different logic states. In this study, we +conduct a comparative analysis between the newly explored impedance side +channel and the well-established power side channel. Through experimental +evaluation, we investigate the efficacy of these two side channels in +extracting the cryptographic key from the Advanced Encryption Standard (AES) +and analyze their performance. Our results indicate that impedance analysis +demonstrates a higher potential for cryptographic key extraction compared to +power side-channel analysis. Moreover, we identify scenarios where power +side-channel analysis does not yield satisfactory results, whereas impedance +analysis proves to be more robust and effective. This work not only underscores +the significance of impedance side-channel analysis in enhancing cryptographic +security but also emphasizes the necessity for a deeper understanding of its +mechanisms and implications. + +
+
+
+
+
+
+
+
+ + Machine Learning 51 + +
+
+
+ + ☆ Double Machine Learning at Scale to Predict Causal Impact of Customer + Actions ECML + + +
+ The Causal Impact (CI) of customer actions is broadly used across the industry to inform both short- and long-term investment decisions of various types. In this paper, we apply the double machine learning (DML) methodology to estimate the CI values across 100s of customer actions of business interest and 100s of millions of customers. We operationalize DML through a causal ML library based on Spark with a flexible, JSON-driven model configuration approach to estimate CI at scale (i.e., across hundreds of actions and millions of customers). We outline the DML methodology and implementation, and the associated benefits over the traditional potential-outcomes-based CI model. We show population-level as well as customer-level CI values along with confidence intervals. The validation metrics show a 2.2% gain over the baseline methods and a 2.5X gain in the computational time. Our contribution is to advance the scalable application of CI, while also providing an interface that allows faster experimentation, cross-platform support, the ability to onboard new use cases, and improved accessibility of the underlying code for partner teams.
+
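For readers unfamiliar with DML, the following toy sketch shows the standard partialling-out estimator with cross-fitting on simulated data; it is a minimal illustration of the methodology, not the Spark-based, JSON-configured library described in the paper.

```python
# Minimal double machine learning sketch (partialling-out with cross-fitting)
# using scikit-learn on simulated data; the true causal impact is 2.0.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
n, p = 5000, 10
X = rng.normal(size=(n, p))                              # customer features (confounders)
T = (X[:, 0] + rng.normal(size=n) > 0).astype(float)     # customer action (treatment)
Y = 2.0 * T + X[:, 0] + rng.normal(size=n)               # outcome; true CI = 2.0

t_res, y_res = np.zeros(n), np.zeros(n)
for train, test in KFold(n_splits=2, shuffle=True, random_state=0).split(X):
    # residualize treatment and outcome on the confounders (cross-fitted)
    t_res[test] = T[test] - GradientBoostingRegressor().fit(X[train], T[train]).predict(X[test])
    y_res[test] = Y[test] - GradientBoostingRegressor().fit(X[train], Y[train]).predict(X[test])

theta = (t_res @ y_res) / (t_res @ t_res)                 # causal impact estimate
print(f"estimated CI: {theta:.2f}")                       # close to 2.0
```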
+
+ comment: 16 pages, 11 figures. Accepted at the European Conference on Machine + Learning and Principles and Practice of Knowledge Discovery in Databases + (ECML PKDD) 2023, Turin, Italy +
+
+
+
+
+ + ☆ Generative Principal Component Regression via Variational Inference + + +
+ The ability to manipulate complex systems, such as the brain, to modify specific outcomes has far-reaching implications, particularly in the treatment of psychiatric disorders. One approach to designing appropriate manipulations is to target key features of predictive models. While generative latent variable models, such as probabilistic principal component analysis (PPCA), are powerful tools for identifying targets, they struggle to incorporate information relevant to low-variance outcomes into the latent space. When stimulation targets are designed in the latent space in such a scenario, the intervention can be suboptimal with minimal efficacy. To address this problem, we develop a novel objective based on supervised variational autoencoders (SVAEs) that ensures such information is represented in the latent space. The novel objective can be used with linear models, such as PPCA, which we refer to as generative principal component regression (gPCR). We show in simulations that gPCR dramatically improves target selection in manipulation as compared to standard PCR and SVAEs. As part of these simulations, we develop a metric for detecting when relevant information is not properly incorporated into the loadings. We then show on two neural datasets related to stress and social behavior that gPCR dramatically outperforms PCR in predictive performance and that SVAEs incorporate little of the relevant information into the loadings. Overall, this work suggests that our method significantly improves target selection for manipulation using latent variable models over competitor inference schemes.
+
+
+
+
+
+ + ☆ TimeDiT: General-purpose Diffusion Transformers for Time Series + Foundation Model ICML 2024 + + +
+ With recent advances in building foundation models for texts and video data, there is a surge of interest in foundation models for time series. A family of models has been developed, utilizing a temporal auto-regressive generative Transformer architecture, whose effectiveness has been proven in Large Language Models. While the empirical results are promising, almost all existing time series foundation models have only been tested on well-curated ``benchmark'' datasets very similar to texts. However, real-world time series exhibit unique challenges, such as variable channel sizes across domains, missing values, and varying signal sampling intervals due to the multi-resolution nature of real-world data. Additionally, the uni-directional nature of temporally auto-regressive decoding limits the incorporation of domain knowledge, such as physical laws expressed as partial differential equations (PDEs). To address these challenges, we introduce the Time Diffusion Transformer (TimeDiT), a general foundation model for time series that employs a denoising diffusion paradigm instead of temporal auto-regressive generation. TimeDiT leverages the Transformer architecture to capture temporal dependencies and employs diffusion processes to generate high-quality candidate samples without imposing stringent assumptions on the target distribution via novel masking schemes and a channel alignment strategy. Furthermore, we propose a finetuning-free model editing strategy that allows the seamless integration of external knowledge during the sampling process without updating any model parameters. Extensive experiments conducted on a variety of tasks such as forecasting, imputation, and anomaly detection, demonstrate the effectiveness of TimeDiT.
+
+
+ comment: 23 Pages, 6 Figures, 11 Tables. First present at ICML 2024 Workshop + on Foundation Models in the Wild +
+
+
+
+
+ + ☆ On the Benefits of Memory for Modeling Time-Dependent PDEs + + +
+ Data-driven techniques have emerged as a promising alternative to traditional +numerical methods for solving partial differential equations (PDEs). These +techniques frequently offer a better trade-off between computational cost and +accuracy for many PDE families of interest. For time-dependent PDEs, existing +methodologies typically treat PDEs as Markovian systems, i.e., the evolution of +the system only depends on the ``current state'', and not the past states. +However, distortion of the input signals -- e.g., due to discretization or +low-pass filtering -- can render the evolution of the distorted signals +non-Markovian. In this work, motivated by the Mori-Zwanzig theory of model +reduction, we investigate the impact of architectures with memory for modeling +PDEs: that is, when past states are explicitly used to predict the future. We +introduce Memory Neural Operator (MemNO), a network based on the recent SSM +architectures and Fourier Neural Operator (FNO). We empirically demonstrate on +a variety of PDE families of interest that when the input is given on a +low-resolution grid, MemNO significantly outperforms the baselines without +memory, achieving more than 6 times less error on unseen PDEs. Via a +combination of theory and experiments, we show that the effect of memory is +particularly significant when the solution of the PDE has high frequency +Fourier components (e.g., low-viscosity fluid dynamics), and it also increases +robustness to observation noise. + +
+
+
+
+
+ + ☆ QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of + DWI Data MICCAI 2024 + + +
+ We propose an image-conditioned diffusion model to estimate high angular +resolution diffusion weighted imaging (DWI) from a low angular resolution +acquisition. Our model, which we call QID$^2$, takes as input a set of low +angular resolution DWI data and uses this information to estimate the DWI data +associated with a target gradient direction. We leverage a U-Net architecture +with cross-attention to preserve the positional information of the reference +images, further guiding the target image generation. We train and evaluate +QID$^2$ on single-shell DWI samples curated from the Human Connectome Project +(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to +produce low angular resolution DWI data and train QID$^2$ to reconstruct the +missing high angular resolution samples. We compare QID$^2$ with two +state-of-the-art GAN models. Our results demonstrate that QID$^2$ not only +achieves higher-quality generated images, but it consistently outperforms the +GAN models in downstream tensor estimation across multiple metrics. Taken +together, this study highlights the potential of diffusion models, and QID$^2$ +in particular, for q-space up-sampling, thus offering a promising toolkit for +clinical and research applications. + +
+
+ comment: Accepted at MICCAI 2024 International Workshop on Computational + Diffusion MRI. Zijian Chen and Jueqi Wang contributed equally to this work +
+
+
+
+
+ + ☆ A Lesion-aware Edge-based Graph Neural Network for Predicting Language + Ability in Patients with Post-stroke Aphasia MICCAI 2024 + + +
+ We propose a lesion-aware graph neural network (LEGNet) to predict language +ability from resting-state fMRI (rs-fMRI) connectivity in patients with +post-stroke aphasia. Our model integrates three components: an edge-based +learning module that encodes functional connectivity between brain regions, a +lesion encoding module, and a subgraph learning module that leverages +functional similarities for prediction. We use synthetic data derived from the +Human Connectome Project (HCP) for hyperparameter tuning and model pretraining. +We then evaluate the performance using repeated 10-fold cross-validation on an +in-house neuroimaging dataset of post-stroke aphasia. Our results demonstrate +that LEGNet outperforms baseline deep learning methods in predicting language +ability. LEGNet also exhibits superior generalization ability when tested on a +second in-house dataset that was acquired under a slightly different +neuroimaging protocol. Taken together, the results of this study highlight the +potential of LEGNet in effectively learning the relationships between rs-fMRI +connectivity and language ability in a patient cohort with brain lesions for +improved post-stroke aphasia evaluation. + +
+
+ comment: Accepted at MICCAI 2024 International Workshop on Machine Learning in + Clinical Neuroimaging (MLCN) +
+
+
+
+
+ + ☆ K-Origins: Better Colour Quantification for Neural Networks + + +
+ K-Origins is a neural network layer designed to improve the performance of image-based networks when learning colour, or intensity, is beneficial. Over 250 encoder-decoder convolutional networks are trained and tested on 16-bit synthetic data, demonstrating that K-Origins improves semantic segmentation accuracy in two scenarios: object detection with low signal-to-noise ratios, and segmenting multiple objects that are identical in shape but vary in colour. K-Origins generates output features from the input features, $\textbf{X}$, by the equation $\textbf{Y}_k = \textbf{X}-\textbf{J}\cdot w_k$ for each trainable parameter $w_k$, where $\textbf{J}$ is a matrix of ones. Additionally, networks with varying receptive fields were trained to determine optimal network depths based on the dimensions of target classes, suggesting that receptive field lengths should exceed object sizes. By ensuring a sufficient receptive field length and incorporating K-Origins, we can achieve better semantic network performance.
+
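The stated equation translates almost directly into a layer; the sketch below is one possible PyTorch rendering, where the class name, the initialisation of the origins $w_k$, and the single-channel input assumption are mine rather than the original implementation's.

```python
# Direct transcription of Y_k = X - J * w_k (J = all-ones) into a PyTorch layer.
import torch
import torch.nn as nn

class KOrigins(nn.Module):
    """For each trainable origin w_k, output one shifted copy of the input."""
    def __init__(self, num_origins: int):
        super().__init__()
        # evenly spaced initial origins across the normalised intensity range (assumption)
        self.w = nn.Parameter(torch.linspace(0.0, 1.0, num_origins))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, 1, H, W) intensities -> (batch, num_origins, H, W)
        return x - self.w.view(1, -1, 1, 1)

layer = KOrigins(num_origins=4)
y = layer(torch.rand(2, 1, 8, 8))
print(y.shape)  # torch.Size([2, 4, 8, 8])
```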
+
+ comment: 16 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Reinforcement Learning-enabled Satellite Constellation Reconfiguration + and Retasking for Mission-Critical Applications + + +
+ The development of satellite constellation applications is rapidly advancing +due to increasing user demands, reduced operational costs, and technological +advancements. However, a significant gap in the existing literature concerns +reconfiguration and retasking issues within satellite constellations, which is +the primary focus of our research. In this work, we critically assess the +impact of satellite failures on constellation performance and the associated +task requirements. To facilitate this analysis, we introduce a system modeling +approach for GPS satellite constellations, enabling an investigation into +performance dynamics and task distribution strategies, particularly in +scenarios where satellite failures occur during mission-critical operations. +Additionally, we introduce reinforcement learning (RL) techniques, specifically +Q-learning, Policy Gradient, Deep Q-Network (DQN), and Proximal Policy +Optimization (PPO), for managing satellite constellations, addressing the +challenges posed by reconfiguration and retasking following satellite failures. +Our results demonstrate that DQN and PPO achieve effective outcomes in terms of +average rewards, task completion rates, and response times. + +
+
+ comment: Accepted for publication in the IEEE Military Communications + Conference (IEEE MILCOM 2024) +
+
+
+
+
+ + ♻ ☆ PID Accelerated Temporal Difference Algorithms + + +
+ Long-horizon tasks, which have a large discount factor, pose a challenge for +most conventional reinforcement learning (RL) algorithms. Algorithms such as +Value Iteration and Temporal Difference (TD) learning have a slow convergence +rate and become inefficient in these tasks. When the transition distributions +are given, PID VI was recently introduced to accelerate the convergence of +Value Iteration using ideas from control theory. Inspired by this, we introduce +PID TD Learning and PID Q-Learning algorithms for the RL setting, in which only +samples from the environment are available. We give a theoretical analysis of +the convergence of PID TD Learning and its acceleration compared to the +conventional TD Learning. We also introduce a method for adapting PID gains in +the presence of noise and empirically verify its effectiveness. + +
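A hedged tabular sketch of what a PID-style TD update can look like is given below: the sampled TD error plays the role of the proportional term, a running sum of TD errors the integral term, and the change in the value estimate the derivative term. The gain names, the integral decay, and the exact update form are assumptions for illustration and are not taken from the paper.

```python
# Illustrative tabular PID-style TD learning (update form is an assumption).
import numpy as np

def pid_td(episodes, n_states, gamma=0.99, lr=0.1,
           kp=1.0, ki=0.05, kd=0.1, beta=0.95):
    V = np.zeros(n_states)
    V_prev = np.zeros(n_states)          # previous value estimate (derivative term)
    z = np.zeros(n_states)               # accumulated TD errors (integral term)
    for episode in episodes:             # episode: list of (s, r, s_next, done)
        for s, r, s_next, done in episode:
            delta = r + (0.0 if done else gamma * V[s_next]) - V[s]   # proportional term
            z[s] = beta * z[s] + delta
            update = kp * delta + ki * z[s] + kd * (V[s] - V_prev[s])
            V_prev[s] = V[s]
            V[s] += lr * update
    return V

# toy 2-state chain: state 0 -> state 1 (reward 0), state 1 -> terminal (reward 1)
episodes = [[(0, 0.0, 1, False), (1, 1.0, 1, True)]] * 200
print(pid_td(episodes, n_states=2).round(2))
```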
+
+
+
+
+ + ♻ ☆ Improving Rare Word Translation With Dictionaries and Attention Masking + + +
+ In machine translation, rare words continue to be a problem for the dominant +encoder-decoder architecture, especially in low-resource and out-of-domain +translation settings. Human translators solve this problem with monolingual or +bilingual dictionaries. In this paper, we propose appending definitions from a +bilingual dictionary to source sentences and using attention masking to link +together rare words with their definitions. We find that including definitions +for rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1. + +
+
+ comment: 11 pages, 3 figures, 3 tables. Accepted at AMTA 2024 +
+
+
+
+
+ + ♻ ☆ Low-Rank Quantization-Aware Training for LLMs + + +
+ Large language models (LLMs) are omnipresent; however, their practical deployment is challenging due to their ever-increasing computational and memory demands. Quantization is one of the most effective ways to make them more compute and memory efficient. Quantization-aware training (QAT) methods generally produce the best quantized performance; however, this comes at the cost of potentially long training times and excessive memory usage, making QAT impractical to apply to LLMs. Inspired by parameter-efficient fine-tuning (PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a lightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several components to save memory without sacrificing predictive performance: (a) low-rank auxiliary weights that are aware of the quantization grid; (b) a downcasting operator using fixed-point or double-packed integers and (c) checkpointing. Unlike most related work, our method (i) is inference-efficient, leading to no additional overhead compared to traditional PTQ; (ii) can be seen as a general extended pretraining framework, meaning that the resulting model can still be utilized for any downstream task afterwards; (iii) can be applied across a wide range of quantization settings, such as different choices of quantization granularity and activation quantization, and can be seamlessly combined with many PTQ techniques. We apply LR-QAT to LLaMA-1/2/3 and Mistral model families and validate its effectiveness on several downstream tasks. Our method outperforms common post-training quantization (PTQ) approaches and reaches the same model performance as full-model QAT at a fraction of its memory usage. Specifically, we can train a 7B LLM on a single consumer grade GPU with 24GB of memory. Our source code is available at https://github.com/qualcomm-ai-research/LR-QAT
+
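The core idea of combining a frozen pretrained weight with trainable low-rank auxiliary weights on the quantization grid can be sketched as follows; this is a conceptual illustration only and omits the downcasting operator and checkpointing components (all names are assumptions, not the released implementation).

```python
# Conceptual low-rank QAT sketch: frozen pretrained weight + trainable low-rank
# correction, quantized together with a straight-through estimator.
import torch
import torch.nn as nn

class LowRankQATLinear(nn.Module):
    def __init__(self, weight: torch.Tensor, rank: int = 8, n_bits: int = 4):
        super().__init__()
        self.register_buffer("w0", weight)                     # frozen pretrained weight
        out_f, in_f = weight.shape
        self.A = nn.Parameter(torch.zeros(out_f, rank))        # low-rank auxiliary weights
        self.B = nn.Parameter(torch.randn(rank, in_f) * 0.01)
        self.scale = nn.Parameter(weight.abs().max() / (2 ** (n_bits - 1) - 1))
        self.qmax = 2 ** (n_bits - 1) - 1

    def forward(self, x):
        w = self.w0 + self.A @ self.B
        q = torch.clamp(torch.round(w / self.scale), -self.qmax - 1, self.qmax)
        w_q = q * self.scale
        w_q = w + (w_q - w).detach()          # straight-through estimator for the rounding
        return x @ w_q.t()

layer = LowRankQATLinear(torch.randn(16, 32))
print(layer(torch.randn(4, 32)).shape)  # torch.Size([4, 16])
```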
+
+
+
+
+ + ♻ ☆ Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of + Peptides + + +
+ Molecular Dynamics (MD) simulations are irreplaceable and ubiquitous in fields such as materials science, chemistry, and pharmacology, to name a few. Conventional MD simulations are plagued by numerical stability and long equilibration time issues, which limit their broader application. Recently, a surge of deep learning approaches has been devised for time-coarsened dynamics, which learn the state transition mechanism over much larger time scales to overcome these limitations. However, only a few methods target the underlying Boltzmann distribution by resampling techniques, where proposals are rarely accepted as new states, resulting in low efficiency. In this work, we propose a force-guided bridge matching model, FBM, a novel framework that is the first to incorporate physical priors into bridge matching for full-atom time-coarsened dynamics. With the guidance of our well-designed intermediate force field, FBM can target the Boltzmann-like distribution by direct inference without extra steps. Experiments on small peptides verify the superiority of our approach in terms of comprehensive metrics and demonstrate transferability to unseen peptide systems.
+
+
+
+
+
+ + ♻ ☆ Verifiable cloud-based variational quantum algorithms + + +
+ Variational quantum algorithms (VQAs) have shown potential for quantum +advantage with noisy intermediate-scale quantum (NISQ) devices for quantum +machine learning (QML). However, given the high cost and limited availability +of quantum resources, delegating VQAs via cloud networks is a more practical +solution for clients with limited quantum capabilities. Recently, Shingu et +al.[Physical Review A, 105, 022603 (2022)] proposed a variational secure cloud +quantum computing protocol, utilizing ancilla-driven quantum computation (ADQC) +for cloud-based VQAs with minimal quantum resource consumption. However, their +protocol lacks verifiability, which exposes it to potential malicious behaviors +by the server. Additionally, channel loss requires frequent re-delegation as +the size of the delegated variational circuit grows, complicating verification +due to increased circuit complexity. This paper introduces a new protocol to +address these challenges and enhance both verifiability and tolerance to +channel loss in cloud-based VQAs. + +
+
+
+
+
+ + ♻ ☆ Bayesian Learning in a Nonlinear Multiscale State-Space Model + + +
+ The ubiquity of multiscale interactions in complex systems is +well-recognized, with development and heredity serving as a prime example of +how processes at different temporal scales influence one another. This work +introduces a novel multiscale state-space model to explore the dynamic +interplay between systems interacting across different time scales, with +feedback between each scale. We propose a Bayesian learning framework to +estimate unknown states by learning the unknown process noise covariances +within this multiscale model. We develop a Particle Gibbs with Ancestor +Sampling (PGAS) algorithm for inference and demonstrate through simulations the +efficacy of our approach. + +
+
+ comment: Corrected a typo +
+
+
+
+
+ + ♻ ☆ Foundation Models for Music: A Survey + + +
+ In recent years, foundation models (FMs) such as large language models (LLMs) +and latent diffusion models (LDMs) have profoundly impacted diverse sectors, +including music. This comprehensive review examines state-of-the-art (SOTA) +pre-trained models and foundation models in music, spanning from representation +learning, generative learning and multimodal learning. We first contextualise +the significance of music in various industries and trace the evolution of AI +in music. By delineating the modalities targeted by foundation models, we +discover many of the music representations are underexplored in FM development. +Then, emphasis is placed on the lack of versatility of previous methods on +diverse music applications, along with the potential of FMs in music +understanding, generation and medical application. By comprehensively exploring +the details of the model pre-training paradigm, architectural choices, +tokenisation, finetuning methodologies and controllability, we emphasise the +important topics that should have been well explored, like instruction tuning +and in-context learning, scaling law and emergent ability, as well as +long-sequence modelling etc. A dedicated section presents insights into music +agents, accompanied by a thorough analysis of datasets and evaluations +essential for pre-training and downstream tasks. Finally, by underscoring the +vital importance of ethical considerations, we advocate that following research +on FM for music should focus more on such issues as interpretability, +transparency, human responsibility, and copyright issues. The paper offers +insights into future challenges and trends on FMs for music, aiming to shape +the trajectory of human-AI collaboration in the music realm. + +
+
+
+
+
+ + ♻ ☆ Different Victims, Same Layout: Email Visual Similarity Detection for + Enhanced Email Protection CCS 2024 + + +
+ In the pursuit of an effective spam detection system, the focus has often +been on identifying known spam patterns either through rule-based detection +systems or machine learning (ML) solutions that rely on keywords. However, both +systems are susceptible to evasion techniques and zero-day attacks that can be +achieved at low cost. Therefore, an email that bypassed the defense system once +can do it again in the following days, even though rules are updated or the ML +models are retrained. The recurrence of failures to detect emails that exhibit +layout similarities to previously undetected spam is concerning for customers +and can erode their trust in a company. Our observations show that threat +actors reuse email kits extensively and can bypass detection with little +effort, for example, by making changes to the content of emails. In this work, +we propose an email visual similarity detection approach, named Pisco, to +improve the detection capabilities of an email threat defense system. We apply +our proof of concept to some real-world samples received from different +sources. Our results show that email kits are being reused extensively and +visually similar emails are sent to our customers at various time intervals. +Therefore, this method could be very helpful in situations where detection +features that rely on textual features and keywords are bypassed, an occurrence +our observations show happens frequently. + +
+
+ comment: To be published in the proceedings of the ACM Conference on Computer + and Communications Security (ACM CCS 2024) +
+
+
+
+
+ + ♻ ☆ On the Convergence of Gradient Descent for Large Learning Rates + + +
+ A vast literature on convergence guarantees for gradient descent and derived +methods exists at the moment. However, a simple practical situation remains +unexplored: when a fixed step size is used, can we expect gradient descent to +converge starting from any initialization? We provide fundamental impossibility +results showing that convergence becomes impossible no matter the +initialization if the step size gets too big. Looking at the asymptotic value +of the gradient norm along the optimization trajectory, we see that there is a +phase transition as the step size crosses a critical value. This has been +observed by practitioners, yet the true mechanisms through which this happens +remain unclear beyond heuristics. Using results from dynamical systems theory, +we provide a proof of this in the case of linear neural networks with a squared +loss. We also prove the impossibility of convergence for more general losses +without requiring strong assumptions such as Lipschitz continuity for the +gradient. We validate our findings through experiments with non-linear +networks. + +
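The phase transition described above is easy to reproduce on a one-dimensional quadratic, where the critical step size is known in closed form; the snippet below is a toy illustration of the phenomenon, not the paper's linear-network analysis.

```python
# Toy illustration: for f(x) = 0.5 * L * x**2, gradient descent with a fixed step
# converges from any start iff step < 2 / L, and the asymptotic gradient norm
# jumps once that threshold is crossed.
import numpy as np

L = 4.0                                  # curvature; critical step size is 2 / L = 0.5
for step in [0.45, 0.49, 0.51, 0.55]:
    x = 1.0
    for _ in range(200):
        x -= step * L * x                # gradient descent step
        if abs(x) > 1e12:                # stop once clearly diverging
            break
    print(f"step={step:.2f}  |grad| after 200 iters: {abs(L * x):.3e}")
```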
+
+
+
+
+ + ♻ ☆ On the Federated Learning Framework for Cooperative Perception + + +
+ Cooperative perception is essential to enhance the efficiency and safety of +future transportation systems, requiring extensive data sharing among vehicles +on the road, which raises significant privacy concerns. Federated learning +offers a promising solution by enabling data privacy-preserving collaborative +enhancements in perception, decision-making, and planning among connected and +autonomous vehicles (CAVs). However, federated learning is impeded by +significant challenges arising from data heterogeneity across diverse clients, +potentially diminishing model accuracy and prolonging convergence periods. This +study introduces a specialized federated learning framework for CP, termed the +federated dynamic weighted aggregation (FedDWA) algorithm, facilitated by +dynamic adjusting loss (DALoss) function. This framework employs dynamic client +weighting to direct model convergence and integrates a novel loss function that +utilizes Kullback-Leibler divergence (KLD) to counteract the detrimental +effects of non-independently and identically distributed (Non-IID) and +unbalanced data. Utilizing the BEV transformer as the primary model, our +rigorous testing on the OpenV2V dataset, augmented with FedBEVT data, +demonstrates significant improvements in the average intersection over union +(IoU). These results highlight the substantial potential of our federated +learning framework to address data heterogeneity challenges in CP, thereby +enhancing the accuracy of environmental perception models and facilitating more +robust and efficient collaborative learning solutions in the transportation +sector. + +
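A very rough sketch of server-side dynamic weighted aggregation is shown below: client updates are combined with weights that depend on both data size and recent client loss, instead of FedAvg's fixed data-size weights. The specific weighting rule here is an assumption for illustration and is not the FedDWA/DALoss formulation from the paper.

```python
# Illustrative dynamic weighted aggregation (weighting rule is an assumption,
# not the paper's FedDWA scheme): emphasise harder, higher-loss clients.
import numpy as np

def dynamic_weighted_aggregate(client_params, data_sizes, client_losses, temperature=1.0):
    """client_params: list of 1-D parameter vectors; returns the aggregated vector."""
    sizes = np.asarray(data_sizes, dtype=float)
    losses = np.asarray(client_losses, dtype=float)
    score = sizes * np.exp(losses / temperature)
    weights = score / score.sum()
    return np.average(np.stack(client_params), axis=0, weights=weights)

params = [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([0.5, 0.5])]
print(dynamic_weighted_aggregate(params,
                                 data_sizes=[100, 50, 200],
                                 client_losses=[0.9, 0.4, 0.6]))
```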
+
+ comment: accepted by IEEE RA-L +
+
+
+
+
+ + ♻ ☆ An embedding-based distance for temporal graphs + + +
+ Temporal graphs are commonly used to represent time-resolved relations +between entities in many natural and artificial systems. Many techniques were +devised to investigate the evolution of temporal graphs by comparing their +state at different time points. However, quantifying the similarity between +temporal graphs as a whole is an open problem. Here, we use embeddings based on +time-respecting random walks to introduce a new notion of distance between +temporal graphs. This distance is well-defined for pairs of temporal graphs +with different numbers of nodes and different time spans. We study the case of +a matched pair of graphs, when a known relation exists between their nodes, and +the case of unmatched graphs, when such a relation is unavailable and the +graphs may be of different sizes. We use empirical and synthetic temporal +network data to show that the distance we introduce discriminates graphs with +different topological and temporal properties. We provide an efficient +implementation of the distance computation suitable for large-scale temporal +graphs. + +
+
+
+
+
+ + ♻ ☆ Heterogeneity-Informed Meta-Parameter Learning for Spatiotemporal Time + Series Forecasting KDD'24 + + +
+ Spatiotemporal time series forecasting plays a key role in a wide range of +real-world applications. While significant progress has been made in this area, +fully capturing and leveraging spatiotemporal heterogeneity remains a +fundamental challenge. Therefore, we propose a novel Heterogeneity-Informed +Meta-Parameter Learning scheme. Specifically, our approach implicitly captures +spatiotemporal heterogeneity through learning spatial and temporal embeddings, +which can be viewed as a clustering process. Then, a novel spatiotemporal +meta-parameter learning paradigm is proposed to learn spatiotemporal-specific +parameters from meta-parameter pools, which is informed by the captured +heterogeneity. Based on these ideas, we develop a Heterogeneity-Informed +Spatiotemporal Meta-Network (HimNet) for spatiotemporal time series +forecasting. Extensive experiments on five widely-used benchmarks demonstrate +our method achieves state-of-the-art performance while exhibiting superior +interpretability. Our code is available at +https://github.com/XDZhelheim/HimNet. + +
+
+ comment: Published in KDD'24 Research Track +
+
+
+
+
+ + ♻ ☆ FairX: A comprehensive benchmarking tool for model analysis using + fairness, utility, and explainability + + +
+ We present FairX, an open-source Python-based benchmarking tool designed for the comprehensive analysis of models under the umbrella of fairness, utility, and eXplainability (XAI). FairX enables users to train benchmark bias-mitigation models, evaluate their fairness using a wide array of fairness and data utility metrics, and generate explanations for model predictions, all within a unified framework. Existing benchmarking tools have no way to evaluate synthetic data generated by fair generative models, nor do they support training fair generative models. In FairX, we add fair generative models to our fair-model library (pre-processing, in-processing, post-processing), along with evaluation metrics for assessing the quality of synthetic fair data. This version of FairX supports both tabular and image datasets. It also allows users to provide their own custom datasets. The open-source FairX benchmarking package is publicly available at \url{https://github.com/fahim-sikder/FairX}.
+
+
+
+
+
+ + ♻ ☆ Behavioral Learning of Dish Rinsing and Scrubbing based on Interruptive + Direct Teaching Considering Assistance Rate + + +
+ Robots are expected to manipulate objects in a safe and dexterous way. For +example, washing dishes is a dexterous operation that involves scrubbing the +dishes with a sponge and rinsing them with water. It is necessary to learn it +safely without splashing water and without dropping the dishes. In this study, +we propose a safe and dexterous manipulation system. The robot learns a +dynamics model of the object by estimating the state of the object and the +robot itself, the control input, and the amount of human assistance required +(assistance rate) after the human corrects the initial trajectory of the +robot's hands by interruptive direct teaching. By backpropagating the error +between the estimated and the reference value using the acquired dynamics +model, the robot can generate a control input that approaches the reference +value, for example, so that human assistance is not required and the dish does +not move excessively. This allows for adaptive rinsing and scrubbing of dishes +with unknown shapes and properties. As a result, it is possible to generate +safe actions that require less human assistance. + +
+
+ comment: Accepted at Advanced Robotics +
+
+
+
+
+ + ♻ ☆ Towards Explainable Traffic Flow Prediction with Large Language Models + + +
+ Traffic forecasting is crucial for intelligent transportation systems. It has +experienced significant advancements thanks to the power of deep learning in +capturing latent patterns of traffic data. However, recent deep-learning +architectures require intricate model designs and lack an intuitive +understanding of the mapping from input data to predicted results. Achieving +both accuracy and explainability in traffic prediction models remains a +challenge due to the complexity of traffic data and the inherent opacity of +deep learning models. To tackle these challenges, we propose a Traffic flow +Prediction model based on Large Language Models (LLMs) to generate explainable +traffic predictions, named xTP-LLM. By transferring multi-modal traffic data +into natural language descriptions, xTP-LLM captures complex time-series +patterns and external factors from comprehensive traffic data. The LLM +framework is fine-tuned using language-based instructions to align with +spatial-temporal traffic flow data. Empirically, xTP-LLM shows competitive +accuracy compared with deep learning baselines, while providing an intuitive +and reliable explanation for predictions. This paper contributes to advancing +explainable traffic prediction models and lays a foundation for future +exploration of LLM applications in transportation. To the best of our +knowledge, this is the first study to use LLM for explainable prediction of +traffic flows. + +
+
+ comment: 31pages, 16 figures +
+
+
+
+
+ + ♻ ☆ OceanGPT: A Large Language Model for Ocean Science Tasks ACL2024 + + +
+ Ocean science, which delves into the oceans that are reservoirs of life and biodiversity, is of great significance given that oceans cover over 70% of our planet's surface. Recently, advances in Large Language Models (LLMs) have transformed the paradigm in science. Despite the success in other domains, current LLMs often fall short in catering to the needs of domain experts like oceanographers, and the potential of LLMs for ocean science is under-explored. The intrinsic reasons are the immense and intricate nature of ocean data as well as the necessity for higher granularity and richness in knowledge. To alleviate these issues, we introduce OceanGPT, the first-ever large language model in the ocean domain, which is expert in various ocean science tasks. We also propose a novel framework to automatically obtain a large volume of ocean domain instruction data, which generates instructions based on multi-agent collaboration. Additionally, we construct the first oceanography benchmark, OceanBench, to evaluate the capabilities of LLMs in the ocean domain. Through comprehensive experiments, OceanGPT not only shows a higher level of knowledge expertise for ocean science tasks but also gains preliminary embodied intelligence capabilities in ocean technology.
+
+
+ comment: ACL2024. Project Website: http://oceangpt.zjukg.cn/ +
+
+
+
+
+ + ♻ ☆ Statistical Context Detection for Deep Lifelong Reinforcement Learning + + +
+ Context detection involves labeling segments of an online stream of data as +belonging to different tasks. Task labels are used in lifelong learning +algorithms to perform consolidation or other procedures that prevent +catastrophic forgetting. Inferring task labels from online experiences remains +a challenging problem. Most approaches assume finite and low-dimension +observation spaces or a preliminary training phase during which task labels are +learned. Moreover, changes in the transition or reward functions can be +detected only in combination with a policy, and therefore are more difficult to +detect than changes in the input distribution. This paper presents an approach +to learning both policies and labels in an online deep reinforcement learning +setting. The key idea is to use distance metrics, obtained via optimal +transport methods, i.e., Wasserstein distance, on suitable latent action-reward +spaces to measure distances between sets of data points from past and current +streams. Such distances can then be used for statistical tests based on an +adapted Kolmogorov-Smirnov calculation to assign labels to sequences of +experiences. A rollback procedure is introduced to learn multiple policies by +ensuring that only the appropriate data is used to train the corresponding +policy. The combination of task detection and policy deployment allows for the +optimization of lifelong reinforcement learning agents without an oracle that +provides task labels. The approach is tested using two benchmarks and the +results show promising performance when compared with related context detection +algorithms. The results suggest that optimal transport statistical methods +provide an explainable and justifiable procedure for online context detection +and reward optimization in lifelong reinforcement learning. + +
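A toy one-dimensional version of the detection mechanism is sketched below: a sliding window of recent latent statistics is compared against a reference window with an optimal-transport distance and a two-sample KS test, and a new context is flagged when the test rejects. The actual method works on multi-dimensional latent action-reward sets with an adapted KS calculation, so this is only a schematic illustration.

```python
# Schematic context-change detection on a 1-D stream of latent statistics.
import numpy as np
from scipy.stats import ks_2samp, wasserstein_distance

rng = np.random.default_rng(1)
reference = rng.normal(0.0, 1.0, size=500)            # latent statistics under task A
stream = np.concatenate([rng.normal(0.0, 1.0, 300),   # still task A
                         rng.normal(2.0, 1.0, 300)])  # task switch

window = 100
for start in range(0, len(stream) - window + 1, window):
    cur = stream[start:start + window]
    stat, p = ks_2samp(reference, cur)                 # two-sample KS test
    dist = wasserstein_distance(reference, cur)        # optimal-transport distance
    flag = "NEW CONTEXT" if p < 0.01 else "same"
    print(f"t={start:3d}  W1={dist:.2f}  KS p={p:.1e}  -> {flag}")
```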
+
+ comment: 10 pages excluding references and bibliography. Accepted at CoLLAs + 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Cell Tracking with a Time-Symmetric Deep Learning Approach + + +
+ The accurate tracking of live cells using video microscopy recordings remains +a challenging task for popular state-of-the-art image processing based object +tracking methods. In recent years, several existing and new applications have +attempted to integrate deep-learning based frameworks for this task, but most +of them still heavily rely on consecutive frame based tracking embedded in +their architecture or other premises that hinder generalized learning. To +address this issue, we aimed to develop a new deep-learning based tracking +method that relies solely on the assumption that cells can be tracked based on +their spatio-temporal neighborhood, without restricting it to consecutive +frames. The proposed method has the additional benefit that the motion patterns +of the cells can be learned completely by the predictor without any prior +assumptions, and it has the potential to handle a large number of video frames +with heavy artifacts. The efficacy of the proposed method is demonstrated +through biologically motivated validation strategies and compared against +multiple state-of-the-art cell tracking methods. + +
+
+
+
+
+ + ♻ ☆ Controllable Edge-Type-Specific Interpretation in Multi-Relational Graph + Neural Networks for Drug Response Prediction + + +
+ Graph Neural Networks have been widely applied in critical decision-making +areas that demand interpretable predictions, leading to the flourishing +development of interpretability algorithms. However, current graph +interpretability algorithms tend to emphasize generality and often overlook +biological significance, thereby limiting their applicability in predicting +cancer drug responses. In this paper, we propose a novel post-hoc +interpretability algorithm for cancer drug response prediction, CETExplainer, +which incorporates a controllable edge-type-specific weighting mechanism. It +considers the mutual information between subgraphs and predictions, proposing a +structural scoring approach to provide fine-grained, biologically meaningful +explanations for predictive models. We also introduce a method for constructing +ground truth based on real-world datasets to quantitatively evaluate the +proposed interpretability algorithm. Empirical analysis on the real-world +dataset demonstrates that CETExplainer achieves superior stability and improves +explanation quality compared to leading algorithms, thereby offering a robust +and insightful tool for cancer drug prediction. + +
+
+
+
+
+ + ♻ ☆ GANs Conditioning Methods: A Survey + + +
+ In recent years, Generative Adversarial Networks (GANs) have seen significant +advancements, leading to their widespread adoption across various fields. The +original GAN architecture enables the generation of images without any specific +control over the content, making it an unconditional generation process. +However, many practical applications require precise control over the generated +output, which has led to the development of conditional GANs (cGANs) that +incorporate explicit conditioning to guide the generation process. cGANs extend +the original framework by incorporating additional information (conditions), +enabling the generation of samples that adhere to that specific criteria. +Various conditioning methods have been proposed, each differing in how they +integrate the conditioning information into both the generator and the +discriminator networks. In this work, we review the conditioning methods +proposed for GANs, exploring the characteristics of each method and +highlighting their unique mechanisms and theoretical foundations. Furthermore, +we conduct a comparative analysis of these methods, evaluating their +performance on various image datasets. Through these analyses, we aim to +provide insights into the strengths and limitations of various conditioning +techniques, guiding future research and application in generative modeling. + +
+
+
+
+
+ + ♻ ☆ A Survey on Stability of Learning with Limited Labelled Data and its + Sensitivity to the Effects of Randomness + + +
+ Learning with limited labelled data, such as prompting, in-context learning, +fine-tuning, meta-learning or few-shot learning, aims to effectively train a +model using only a small amount of labelled samples. However, these approaches +have been observed to be excessively sensitive to the effects of uncontrolled +randomness caused by non-determinism in the training process. The randomness +negatively affects the stability of the models, leading to large variances in +results across training runs. When such sensitivity is disregarded, it can +unintentionally, but unfortunately also intentionally, create an imaginary +perception of research progress. Recently, this area started to attract +research attention and the number of relevant studies is continuously growing. +In this survey, we provide a comprehensive overview of 415 papers addressing +the effects of randomness on the stability of learning with limited labelled +data. We distinguish between four main tasks addressed in the papers +(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness +effects), providing findings for each one. Furthermore, we identify and discuss +seven challenges and open problems together with possible directions to +facilitate further research. The ultimate goal of this survey is to emphasise +the importance of this growing research area, which so far has not received an +appropriate level of attention, and reveal impactful directions for future +research. + +
+
+ comment: Accepted to ACM Comput. Surv. 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Heterogeneous Graph Learning via Random Projection + + +
+ Heterogeneous Graph Neural Networks (HGNNs) are powerful tools for deep learning on heterogeneous graphs. Typical HGNNs require repetitive message passing during training, limiting efficiency for large-scale real-world graphs. Recent pre-computation-based HGNNs use one-time message passing to transform a heterogeneous graph into regular-shaped tensors, enabling efficient mini-batch training. Existing pre-computation-based HGNNs can be mainly categorized into two styles, which differ in how much information loss is allowed and in their efficiency. We propose a hybrid pre-computation-based HGNN, named Random Projection Heterogeneous Graph Neural Network (RpHGNN), which combines the efficiency of one style with the low information loss of the other. To achieve efficiency, the main framework of RpHGNN consists of propagate-then-update iterations, where we introduce a Random Projection Squashing step to ensure that complexity increases only linearly. To achieve low information loss, we introduce a Relation-wise Neighbor Collection component with an Even-odd Propagation Scheme, which aims to collect information from neighbors in a finer-grained way. Experimental results indicate that our approach achieves state-of-the-art results on seven small and large benchmark datasets while also being 230% faster than the most effective baseline. Surprisingly, our approach not only surpasses pre-processing-based baselines but also outperforms end-to-end methods.
+
+
+ comment: Accepted by IEEE Transactions on Knowledge and Data Engineering + (TKDE) +
+
+
+
+
+ + ♻ ☆ KTO: Model Alignment as Prospect Theoretic Optimization ICML 2024 + + +
+ Kahneman & Tversky's $\textit{prospect theory}$ tells us that humans perceive +random variables in a biased but well-defined manner (1992); for example, +humans are famously loss-averse. We show that objectives for aligning LLMs with +human feedback implicitly incorporate many of these biases -- the success of +these objectives (e.g., DPO) over cross-entropy minimization can partly be +ascribed to them belonging to a family of loss functions that we call +$\textit{human-aware losses}$ (HALOs). However, the utility functions these +methods attribute to humans still differ from those in the prospect theory +literature. Using a Kahneman-Tversky model of human utility, we propose a HALO +that directly maximizes the utility of generations instead of maximizing the +log-likelihood of preferences, as current methods do. We call this approach +KTO, and it matches or exceeds the performance of preference-based methods at +scales from 1B to 30B, despite only learning from a binary signal of whether an +output is desirable. More broadly, our work suggests that there is no one HALO +that is universally superior; the best loss depends on the inductive biases +most appropriate for a given setting, an oft-overlooked consideration. + +
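A schematic sketch of a KTO-style objective is given below: the implicit reward is the policy-to-reference log-probability ratio, and each example contributes a prospect-theory-style value relative to a reference point, depending only on whether it is labelled desirable or undesirable (no preference pairs). The reference-point estimate and weighting here are simplified assumptions, not the paper's exact loss.

```python
# Schematic KTO-style loss on sequence log-probabilities (simplified assumptions).
import torch

def kto_loss(policy_logps, ref_logps, desirable, beta=0.1,
             lambda_d=1.0, lambda_u=1.0):
    """policy_logps / ref_logps: (batch,) sequence log-probs; desirable: bool tensor."""
    reward = policy_logps - ref_logps                 # implicit reward r_theta
    z_ref = reward.mean().detach()                    # crude stand-in for the reference point
    value_d = torch.sigmoid(beta * (reward - z_ref))  # value of desirable outputs
    value_u = torch.sigmoid(beta * (z_ref - reward))  # value of undesirable outputs
    loss = torch.where(desirable, lambda_d * (1 - value_d), lambda_u * (1 - value_u))
    return loss.mean()

policy = torch.tensor([-12.0, -30.0, -8.0])
ref = torch.tensor([-14.0, -25.0, -9.0])
print(kto_loss(policy, ref, desirable=torch.tensor([True, False, True])))
```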
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ End-to-end Feature Selection Approach for Learning Skinny Trees AISTATS 2024 + + +
+ We propose a new optimization-based approach for feature selection in tree ensembles, an important problem in statistics and machine learning. Popular tree ensemble toolkits, e.g., Gradient Boosted Trees and Random Forests, support post-training feature selection based on feature importance scores; while widely used, these scores are known to have drawbacks. We propose Skinny Trees: an end-to-end toolkit for feature selection in tree ensembles where we train a tree ensemble while controlling the number of selected features. Our optimization-based approach learns an ensemble of differentiable trees, and simultaneously performs feature selection using a grouped $\ell_0$-regularizer. We use first-order methods for optimization and present convergence guarantees for our approach. We use a dense-to-sparse regularization scheduling scheme that can lead to more expressive and sparser tree ensembles. On 15 synthetic and real-world datasets, Skinny Trees can achieve $1.5\times$--$620\times$ feature compression rates, leading to up to $10\times$ faster inference over dense trees, without any loss in performance. Skinny Trees leads to better feature selection than many existing toolkits: e.g., in terms of AUC performance at a 25\% feature budget, Skinny Trees outperforms LightGBM by $10.2\%$ (up to $37.7\%$) and Random Forests by $3\%$ (up to $12.5\%$).
+
+
+ comment: Accepted in AISTATS 2024 +
+
+
+
+
+ + ♻ ☆ Fair Mixed Effects Support Vector Machine + + +
+ To ensure unbiased and ethical automated predictions, fairness must be a core principle in machine learning applications. Fairness in machine learning aims to mitigate biases present in the training data and model imperfections that could lead to discriminatory outcomes. This is achieved by preventing the model from making decisions based on sensitive characteristics like ethnicity or sexual orientation. A fundamental assumption in machine learning is the independence of observations. However, this assumption often does not hold true for data describing social phenomena, where data points are often clustered. Hence, if machine learning models do not account for the cluster correlations, the results may be biased. The bias is especially high in cases where the cluster assignment is correlated with the variable of interest. We present a fair mixed effects support vector machine algorithm that can handle both problems simultaneously. With a reproducible simulation study, we demonstrate the impact of clustered data on the quality of fair machine learning predictions.
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes and formatting improvements. V3: improvements from + journal revisions +
+
+
+
+
+ + ♻ ☆ Contrastive Learning and Abstract Concepts: The Case of Natural Numbers + + +
+ Contrastive Learning (CL) has been successfully applied to classification and other downstream tasks related to concrete concepts, such as objects contained in the ImageNet dataset. No attempts seem to have been made so far to apply this promising scheme to more abstract entities. A prominent example of these could be the concept of (discrete) Quantity. CL can frequently be interpreted as a self-supervised scheme guided by some profound and ubiquitous conservation principle (e.g. conservation of identity in object classification tasks). In this introductory work we apply a suitable conservation principle to the semi-abstract concept of natural numbers by which discrete quantities can be estimated or predicted. We experimentally show, by means of a toy problem, that contrastive learning can be trained to count at a glance with high accuracy both at human and at super-human ranges. We compare this with the results of a supervised learning (SL) neural network scheme of similar architecture trained to count at a glance. We show that both schemes exhibit similar good performance on baseline experiments, where the distributions of the training and testing stages are equal. Importantly, we demonstrate that in some generalization scenarios, where training and testing distributions differ, CL boasts more robust and much better error performance.
+
+
+
+
+
+ + ♻ ☆ NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment + + +
+ Aligning Large Language Models (LLMs) with human values and preferences is +essential for making them helpful and safe. However, building efficient tools +to perform alignment can be challenging, especially for the largest and most +competent LLMs which often contain tens or hundreds of billions of parameters. +We create NeMo-Aligner, a toolkit for model alignment that can efficiently +scale to a thousand GPUs for training the largest open-source LLMs such as +Nemotron 4 340B and Llama 3.1 405B. NeMo-Aligner comes with highly optimized +and scalable implementations for major paradigms of model alignment such as: +Reinforcement Learning from Human Feedback (RLHF), Direct Preference +Optimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally, +our toolkit supports running most of the alignment techniques in a Parameter +Efficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for +extensibility, allowing support for other alignment techniques with minimal +effort. It is open-sourced with Apache 2.0 License and we invite community +contributions at https://github.com/NVIDIA/NeMo-Aligner + +
+
+ comment: 16 pages, 4 figures, Accepted to COLM 2024 +
+
+
+
+
+ + ♻ ☆ On the Optimality of Misspecified Spectral Algorithms + + +
+ In the misspecified spectral algorithms problem, researchers usually assume that the underlying true function $f_{\rho}^{*} \in [\mathcal{H}]^{s}$, a less-smooth interpolation space of a reproducing kernel Hilbert space (RKHS) $\mathcal{H}$ for some $s\in (0,1)$. The existing minimax optimal results require $\|f_{\rho}^{*}\|_{L^{\infty}}<\infty$, which implicitly requires $s > \alpha_{0}$ where $\alpha_{0}\in (0,1)$ is the embedding index, a constant depending on $\mathcal{H}$. Whether the spectral algorithms are optimal for all $s\in (0,1)$ has been an outstanding problem for years. In this paper, we show that spectral algorithms are minimax optimal for any $\alpha_{0}-\frac{1}{\beta} < s < 1$, where $\beta$ is the eigenvalue decay rate of $\mathcal{H}$. We also give several classes of RKHSs whose embedding index satisfies $ \alpha_0 = \frac{1}{\beta} $. Thus, the spectral algorithms are minimax optimal for all $s\in (0,1)$ on these RKHSs.
+
+
+ comment: 50 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Safety Constrained Multi-Agent Reinforcement Learning for Active Voltage + Control IJCAI2024 + + +
+ Active voltage control presents a promising avenue for relieving power +congestion and enhancing voltage quality, taking advantage of the distributed +controllable generators in the power network, such as roof-top photovoltaics. +While Multi-Agent Reinforcement Learning (MARL) has emerged as a compelling +approach to address this challenge, existing MARL approaches tend to overlook +the constrained optimization nature of this problem, failing in guaranteeing +safety constraints. In this paper, we formalize the active voltage control +problem as a constrained Markov game and propose a safety-constrained MARL +algorithm. We expand the primal-dual optimization RL method to multi-agent +settings, and augment it with a novel approach of double safety estimation to +learn the policy and to update the Lagrange-multiplier. In addition, we +proposed different cost functions and investigated their influences on the +behavior of our constrained MARL method. We evaluate our approach in the power +distribution network simulation environment with real-world scale scenarios. +Experimental results demonstrate the effectiveness of the proposed method +compared with the state-of-the-art MARL methods. This paper is published at +\url{https://www.ijcai.org/Proceedings/2024/}. + +
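The primal-dual mechanism underlying such constrained methods can be sketched in a few lines: the policy is trained on reward minus a lambda-weighted safety cost, while the Lagrange multiplier grows whenever the estimated constraint cost exceeds its limit. The numbers and update rule below are illustrative and do not reflect the paper's double safety estimation.

```python
# Minimal primal-dual sketch for a constrained RL objective:
# maximize reward - lambda * cost, with a projected gradient-ascent step on lambda.
def update_lagrange_multiplier(lmbda, estimated_cost, cost_limit, lr=0.05):
    return max(0.0, lmbda + lr * (estimated_cost - cost_limit))

lmbda = 0.0
for epoch, cost in enumerate([1.8, 1.5, 1.2, 0.9, 0.8]):   # per-epoch constraint cost
    # the policy objective for this epoch would be: maximize reward - lmbda * cost
    lmbda = update_lagrange_multiplier(lmbda, cost, cost_limit=1.0)
    print(f"epoch {epoch}: constraint cost={cost:.1f}  lambda -> {lmbda:.3f}")
```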
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ♻ ☆ Towards reliable respiratory disease diagnosis based on cough sounds and + vision transformers + + +
+ Recent advancements in deep learning techniques have sparked performance +boosts in various real-world applications including disease diagnosis based on +multi-modal medical data. Cough sound data-based respiratory disease (e.g., +COVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also +attracted much attention. However, existing works usually utilise traditional +machine learning or deep models of moderate scales. On the other hand, the +developed approaches are trained and evaluated on small-scale data due to the +difficulty of curating and annotating clinical data on scale. To address these +issues in prior works, we create a unified framework to evaluate various deep +models from lightweight Convolutional Neural Networks (e.g., ResNet18) to +modern vision transformers and compare their performance in respiratory disease +classification. Based on the observations from such an extensive empirical +study, we propose a novel approach to cough-based disease classification based +on both self-supervised and supervised learning on a large-scale cough data +set. Experimental results demonstrate our proposed approach outperforms prior +arts consistently on two benchmark datasets for COVID-19 diagnosis and a +proprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%. + +
+
+
+
+
+ + ♻ ☆ TimeSeriesBench: An Industrial-Grade Benchmark for Time Series Anomaly + Detection Models + + +
+ Time series anomaly detection (TSAD) has gained significant attention due to
+its real-world applications in improving the stability of modern software
+systems. However, there is no effective way to verify whether existing
+algorithms can meet the requirements for real-world deployment. Firstly,
+current algorithms typically train a specific model for each time series.
+Maintaining so many models is impractical in a large-scale system with tens of
+thousands of curves. The performance of using merely one unified model to
+detect anomalies remains unknown. Secondly, most TSAD models are trained on the
+historical part of a time series and are tested on its future segment. In
+distributed systems, however, there are frequent system deployments and
+upgrades, with new, previously unseen time series emerging daily. The
+performance of current TSAD algorithms on newly incoming, unseen time series
+remains unknown. Lastly, the assumptions of the evaluation metrics in existing
+benchmarks are far from practical demands. To solve the above-mentioned
+problems, we propose an industrial-grade benchmark, TimeSeriesBench. We assess
+the performance of existing algorithms across more than 168 evaluation settings
+and provide a comprehensive analysis for the future design of anomaly detection
+algorithms. An industrial dataset is also released along with TimeSeriesBench.
+
+
+
+ comment: Accepted by ISSRE'24 +
+
+
+
+
+ + ♻ ☆ Investigating Recurrent Transformers with Dynamic Halt + + +
+ In this paper, we comprehensively study the inductive biases of two major +approaches to augmenting Transformers with a recurrent mechanism: (1) the +approach of incorporating a depth-wise recurrence similar to Universal +Transformers; and (2) the approach of incorporating a chunk-wise temporal +recurrence like Temporal Latent Bottleneck. Furthermore, we propose and +investigate novel ways to extend and combine the above methods - for example, +we propose a global mean-based dynamic halting mechanism for Universal +Transformers and an augmentation of Temporal Latent Bottleneck with elements +from Universal Transformer. We compare the models and probe their inductive +biases in several diagnostic tasks, such as Long Range Arena (LRA), flip-flop +language modeling, ListOps, and Logical Inference. The code is released in: +https://github.com/JRC1995/InvestigatingRecurrentTransformers/tree/main + +
+
+
+
+
+ + ♻ ☆ OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step + + +
+ Despite significant advancements in text generation and reasoning, Large
+Language Models (LLMs) still face challenges in accurately performing complex
+arithmetic operations. Language model systems often enable LLMs to generate
+code for arithmetic operations to achieve accurate calculations. However, this
+approach compromises speed and security, and fine-tuning risks the language
+model losing prior capabilities. We propose a framework that enables exact
+arithmetic in a single autoregressive step, providing faster, more secure, and
+more interpretable LLM systems with arithmetic capabilities. We use the hidden
+states of an LLM to control a symbolic architecture that performs arithmetic.
+Our implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama)
+achieves 100\% accuracy on single arithmetic operations
+($+,-,\times,\div,\sin{},\cos{},\log{},\exp{},\sqrt{}$), outperforming GPT 4o
+with and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o
+with and without a code interpreter on average across a range of mathematical
+problem solving benchmarks, demonstrating that OccamLLMs can excel in
+arithmetic tasks, even surpassing much larger models. We will make our code
+public shortly.
+
+
+
+
+
+
+ + ♻ ☆ MPruner: Optimizing Neural Network Size with CKA-Based Mutual + Information Pruning + + +
+ Determining the optimal size of a neural network is critical, as it directly +impacts runtime performance and memory usage. Pruning is a well-established +model compression technique that reduces the size of neural networks while +mathematically guaranteeing accuracy preservation. However, many recent pruning +methods overlook the global contributions of individual model components, +making it difficult to ensure that a pruned model meets the desired dataset and +performance requirements. To address these challenges, we developed a new +pruning algorithm, MPruner, that leverages mutual information through vector +similarity. MPruner utilizes layer clustering with the Centered Kernel +Alignment (CKA) similarity metric, allowing us to incorporate global +information from the neural network for more precise and efficient layer-wise +pruning. We evaluated MPruner across various architectures and configurations, +demonstrating its versatility and providing practical guidelines. MPruner +achieved up to a 50% reduction in parameters and memory usage for CNN and +transformer-based models, with minimal to no loss in accuracy. + +
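+ As background for the clustering step, the linear Centered Kernel Alignment
+similarity between the activations of two layers can be computed as below (a
+standard formulation, not the authors' code; how MPruner turns these
+similarities into layer clusters is described in the paper):
+
+```python
+import numpy as np
+
+def linear_cka(X, Y):
+    """Linear CKA between activation matrices X (n, d1) and Y (n, d2)."""
+    X = X - X.mean(axis=0, keepdims=True)  # center over the n examples
+    Y = Y - Y.mean(axis=0, keepdims=True)
+    cross = np.linalg.norm(Y.T @ X, "fro") ** 2
+    return cross / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))
+
+# Example: similarity of two random layer outputs on the same 128 inputs.
+rng = np.random.default_rng(0)
+print(linear_cka(rng.normal(size=(128, 64)), rng.normal(size=(128, 64))))
+```
+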
+
+
+
+
+ + ♻ ☆ Recursively Feasible Probabilistic Safe Online Learning with Control + Barrier Functions + + +
+ Learning-based control has recently shown great efficacy in performing +complex tasks for various applications. However, to deploy it in real systems, +it is of vital importance to guarantee the system will stay safe. Control +Barrier Functions (CBFs) offer mathematical tools for designing +safety-preserving controllers for systems with known dynamics. In this article, +we first introduce a model-uncertainty-aware reformulation of CBF-based +safety-critical controllers using Gaussian Process (GP) regression to close the +gap between an approximate mathematical model and the real system, which +results in a second-order cone program (SOCP)-based control design. We then +present the pointwise feasibility conditions of the resulting safety +controller, highlighting the level of richness that the available system +information must meet to ensure safety. We use these conditions to devise an +event-triggered online data collection strategy that ensures the recursive +feasibility of the learned safety controller. Our method works by constantly +reasoning about whether the current information is sufficient to ensure safety +or if new measurements under active safe exploration are required to reduce the +uncertainty. As a result, our proposed framework can guarantee the forward +invariance of the safe set defined by the CBF with high probability, even if it +contains a priori unexplored regions. We validate the proposed framework in two +numerical simulation experiments. + +
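+ In rough form (a sketch of one common construction; the notation, constants,
+and which dynamics terms are learned differ across papers), the GP-robustified
+CBF condition referred to above reads:
+
+```latex
+% Control-affine system with a GP-learned residual d(x):
+%   \dot{x} = f(x) + g(x)u + d(x), \qquad d(x) \sim \mathcal{GP}(\mu(x), \sigma^2(x)).
+% High-probability sufficient condition for keeping the safe set h(x) >= 0 invariant:
+\nabla h(x)^{\top}\!\big(f(x) + g(x)u + \mu(x)\big)
+  \;-\; \beta_{\delta}\,\sigma(x)\,\|\nabla h(x)\| \;\ge\; -\alpha\big(h(x)\big).
+% When the uncertainty also enters through g(x)u, the error bound becomes an
+% affine function of u inside a norm, which is what yields the SOCP controller.
+```
+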
+
+ comment: Journal article. Includes the results of the 2021 CDC paper titled + "Pointwise feasibility of gaussian process-based safety-critical control + under model uncertainty" and proposes a recursively feasible safe online + learning algorithm as new contribution +
+
+
+
+
+ + ♻ ☆ The Responsible Foundation Model Development Cheatsheet: A Review of + Tools & Resources + + +
+ Foundation model development attracts a rapidly expanding body of
+contributors, scientists, and applications. To help shape responsible
+development practices, we introduce the Foundation Model Development
+Cheatsheet: a growing collection of 250+ tools and resources spanning text,
+vision, and speech modalities. We draw on a large body of prior work to survey
+resources (e.g. software, documentation, frameworks, guides, and practical
+tools) that support informed data selection, processing, and understanding,
+precise and limitation-aware artifact documentation, efficient model training,
+advance awareness of the environmental impact from training, careful model
+evaluation of capabilities, risks, and claims, as well as responsible model
+release, licensing and deployment practices. We hope this curated collection of
+resources helps guide more responsible development. The process of curating
+this list enabled us to review the AI development ecosystem, revealing which
+tools are critically missing, misused, or over-used in existing practices. We
+find that (i) tools for data sourcing, model evaluation, and monitoring are
+critically under-serving ethical and real-world needs, (ii) evaluations for
+model safety, capabilities, and environmental impact all lack reproducibility
+and transparency, (iii) text and particularly English-centric analyses continue
+to dominate over multilingual and multi-modal analyses, and (iv) evaluation of
+systems, rather than just models, is needed so that capabilities and impact are
+assessed in context.
+
+
+
+
+
+
+ + ♻ ☆ Kolmogorov Arnold Networks in Fraud Detection: Bridging the Gap Between + Theory and Practice + + +
+ This study evaluates the applicability of Kolmogorov-Arnold Networks (KAN) in +fraud detection, finding that their effectiveness is context-dependent. We +propose a quick decision rule using Principal Component Analysis (PCA) to +assess the suitability of KAN: if data can be effectively separated in two +dimensions using splines, KAN may outperform traditional models; otherwise, +other methods could be more appropriate. We also introduce a heuristic approach +to hyperparameter tuning, significantly reducing computational costs. These +findings suggest that while KAN has potential, its use should be guided by +data-specific assessments. + +
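+ A minimal sketch of the kind of quick PCA-based screening described above
+(hypothetical threshold, with a plain linear classifier standing in for the
+spline-based check; the paper's exact rule may differ):
+
+```python
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_val_score
+
+def pca_separability_check(X, y, threshold=0.85):
+    """Project features to 2D with PCA and test whether a simple classifier
+    already separates fraud from non-fraud; if so, KAN-style models may fit well."""
+    X2 = PCA(n_components=2).fit_transform(X)
+    score = cross_val_score(LogisticRegression(max_iter=1000), X2, y, cv=5).mean()
+    return score >= threshold, score
+```
+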
+
+
+
+
+ + ♻ ☆ Deep-MacroFin: Informed Equilibrium Neural Network for Continuous Time + Economic Models + + +
+ In this paper, we present Deep-MacroFin, a comprehensive framework designed +to solve partial differential equations, with a particular focus on models in +continuous time economics. This framework leverages deep learning +methodologies, including conventional Multi-Layer Perceptrons and the newly +developed Kolmogorov-Arnold Networks. It is optimized using economic +information encapsulated by Hamilton-Jacobi-Bellman equations and coupled +algebraic equations. The application of neural networks holds the promise of +accurately resolving high-dimensional problems with fewer computational demands +and limitations compared to standard numerical methods. This versatile +framework can be readily adapted for elementary differential equations, and +systems of differential equations, even in cases where the solutions may +exhibit discontinuities. Importantly, it offers a more straightforward and +user-friendly implementation than existing libraries. + +
+
+ comment: 25 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ What do we know about Hugging Face? A systematic literature review and + quantitative validation of qualitative claims + + +
+ Background: Collaborative Software Package Registries (SPRs) are an integral
+part of the software supply chain. Much engineering work synthesizes SPR
+packages into applications. Prior research has examined SPRs for traditional
+software, such as NPM (JavaScript) and PyPI (Python). Pre-Trained Model (PTM)
+Registries are an emerging class of SPR of increasing importance, because they
+support the deep learning supply chain.
+ Aims: Recent empirical research has examined PTM registries from perspectives
+such as vulnerabilities, reuse processes, and evolution. However, no existing
+research synthesizes these studies to provide a systematic understanding of the
+current knowledge. Some of the existing research includes qualitative claims
+lacking quantitative analysis. Our research fills these gaps by providing a
+knowledge synthesis and quantitative analyses.
+ Methods: We first conduct a systematic literature review (SLR). We then
+observe that some of the claims are qualitative. We identify quantifiable
+metrics associated with those claims, and measure them in order to substantiate
+these claims.
+ Results: From our SLR, we identify 12 claims about PTM reuse on the
+HuggingFace platform, 4 of which lack quantitative validation. We successfully
+test 3 of these claims through a quantitative analysis, and directly compare
+one with traditional software. Our findings corroborate qualitative claims with
+quantitative measurements. Our findings are: (1) PTMs have a much higher
+turnover rate than traditional software, indicating a dynamic and rapidly
+evolving reuse environment within the PTM ecosystem; and (2) there is a strong
+correlation between documentation quality and PTM popularity.
+ Conclusions: We confirm qualitative research claims with concrete metrics,
+supporting prior qualitative and case study research. Our measures show further
+dynamics of PTM reuse, inspiring research infrastructure and new measures.
+
+
+
+ comment: [ESEM'24] Proceedings of the 18th ACM/IEEE International Symposium on + Empirical Software Engineering and Measurement (ESEM) 2024 +
+
+
+
+
+ + ♻ ☆ Linear Contextual Bandits with Hybrid Payoff: Revisited ECML + + +
+ We study the Linear Contextual Bandit problem in the hybrid reward setting.
+In this setting every arm's reward model contains arm-specific parameters in
+addition to parameters shared across the reward models of all the arms. We can
+reduce this setting to two closely related settings: (a) Shared - no
+arm-specific parameters, and (b) Disjoint - only arm-specific parameters,
+enabling the application of two popular state-of-the-art algorithms -
+$\texttt{LinUCB}$ and $\texttt{DisLinUCB}$ (Algorithm 1 in (Li et al. 2010)).
+When the arm features are stochastic and satisfy a popular diversity condition,
+we provide new regret analyses for both algorithms, significantly improving on
+the known regret guarantees of these algorithms. Our novel analysis critically
+exploits the hybrid reward structure and the diversity condition. Moreover, we
+introduce a new algorithm $\texttt{HyLinUCB}$ that crucially modifies
+$\texttt{LinUCB}$ (using a new exploration coefficient) to account for sparsity
+in the hybrid setting. Under the same diversity assumptions, we prove that
+$\texttt{HyLinUCB}$ also incurs only $O(\sqrt{T})$ regret for $T$ rounds. We
+perform extensive experiments on synthetic and real-world datasets
+demonstrating strong empirical performance of $\texttt{HyLinUCB}$. When the
+number of arm-specific parameters is much larger than the number of shared
+parameters, we observe that $\texttt{DisLinUCB}$ incurs the lowest regret. In
+this case, the regret of $\texttt{HyLinUCB}$ is the second best and extremely
+competitive with $\texttt{DisLinUCB}$. In all other situations, including our
+real-world dataset, $\texttt{HyLinUCB}$ has significantly lower regret than
+$\texttt{LinUCB}$, $\texttt{DisLinUCB}$ and other SOTA baselines we considered.
+We also empirically observe that the regret of $\texttt{HyLinUCB}$ grows much
+slower with the number of arms compared to baselines, making it suitable even
+for very large action spaces.
+
+
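+ For context, the hybrid payoff model behind these algorithms (following the
+setting of Li et al. 2010; the snippet is illustrative, not the authors' code)
+scores every arm with both shared and arm-specific parameters:
+
+```python
+import numpy as np
+
+def hybrid_payoff(x_shared, z_arm, beta_shared, theta_arm, noise_std=0.1, rng=None):
+    """Expected reward x^T beta (shared) plus z_a^T theta_a (arm-specific), plus noise."""
+    rng = rng or np.random.default_rng()
+    mean = x_shared @ beta_shared + z_arm @ theta_arm
+    return mean + rng.normal(0.0, noise_std)
+```
+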
+
+ comment: Accepted at ECML PKDD 2024 as a Research Track Paper +
+
+
+
+
+ + ♻ ☆ Persian Slang Text Conversion to Formal and Deep Learning of Persian + Short Texts on Social Media for Sentiment Classification + + +
+ The lack of a suitable tool for the analysis of conversational texts in the
+Persian language has made various analyses of these texts, including Sentiment
+Analysis, difficult. In this research, we try to make these texts easier for
+machines to understand by providing PSC, the Persian Slang Converter, a tool
+for converting conversational texts into formal ones, and by using
+state-of-the-art deep learning methods together with PSC to improve the
+sentiment classification of short Persian texts. More than 10 million unlabeled
+texts from various social networks and movie subtitles (as conversational
+texts) and about 10 million news texts (as formal texts) have been used for
+training unsupervised models and the formal implementation of the tool. 60,000
+texts from the comments of Instagram social network users, with positive,
+negative, and neutral labels, are considered supervised data for training the
+sentiment classification model for short texts. Using the formalization tool,
+57% of the words of the conversational corpus were converted. Finally, by using
+the formalizer, the FastText model, and a deep LSTM network, an accuracy of
+81.91% was obtained on the test data.
+
+
+
+ comment: 16 pages, 4 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Projected Stochastic Gradient Descent with Quantum Annealed Binary + Gradients + + +
+ We present QP-SBGD, a novel layer-wise stochastic optimiser tailored towards
+training neural networks with binary weights, known as binary neural networks
+(BNNs), on quantum hardware. BNNs reduce the computational requirements and
+energy consumption of deep learning models with minimal loss in accuracy.
+However, training them in practice remains an open challenge. Most known
+BNN-optimisers either rely on projected updates or binarise weights
+post-training. Instead, QP-SBGD approximately maps the gradient onto binary
+variables by solving a quadratic constrained binary optimisation. Under
+practically reasonable assumptions, we show that this update rule converges
+with a rate of $\mathcal{O}(1 / \sqrt{T})$. Moreover, we show how the
+$\mathcal{NP}$-hard projection can be effectively executed on an adiabatic
+quantum annealer, harnessing recent advancements in quantum computation. We
+also introduce a projected version of this update rule and prove that if a
+fixed point exists in the binary variable space, the modified updates will
+converge to it. Last but not least, our algorithm is implemented layer-wise,
+making it suitable for training larger networks on resource-limited quantum
+hardware. Through extensive evaluations, we show that QP-SBGD outperforms or is
+on par with competitive and well-established baselines such as BinaryConnect,
+signSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as
+well as binary graph neural networks.
+
+
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ LSTMSE-Net: Long Short Term Speech Enhancement Network for Audio-visual + Speech Enhancement + + +
+ In this paper, we propose long short term memory speech enhancement network +(LSTMSE-Net), an audio-visual speech enhancement (AVSE) method. This innovative +method leverages the complementary nature of visual and audio information to +boost the quality of speech signals. Visual features are extracted with +VisualFeatNet (VFN), and audio features are processed through an encoder and +decoder. The system scales and concatenates visual and audio features, then +processes them through a separator network for optimized speech enhancement. +The architecture highlights advancements in leveraging multi-modal data and +interpolation techniques for robust AVSE challenge systems. The performance of +LSTMSE-Net surpasses that of the baseline model from the COG-MHEAR AVSE +Challenge 2024 by a margin of 0.06 in scale-invariant signal-to-distortion +ratio (SISDR), $0.03$ in short-time objective intelligibility (STOI), and +$1.32$ in perceptual evaluation of speech quality (PESQ). The source code of +the proposed LSTMSE-Net is available at +\url{https://github.com/mtanveer1/AVSEC-3-Challenge}. + +
+
+
+
+
+ + ☆ Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection, + Removal, and Generation in the Era of Deep Learning + + +
+ Shadows are formed when light encounters obstacles, leading to areas of +diminished illumination. In computer vision, shadow detection, removal, and +generation are crucial for enhancing scene understanding, refining image +quality, ensuring visual consistency in video editing, and improving virtual +environments. This paper presents a comprehensive survey of shadow detection, +removal, and generation in images and videos within the deep learning landscape +over the past decade, covering tasks, deep models, datasets, and evaluation +metrics. Our key contributions include a comprehensive survey of shadow +analysis, standardization of experimental comparisons, exploration of the +relationships among model size, speed, and performance, a cross-dataset +generalization study, identification of open issues and future directions, and +provision of publicly available resources to support further research. + +
+
+ comment: Publicly available results, trained models, and evaluation metrics at + https://github.com/xw-hu/Unveiling-Deep-Shadows +
+
+
+
+
+ + ☆ Towards Real-World Adverse Weather Image Restoration: Enhancing + Clearness and Semantics with Vision-Language Models ECCV 2024 + + +
+ This paper addresses the limitations of adverse weather image restoration +approaches trained on synthetic data when applied to real-world scenarios. We +formulate a semi-supervised learning framework employing vision-language models +to enhance restoration performance across diverse adverse weather conditions in +real-world settings. Our approach involves assessing image clearness and +providing semantics using vision-language models on real data, serving as +supervision signals for training restoration models. For clearness enhancement, +we use real-world data, utilizing a dual-step strategy with pseudo-labels +assessed by vision-language models and weather prompt learning. For semantic +enhancement, we integrate real-world data by adjusting weather conditions in +vision-language model descriptions while preserving semantic meaning. +Additionally, we introduce an effective training strategy to bootstrap +restoration performance. Our approach achieves superior results in real-world +adverse weather image restoration, demonstrated through qualitative and +quantitative comparisons with state-of-the-art works. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Low-Resolution Face Recognition via Adaptable Instance-Relation + Distillation IJCNN 2024 + + +
+ Low-resolution face recognition is a challenging task due to the lack of
+informative details. Recent approaches based on knowledge distillation have
+proven that high-resolution clues can well guide low-resolution face
+recognition via proper knowledge transfer. However, due to the distribution
+difference between training and testing faces, the learned models often suffer
+from poor adaptability. To address this, we split the knowledge transfer
+process into distillation and adaptation steps, and propose an adaptable
+instance-relation distillation approach to facilitate low-resolution face
+recognition. In the approach, the student distills knowledge from the
+high-resolution teacher at both the instance level and the relation level,
+providing sufficient cross-resolution knowledge transfer. Then, the learned
+student can adapt to recognize low-resolution faces with adaptive batch
+normalization during inference. In this manner, the capability of recovering
+missing details of familiar low-resolution faces can be effectively enhanced,
+leading to a better knowledge transfer. Extensive experiments on low-resolution
+face recognition clearly demonstrate the effectiveness and adaptability of our
+approach.
+
+
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ☆ PRoGS: Progressive Rendering of Gaussian Splats + + +
+ Over the past year, 3D Gaussian Splatting (3DGS) has received significant +attention for its ability to represent 3D scenes in a perceptually accurate +manner. However, it can require a substantial amount of storage since each +splat's individual data must be stored. While compression techniques offer a +potential solution by reducing the memory footprint, they still necessitate +retrieving the entire scene before any part of it can be rendered. In this +work, we introduce a novel approach for progressively rendering such scenes, +aiming to display visible content that closely approximates the final scene as +early as possible without loading the entire scene into memory. This approach +benefits both on-device rendering applications limited by memory constraints +and streaming applications where minimal bandwidth usage is preferred. To +achieve this, we approximate the contribution of each Gaussian to the final +scene and construct an order of prioritization on their inclusion in the +rendering process. Additionally, we demonstrate that our approach can be +combined with existing compression methods to progressively render (and stream) +3DGS scenes, optimizing bandwidth usage by focusing on the most important +splats within a scene. Overall, our work establishes a foundation for making +remotely hosted 3DGS content more quickly accessible to end-users in +over-the-top consumption scenarios, with our results showing significant +improvements in quality across all metrics compared to existing methods. + +
+
+
+
+
+ + ☆ Privacy-Preserving Multimedia Mobile Cloud Computing Using Protective + Perturbation + + +
+ Mobile cloud computing has been adopted in many multimedia applications, +where the resource-constrained mobile device sends multimedia data (e.g., +images) to remote cloud servers to request computation-intensive multimedia +services (e.g., image recognition). While significantly improving the +performance of the mobile applications, the cloud-based mechanism often causes +privacy concerns as the multimedia data and services are offloaded from the +trusted user device to untrusted cloud servers. Several recent studies have +proposed perturbation-based privacy preserving mechanisms, which obfuscate the +offloaded multimedia data to eliminate privacy exposures without affecting the +functionality of the remote multimedia services. However, the existing privacy +protection approaches require the deployment of computation-intensive +perturbation generation on the resource-constrained mobile devices. Also, the +obfuscated images are typically not compliant with the standard image +compression algorithms and suffer from significant bandwidth consumption. In +this paper, we develop a novel privacy-preserving multimedia mobile cloud +computing framework, namely $PMC^2$, to address the resource and bandwidth +challenges. $PMC^2$ employs secure confidential computing in the cloud to +deploy the perturbation generator, which addresses the resource challenge while +maintaining the privacy. Furthermore, we develop a neural compressor +specifically trained to compress the perturbed images in order to address the +bandwidth challenge. We implement $PMC^2$ in an end-to-end mobile cloud +computing system, based on which our evaluations demonstrate superior latency, +power efficiency, and bandwidth consumption achieved by $PMC^2$ while +maintaining high accuracy in the target multimedia service. + +
+
+
+
+
+ + ☆ Think Twice Before Recognizing: Large Multimodal Models for General + Fine-grained Traffic Sign Recognition + + +
+ We propose a new strategy called think twice before recognizing to improve +fine-grained traffic sign recognition (TSR). Fine-grained TSR in the wild is +difficult due to the complex road conditions, and existing approaches +particularly struggle with cross-country TSR when data is lacking. Our strategy +achieves effective fine-grained TSR by stimulating the multiple-thinking +capability of large multimodal models (LMM). We introduce context, +characteristic, and differential descriptions to design multiple thinking +processes for the LMM. The context descriptions with center coordinate prompt +optimization help the LMM to locate the target traffic sign in the original +road images containing multiple traffic signs and filter irrelevant answers +through the proposed prior traffic sign hypothesis. The characteristic +description is based on few-shot in-context learning of template traffic signs, +which decreases the cross-domain difference and enhances the fine-grained +recognition capability of the LMM. The differential descriptions of similar +traffic signs optimize the multimodal thinking capability of the LMM. The +proposed method is independent of training data and requires only simple and +uniform instructions. We conducted extensive experiments on three benchmark +datasets and two real-world datasets from different countries, and the proposed +method achieves state-of-the-art TSR results on all five datasets. + +
+
+
+
+
+ + ♻ ☆ TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot + Image Classification ICASSP 2024 + + +
+ Few-shot image classification aims to classify images from unseen novel +classes with few samples. Recent works demonstrate that deep local descriptors +exhibit enhanced representational capabilities compared to image-level +features. However, most existing methods solely rely on either employing all +local descriptors or directly utilizing partial descriptors, potentially +resulting in the loss of crucial information. Moreover, these methods primarily +emphasize the selection of query descriptors while overlooking support +descriptors. In this paper, we propose a novel Task-Aware Adaptive Local +Descriptors Selection Network (TALDS-Net), which exhibits the capacity for +adaptive selection of task-aware support descriptors and query descriptors. +Specifically, we compare the similarity of each local support descriptor with +other local support descriptors to obtain the optimal support descriptor subset +and then compare the query descriptors with the optimal support subset to +obtain discriminative query descriptors. Extensive experiments demonstrate that +our TALDS-Net outperforms state-of-the-art methods on both general and +fine-grained datasets. + +
+
+ comment: 4 pages, 1 figures, is accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ IDNet: A Novel Dataset for Identity Document Analysis and Fraud + Detection + + +
+ Effective fraud detection and analysis of government-issued identity
+documents, such as passports, driver's licenses, and identity cards, are
+essential in thwarting identity theft and bolstering security on online
+platforms. The training of accurate fraud detection and analysis tools depends
+on the availability of extensive identity document datasets. However, current
+publicly available benchmark datasets for identity document analysis, including
+MIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a
+limited number of samples, cover insufficient varieties of fraud patterns, and
+seldom include alterations in critical personal identifying fields like
+portrait images, limiting their utility in training models capable of detecting
+realistic frauds while preserving privacy.
+ In response to these shortcomings, our research introduces a new benchmark
+dataset, IDNet, designed to advance privacy-preserving fraud detection efforts.
+The IDNet dataset comprises 837,060 images of synthetically generated identity
+documents, totaling approximately 490 gigabytes, categorized into 20 types from
+10 U.S. states and 10 European countries. We evaluate the utility and present
+use cases of the dataset, illustrating how it can aid in training
+privacy-preserving fraud detection methods, facilitating the generation of
+camera and video captures of identity documents, and testing schema
+unification and other identity document management functionalities.
+
+
+
+ comment: 40 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 37 + +
+
+
+ + ☆ DiversityMedQA: Assessing Demographic Biases in Medical Diagnosis using + Large Language Models + + +
+ As large language models (LLMs) gain traction in healthcare, concerns about
+their susceptibility to demographic biases are growing. We introduce
+DiversityMedQA, a novel benchmark designed to assess LLM responses to medical
+queries across diverse patient demographics, such as gender and ethnicity. By
+perturbing questions from the MedQA dataset, which comprises medical board exam
+questions, we created a benchmark that captures the nuanced differences in
+medical diagnosis across varying patient profiles. Our findings reveal notable
+discrepancies in model performance when tested against these demographic
+variations. Furthermore, to ensure the perturbations were accurate, we also
+propose a filtering strategy that validates each perturbation. By releasing
+DiversityMedQA, we provide a resource for evaluating and mitigating demographic
+bias in LLM medical diagnoses.
+
+
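+ A minimal illustration of the kind of demographic perturbation described above
+(hypothetical term lists and wording; the released benchmark relies on validated
+perturbations rather than a simple find-and-replace):
+
+```python
+import re
+
+# Hypothetical example swaps; the actual benchmark covers more categories.
+GENDER_SWAPS = {"male": "female", "man": "woman", "boy": "girl"}
+
+def perturb_question(question: str) -> str:
+    """Create a gender-perturbed copy of a MedQA-style question."""
+    out = question
+    for source, target in GENDER_SWAPS.items():
+        out = re.sub(rf"\b{source}\b", target, out, flags=re.IGNORECASE)
+    return out
+
+print(perturb_question("A 45-year-old man presents with chest pain."))
+```
+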
+
+
+
+
+ + ☆ The Compressor-Retriever Architecture for Language Model OS + + +
+ Recent advancements in large language models (LLMs) have significantly +enhanced their capacity to aggregate and process information across multiple +modalities, enabling them to perform a wide range of tasks such as multimodal +data querying, tool usage, web interactions, and handling long documents. These +capabilities pave the way for transforming LLMs from mere chatbots into +general-purpose agents capable of interacting with the real world. This paper +explores the concept of using a language model as the core component of an +operating system (OS), effectively acting as a CPU that processes data stored +in a context window, which functions as RAM. A key challenge in realizing such +an LM OS is managing the life-long context and ensuring statefulness across +sessions, a feature limited by the current session-based interaction paradigm +due to context window size limit. To address this, we introduce +compressor-retriever, a model-agnostic architecture designed for life-long +context management. Unlike other long-context solutions such as +retrieval-augmented generation, our approach exclusively uses the base model's +forward function to compress and retrieve context, ensuring end-to-end +differentiability. Preliminary experiments demonstrate the effectiveness of +this architecture in in-context learning tasks, marking a step towards the +development of a fully stateful LLM OS. Project repo available at: +https://github.com/gblackout/LM-OS + +
+
+
+
+
+ + ☆ Revisiting SMoE Language Models by Evaluating Inefficiencies with Task + Specific Expert Pruning + + +
+ Sparse Mixture of Expert (SMoE) models have emerged as a scalable alternative +to dense models in language modeling. These models use conditionally activated +feedforward subnetworks in transformer blocks, allowing for a separation +between total model parameters and per-example computation. However, large +token-routed SMoE models face a significant challenge: during inference, the +entire model must be used for a sequence or a batch, resulting in high +latencies in a distributed setting that offsets the advantages of per-token +sparse activation. Our research explores task-specific model pruning to inform +decisions about designing SMoE architectures, mainly modulating the choice of +expert counts in pretraining. We investigate whether such pruned models offer +advantages over smaller SMoE models trained from scratch, when evaluating and +comparing them individually on tasks. To that end, we introduce an adaptive +task-aware pruning technique UNCURL to reduce the number of experts per MoE +layer in an offline manner post-training. Our findings reveal a threshold +pruning factor for the reduction that depends on the number of experts used in +pretraining, above which, the reduction starts to degrade model performance. +These insights contribute to our understanding of model design choices when +pretraining with SMoE architectures, particularly useful when considering +task-specific inference optimization for later stages. + +
+
+
+
+
+ + ☆ Masked Mixers for Language Generation and Retrieval + + +
+ Attention mechanisms that confer selective focus on a strict subset of input
+elements are nearly ubiquitous in language models today. We posit that there is
+a downside to the use of attention: most information present in the input is
+necessarily lost. In support of this idea, we observe poor input representation
+accuracy in transformers, but find more accurate representation in what we term
+masked mixers, which replace self-attention with masked convolutions. Applied
+to TinyStories, the masked mixer learns causal language tasks more efficiently
+than early transformer implementations and somewhat less efficiently than
+optimized, current implementations. The most efficient learning algorithm
+observed for this dataset is a transformer-masked mixer hybrid, suggesting that
+these models learn in an orthogonal manner. We hypothesized that the
+information loss exhibited by transformers would be much more detrimental to
+retrieval than generation, and to test this we introduce an efficient training
+approach for retrieval models based on existing generative model embeddings.
+With this method, embeddings from masked mixers are found to result in far
+better summary-to-story retrieval compared to embeddings from transformers.
+
+
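+ To make the architectural idea concrete, a masked (causal) convolution mixer
+block might look roughly like the following PyTorch sketch (illustrative only;
+the paper's exact block structure, widths, and normalization may differ):
+
+```python
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MaskedConvMixerBlock(nn.Module):
+    """Token mixing via a causal 1D convolution instead of self-attention."""
+    def __init__(self, d_model: int, kernel_size: int = 4):
+        super().__init__()
+        self.pad = kernel_size - 1            # left-pad so position t sees only <= t
+        self.conv = nn.Conv1d(d_model, d_model, kernel_size)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.ff = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
+                                nn.Linear(4 * d_model, d_model))
+
+    def forward(self, x):                     # x: (batch, seq, d_model)
+        h = self.norm1(x).transpose(1, 2)     # (batch, d_model, seq)
+        h = self.conv(F.pad(h, (self.pad, 0))).transpose(1, 2)  # causal mixing
+        x = x + h
+        return x + self.ff(self.norm2(x))
+```
+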
+
+ comment: 23 pages, 15 figures (11 primary, 4 supplementary) +
+
+
+
+
+ + ☆ PoliPrompt: A High-Performance Cost-Effective LLM-Based Text + Classification Framework for Political Science + + +
+ Recent advancements in large language models (LLMs) have opened new avenues +for enhancing text classification efficiency in political science, surpassing +traditional machine learning methods that often require extensive feature +engineering, human labeling, and task-specific training. However, their +effectiveness in achieving high classification accuracy remains questionable. +This paper introduces a three-stage in-context learning approach that leverages +LLMs to improve classification accuracy while minimizing experimental costs. +Our method incorporates automatic enhanced prompt generation, adaptive exemplar +selection, and a consensus mechanism that resolves discrepancies between two +weaker LLMs, refined by an advanced LLM. We validate our approach using +datasets from the BBC news reports, Kavanaugh Supreme Court confirmation, and +2018 election campaign ads. The results show significant improvements in +classification F1 score (+0.36 for zero-shot classification) with manageable +economic costs (-78% compared with human labeling), demonstrating that our +method effectively addresses the limitations of traditional machine learning +while offering a scalable and reliable solution for text analysis in political +science. + +
+
+ comment: 23 pages, 5 figures +
+
+
+
+
+ + ☆ Efficient and Scalable Estimation of Tool Representations in Vector + Space + + +
+ Recent advancements in function calling and tool use have significantly +enhanced the capabilities of large language models (LLMs) by enabling them to +interact with external information sources and execute complex tasks. However, +the limited context window of LLMs presents challenges when a large number of +tools are available, necessitating efficient methods to manage prompt length +and maintain accuracy. Existing approaches, such as fine-tuning LLMs or +leveraging their reasoning capabilities, either require frequent retraining or +incur significant latency overhead. A more efficient solution involves training +smaller models to retrieve the most relevant tools for a given query, although +this requires high quality, domain-specific data. To address those challenges, +we present a novel framework for generating synthetic data for tool retrieval +applications and an efficient data-driven tool retrieval strategy using small +encoder models. Empowered by LLMs, we create ToolBank, a new tool retrieval +dataset that reflects real human user usages. For tool retrieval methodologies, +we propose novel approaches: (1) Tool2Vec: usage-driven tool embedding +generation for tool retrieval, (2) ToolRefiner: a staged retrieval method that +iteratively improves the quality of retrieved tools, and (3) MLC: framing tool +retrieval as a multi-label classification problem. With these new methods, we +achieve improvements of up to 27.28 in Recall@K on the ToolBench dataset and +30.5 in Recall@K on ToolBank. Additionally, we present further experimental +results to rigorously validate our methods. Our code is available at +\url{https://github.com/SqueezeAILab/Tool2Vec} + +
+
+
+
+
+ + ♻ ☆ Dissociation of Faithful and Unfaithful Reasoning in LLMs + + +
+ Large language models (LLMs) often improve their performance in downstream +tasks when they generate Chain of Thought reasoning text before producing an +answer. We investigate how LLMs recover from errors in Chain of Thought. +Through analysis of error recovery behaviors, we find evidence for +unfaithfulness in Chain of Thought, which occurs when models arrive at the +correct answer despite invalid reasoning text. We identify factors that shift +LLM recovery behavior: LLMs recover more frequently from obvious errors and in +contexts that provide more evidence for the correct answer. Critically, these +factors have divergent effects on faithful and unfaithful recoveries. Our +results indicate that there are distinct mechanisms driving faithful and +unfaithful error recoveries. Selective targeting of these mechanisms may be +able to drive down the rate of unfaithful reasoning and improve model +interpretability. + +
+
+ comment: code published at + https://github.com/CoTErrorRecovery/CoTErrorRecovery +
+
+
+
+
+ + ♻ ☆ Manipulating Large Language Models to Increase Product Visibility + + +
+ Large language models (LLMs) are increasingly being integrated into search +engines to provide natural language responses tailored to user queries. +Customers and end-users are also becoming more dependent on these models for +quick and easy purchase decisions. In this work, we investigate whether +recommendations from LLMs can be manipulated to enhance a product's visibility. +We demonstrate that adding a strategic text sequence (STS) -- a carefully +crafted message -- to a product's information page can significantly increase +its likelihood of being listed as the LLM's top recommendation. To understand +the impact of STS, we use a catalog of fictitious coffee machines and analyze +its effect on two target products: one that seldom appears in the LLM's +recommendations and another that usually ranks second. We observe that the +strategic text sequence significantly enhances the visibility of both products +by increasing their chances of appearing as the top recommendation. This +ability to manipulate LLM-generated search responses provides vendors with a +considerable competitive advantage and has the potential to disrupt fair market +competition. Just as search engine optimization (SEO) revolutionized how +webpages are customized to rank higher in search engine results, influencing +LLM recommendations could profoundly impact content optimization for AI-driven +search services. Code for our experiments is available at +https://github.com/aounon/llm-rank-optimizer. + +
+
+
+
+
+ + ♻ ☆ Balancing Rigor and Utility: Mitigating Cognitive Biases in Large + Language Models for Multiple-Choice Questions + + +
+ This paper examines the role of cognitive biases in the decision-making
+processes of large language models (LLMs), challenging the conventional goal of
+eliminating all biases. We show that certain cognitive biases, when properly
+balanced, can enhance decision-making efficiency through rational deviations
+and heuristic shortcuts. By introducing heuristic moderation and an abstention
+option, which allows LLMs to withhold responses when uncertain, we reduce error
+rates, improve decision accuracy, and optimize decision rates. Using the
+Balance Rigor and Utility (BRU) dataset, developed through expert
+collaboration, our findings demonstrate that targeted inspection of cognitive
+biases aligns LLM decisions more closely with human reasoning, enhancing
+reliability and suggesting strategies for future improvements. This approach
+offers a novel way to leverage cognitive biases to improve the practical
+utility of LLMs across various applications.
+
+
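+ The abstention option can be illustrated with a simple confidence threshold (a
+hedged sketch with a hypothetical threshold; the full BRU protocol described in
+the paper is more involved):
+
+```python
+def answer_or_abstain(option_probs, threshold=0.6):
+    """Return the best multiple-choice option, or abstain when confidence is low."""
+    best_option, best_prob = max(option_probs.items(), key=lambda kv: kv[1])
+    return best_option if best_prob >= threshold else "ABSTAIN"
+
+print(answer_or_abstain({"A": 0.45, "B": 0.30, "C": 0.15, "D": 0.10}))  # ABSTAIN
+```
+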
+
+ comment: This article is currently under review. All data will be open on + GitHub once the review is complete. + https://github.com/limanwang/Balancing-Rigor-and-Utility +
+
+
+
+
+ + ♻ ☆ Eliciting Informative Text Evaluations with Large Language Models + + +
+ Peer prediction mechanisms motivate high-quality feedback with provable +guarantees. However, current methods only apply to rather simple reports, like +multiple-choice or scalar numbers. We aim to broaden these techniques to the +larger domain of text-based reports, drawing on the recent developments in +large language models. This vastly increases the applicability of peer +prediction mechanisms as textual feedback is the norm in a large variety of +feedback channels: peer reviews, e-commerce customer reviews, and comments on +social media. + We introduce two mechanisms, the Generative Peer Prediction Mechanism (GPPM) +and the Generative Synopsis Peer Prediction Mechanism (GSPPM). These mechanisms +utilize LLMs as predictors, mapping from one agent's report to a prediction of +her peer's report. Theoretically, we show that when the LLM prediction is +sufficiently accurate, our mechanisms can incentivize high effort and +truth-telling as an (approximate) Bayesian Nash equilibrium. Empirically, we +confirm the efficacy of our mechanisms through experiments conducted on two +real datasets: the Yelp review dataset and the ICLR OpenReview dataset. We +highlight the results that on the ICLR dataset, our mechanisms can +differentiate three quality levels -- human-written reviews, GPT-4-generated +reviews, and GPT-3.5-generated reviews in terms of expected scores. +Additionally, GSPPM penalizes LLM-generated reviews more effectively than GPPM. + +
+
+ comment: Accepted by the Twenty-Fifth ACM Conference on Economics and + Computation (EC'24) +
+
+
+
+
+ + ♻ ☆ Exploring Bias and Prediction Metrics to Characterise the Fairness of + Machine Learning for Equity-Centered Public Health Decision-Making: A + Narrative Review + + +
+ Background: The rapid advancement of Machine Learning (ML) represents novel
+opportunities to enhance public health research, surveillance, and
+decision-making. However, there is a lack of comprehensive understanding of
+algorithmic bias, systematic errors in predicted population health outcomes,
+resulting from the public health application of ML. The objective of this
+narrative review is to explore the types of bias generated by ML and the
+quantitative metrics to assess these biases.
+ Methods: We performed searches on PubMed, MEDLINE, IEEE (Institute of
+Electrical and Electronics Engineers), ACM (Association for Computing
+Machinery) Digital Library, Science Direct, and Springer Nature. We used
+keywords to identify studies describing types of bias and metrics to measure
+these in the domain of ML and public and population health, published in
+English between 2008 and 2023, inclusive.
+ Results: A total of 72 articles met the inclusion criteria. Our review
+identified the commonly described types of bias and quantitative metrics to
+assess these biases from an equity perspective.
+ Conclusion: The review will help formalize the evaluation framework for ML
+on public health from an equity perspective.
+
+
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Domain-Specific Improvement on Psychotherapy Chatbot Using Assistant ICASSP 2024 + + +
+ Large language models (LLMs) have demonstrated impressive generalization
+capabilities on specific tasks with human-written instruction data. However,
+the limited quantity, diversity, and professional expertise of such instruction
+data raise concerns about the performance of LLMs in psychotherapy tasks when
+provided with domain-specific instructions. To address this, we first propose
+Domain-Specific Assistant Instructions based on AlexanderStreet therapy, and
+second, we use an adaptation fine-tuning method and a retrieval-augmented
+generation method to improve pre-trained LLMs. Through quantitative evaluation
+of linguistic quality using automatic and human evaluation, we observe that
+pre-trained LLMs on Psychotherapy Assistant Instructions outperform
+state-of-the-art LLM response baselines. Our Assistant-Instruction approach
+offers a half-annotation method to align pre-trained LLMs with instructions and
+provide pre-trained LLMs with more psychotherapy knowledge.
+
+
+
+ comment: Accepted at ICASSP 2024 EIHRC Workshop +
+
+
+
+
+ + ♻ ☆ Exploring neural oscillations during speech perception via surrogate + gradient spiking neural networks + + +
+ Understanding cognitive processes in the brain demands sophisticated models +capable of replicating neural dynamics at large scales. We present a +physiologically inspired speech recognition architecture, compatible and +scalable with deep learning frameworks, and demonstrate that end-to-end +gradient descent training leads to the emergence of neural oscillations in the +central spiking neural network. Significant cross-frequency couplings, +indicative of these oscillations, are measured within and across network layers +during speech processing, whereas no such interactions are observed when +handling background noise inputs. Furthermore, our findings highlight the +crucial inhibitory role of feedback mechanisms, such as spike frequency +adaptation and recurrent connections, in regulating and synchronising neural +activity to improve recognition performance. Overall, on top of developing our +understanding of synchronisation phenomena notably observed in the human +auditory pathway, our architecture exhibits dynamic and efficient information +processing, with relevance to neuromorphic technology. + +
+
+
+
+
+ + ♻ ☆ Analyzing Diversity in Healthcare LLM Research: A Scientometric + Perspective + + +
+ The deployment of large language models (LLMs) in healthcare has demonstrated +substantial potential for enhancing clinical decision-making, administrative +efficiency, and patient outcomes. However, the underrepresentation of diverse +groups in the development and application of these models can perpetuate +biases, leading to inequitable healthcare delivery. This paper presents a +comprehensive scientometric analysis of LLM research for healthcare, including +data from January 1, 2021, to July 1, 2024. By analyzing metadata from PubMed +and Dimensions, including author affiliations, countries, and funding sources, +we assess the diversity of contributors to LLM research. Our findings highlight +significant gender and geographic disparities, with a predominance of male +authors and contributions primarily from high-income countries (HICs). We +introduce a novel journal diversity index based on Gini diversity to measure +the inclusiveness of scientific publications. Our results underscore the +necessity for greater representation in order to ensure the equitable +application of LLMs in healthcare. We propose actionable strategies to enhance +diversity and inclusivity in artificial intelligence research, with the +ultimate goal of fostering a more inclusive and equitable future in healthcare +innovation. + +
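+ One common way to compute a Gini-style diversity score over, for example, the
+distribution of author countries within a journal is the Gini-Simpson index (a
+sketch; the paper's exact journal diversity index may be defined differently):
+
+```python
+from collections import Counter
+
+def gini_simpson_diversity(labels):
+    """Gini-Simpson diversity 1 - sum(p_i^2); higher values mean more diversity."""
+    counts = Counter(labels)
+    total = sum(counts.values())
+    return 1.0 - sum((c / total) ** 2 for c in counts.values())
+
+print(gini_simpson_diversity(["US", "US", "UK", "IN", "CN"]))  # 0.72
+```
+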
+
+
+
+
+ + ♻ ☆ Sentiment Analysis Across Languages: Evaluation Before and After Machine + Translation to English + + +
+ People communicate in more than 7,000 languages around the world, with around +780 languages spoken in India alone. Despite this linguistic diversity, +research on Sentiment Analysis has predominantly focused on English text data, +resulting in a disproportionate availability of sentiment resources for +English. This paper examines the performance of transformer models in Sentiment +Analysis tasks across multilingual datasets and text that has undergone machine +translation. By comparing the effectiveness of these models in different +linguistic contexts, we gain insights into their performance variations and +potential implications for sentiment analysis across diverse languages. We also +discuss the shortcomings and potential for future work towards the end. + +
+
+ comment: 6 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ ExtractGPT: Exploring the Potential of Large Language Models for Product + Attribute Value Extraction + + +
+ In order to facilitate features such as faceted product search and product
+comparison, e-commerce platforms require accurately structured product data,
+including precise attribute/value pairs. Vendors often provide unstructured
+product descriptions consisting only of an offer title and a textual
+description. Consequently, extracting attribute values from titles and
+descriptions is vital for e-commerce platforms. State-of-the-art attribute
+value extraction methods based on pre-trained language models, such as BERT,
+face two drawbacks: (i) the methods require significant amounts of
+task-specific training data and (ii) the fine-tuned models have problems with
+generalising to unseen attribute values that were not part of the training
+data. This paper explores the potential of using large language models as a
+more training data-efficient and more robust alternative to existing AVE
+methods. We propose prompt templates for describing the target attributes of
+the extraction to the LLM, covering both zero-shot and few-shot scenarios. In
+the zero-shot scenario, textual and JSON-based target schema representations of
+the attributes are compared. In the few-shot scenario, we investigate (i) the
+provision of example attribute values, (ii) the selection of in-context
+demonstrations, (iii) shuffled ensembling to prevent position bias, and (iv)
+fine-tuning the LLM. We evaluate the prompt templates in combination with
+hosted LLMs, such as GPT-3.5 and GPT-4, and open-source LLMs which can be run
+locally. We compare the performance of the LLMs to the PLM-based methods
+SU-OpenTag, AVEQA, and MAVEQA. The highest average F1-score of 86% was achieved
+by GPT-4. Llama-3-70B performs only 3% worse than GPT-4, making it a
+competitive open-source alternative. Given the same training data, this
+prompt/GPT-4 combination outperforms the best PLM baseline by an average of 6%
+F1-score.
+
+
+
+
+
+
+ + ♻ ☆ AMERICANO: Argument Generation with Discourse-driven Decomposition and + Agent Interaction + + +
+ Argument generation is a challenging task in natural language processing, +which requires rigorous reasoning and proper content organization. Inspired by +recent chain-of-thought prompting that breaks down a complex task into +intermediate steps, we propose Americano, a novel framework with agent +interaction for argument generation. Our approach decomposes the generation +process into sequential actions grounded on argumentation theory, which first +executes actions sequentially to generate argumentative discourse components, +and then produces a final argument conditioned on the components. To further +mimic the human writing process and improve the left-to-right generation +paradigm of current autoregressive language models, we introduce an argument +refinement module which automatically evaluates and refines argument drafts +based on feedback received. We evaluate our framework on the task of +counterargument generation using a subset of Reddit/CMV dataset. The results +show that our method outperforms both end-to-end and chain-of-thought prompting +methods and can generate more coherent and persuasive arguments with diverse +and rich contents. + +
+
+ comment: INLG 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models on Spatial Tasks: A Multi-Task + Benchmarking Study + + +
+ The advent of large language models such as ChatGPT, Gemini, and others has +underscored the importance of evaluating their diverse capabilities, ranging +from natural language understanding to code generation. However, their +performance on spatial tasks has not been comprehensively assessed. This study +addresses this gap by introducing a novel multi-task spatial evaluation +dataset, designed to systematically explore and compare the performance of +several advanced models on spatial tasks. The dataset encompasses twelve +distinct task types, including spatial understanding and path planning, each +with verified, accurate answers. We evaluated multiple models, including +OpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase +testing approach. Initially, we conducted zero-shot testing, followed by +categorizing the dataset by difficulty and performing prompt tuning tests. +Results indicate that gpt-4o achieved the highest overall accuracy in the first +phase, with an average of 71.3%. Although moonshot-v1-8k slightly +underperformed overall, it surpassed gpt-4o in place name recognition tasks. +The study also highlights the impact of prompt strategies on model performance +in specific tasks. For example, the Chain-of-Thought (COT) strategy increased +gpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot +strategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to +76.3%. + +
+
+
+
+
+ + ♻ ☆ LiveFC: A System for Live Fact-Checking of Audio Streams + + +
+ The advances in the digital era have led to rapid dissemination of +information. This has also aggravated the spread of misinformation and +disinformation. This has potentially serious consequences, such as civil +unrest. While fact-checking aims to combat this, manual fact-checking is +cumbersome and not scalable. While automated fact-checking approaches exist, +they do not operate in real-time and do not always account for spread of +misinformation through different modalities. This is particularly important as +proactive fact-checking on live streams in real-time can help people be +informed of false narratives and prevent catastrophic consequences that may +cause civil unrest. This is particularly relevant with the rapid dissemination +of information through video on social media platforms or other streams like +political rallies and debates. Hence, in this work we develop a platform named +LiveFC, that can aid in fact-checking live audio streams in real-time. LiveFC +has a user-friendly interface that displays the claims detected along with +their veracity and evidence for live streams with associated speakers for +claims from respective segments. The app can be accessed at +http://livefc.factiverse.ai and a screen recording of the demo can be found at +https://bit.ly/3WVAoIw. + +
+
+ comment: Under Review, 11 pages +
+
+
+
+
+ + ♻ ☆ Generalizing Fairness to Generative Language Models via Reformulation of + Non-discrimination Criteria + + +
+ Generative AI, such as large language models, has undergone rapid development +within recent years. As these models become increasingly available to the +public, concerns arise about perpetuating and amplifying harmful biases in +applications. Gender stereotypes can be harmful and limiting for the +individuals they target, whether they consist of misrepresentation or +discrimination. Recognizing gender bias as a pervasive societal construct, this +paper studies how to uncover and quantify the presence of gender biases in +generative language models. In particular, we derive generative AI analogues of +three well-known non-discrimination criteria from classification, namely +independence, separation and sufficiency. To demonstrate these criteria in +action, we design prompts for each of the criteria with a focus on occupational +gender stereotype, specifically utilizing the medical test to introduce the +ground truth in the generative AI context. Our results address the presence of +occupational gender bias within such conversational language models. + +
+
+
+
+
+ + ♻ ☆ A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning KDD + + +
+ Retrieval-augmented generation (RAG) is a framework enabling large language
+models (LLMs) to enhance their accuracy and reduce hallucinations by
+integrating external knowledge bases. In this paper, we introduce a hybrid RAG
+system enhanced through a comprehensive suite of optimizations that
+significantly improve retrieval quality, augment reasoning capabilities, and
+refine numerical computation ability. We refined the text chunks and tables in
+web pages, added attribute predictors to reduce hallucinations, built an LLM
+Knowledge Extractor and a Knowledge Graph Extractor, and finally designed a
+reasoning strategy that uses all the references. We evaluated our system on the
+CRAG dataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and
+online evaluations demonstrate that our system significantly enhances complex
+reasoning capabilities. In local evaluations, we significantly improved
+accuracy and reduced error rates compared to the baseline model, achieving a
+notable increase in scores. Meanwhile, we attained outstanding results in
+online assessments, demonstrating the performance and generalization
+capabilities of the proposed system. The source code for our system is released
+at \url{https://gitlab.aicrowd.com/shizueyy/crag-new}.
+
+
+ comment: Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024 +
+
+
+
+
+ + ♻ ☆ ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search + + +
+ Recent methodologies in LLM self-training mostly rely on LLM generating +responses and filtering those with correct output answers as training data. +This approach often yields a low-quality fine-tuning training set (e.g., +incorrect plans or intermediate reasoning). In this paper, we develop a +reinforced self-training approach, called ReST-MCTS*, based on integrating +process reward guidance with tree search MCTS* for collecting higher-quality +reasoning traces as well as per-step value to train policy and reward models. +ReST-MCTS* circumvents the per-step manual annotation typically used to train +process rewards by tree-search-based reinforcement learning: Given oracle final +correct answers, ReST-MCTS* is able to infer the correct process rewards by +estimating the probability this step can help lead to the correct answer. These +inferred rewards serve dual purposes: they act as value targets for further +refining the process reward model and also facilitate the selection of +high-quality traces for policy model self-training. We first show that the +tree-search policy in ReST-MCTS* achieves higher accuracy compared with prior +LLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same +search budget. We then show that by using traces searched by this tree-search +policy as training data, we can continuously enhance the three language models +for multiple iterations, and outperform other self-training algorithms such as +ReST$^\text{EM}$ and Self-Rewarding LM. + +
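A toy sketch of the core reward-inference idea above: given the oracle final answer, the value of a partial reasoning trace can be approximated by the fraction of sampled completions that reach that answer. The `complete_solution` sampler is a hypothetical stand-in for the policy model, and the MCTS* search machinery itself is omitted.

```python
# Minimal sketch of rollout-based process reward estimation: the value of a
# partial trace is the fraction of completions that reach the known answer.
import random
from typing import Callable, List

def estimate_step_value(question: str,
                        partial_steps: List[str],
                        oracle_answer: str,
                        complete_solution: Callable[[str, List[str]], str],
                        num_rollouts: int = 8) -> float:
    hits = 0
    for _ in range(num_rollouts):
        final_answer = complete_solution(question, partial_steps)
        hits += int(final_answer.strip() == oracle_answer.strip())
    return hits / num_rollouts  # inferred process reward for the last step

if __name__ == "__main__":
    # Toy sampler: answers correctly more often when more steps are present.
    def toy_sampler(question: str, steps: List[str]) -> str:
        return "42" if random.random() < 0.3 + 0.2 * len(steps) else "0"

    value = estimate_step_value("What is 6 * 7?", ["6 * 7 = 42"], "42", toy_sampler)
    print(f"estimated step value: {value:.2f}")
```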
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ FastMem: Fast Memorization of Prompt Improves Context Awareness of Large + Language Models + + +
+ Large language models (LLMs) excel in generating coherent text, but they +often struggle with context awareness, leading to inaccuracies in tasks +requiring faithful adherence to provided information. We introduce FastMem, a +novel method designed to enhance instruction fine-tuned LLMs' context awareness +through fast memorization of the prompt. FastMem maximizes the likelihood of +the prompt before inference by fine-tuning only the last Feed-Forward Network +(FFN) module. This targeted approach ensures efficient optimization without +overfitting, significantly improving the model's ability to comprehend and +accurately follow the context. Our experiments demonstrate substantial gains in +reading comprehension, text summarization and adherence to output structures. +For instance, FastMem improves the accuracy of Llama 3-8B-Inst on the NQ-SWAP +dataset from 59.1% to 71.6%, and reduces the output structure failure rate of +Qwen 1.5-4B-Chat from 34.9% to 25.5%. Extensive experimental results highlight +FastMem's potential to offer a robust solution to enhance the reliability and +accuracy of LLMs in various applications. Our code is available at: +https://github.com/IAAR-Shanghai/FastMem + +
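A rough sketch of the prompt-memorization step, assuming a Hugging Face causal LM: freeze everything except the final feed-forward (MLP) block and minimize the prompt's negative log-likelihood for a few steps before inference. Locating the last FFN via the GPT-2 module path is an assumption that fits that architecture only; this is not the paper's released code.

```python
# Sketch of prompt memorization by updating only the final feed-forward block
# of a causal LM, in the spirit of FastMem. The "transformer.h[-1].mlp" path
# matches GPT-2 style checkpoints; other models name their FFN differently.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # small stand-in model for illustration
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Freeze everything except the last transformer block's MLP/FFN.
last_block = model.transformer.h[-1].mlp
for p in model.parameters():
    p.requires_grad = False
for p in last_block.parameters():
    p.requires_grad = True

prompt = "Context: The meeting was moved to Tuesday at 3pm.\n"
batch = tok(prompt, return_tensors="pt")
optimizer = torch.optim.AdamW(last_block.parameters(), lr=5e-5)

model.train()
for _ in range(10):  # a handful of steps to "memorize" the prompt
    out = model(**batch, labels=batch["input_ids"])  # maximize prompt likelihood
    out.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(f"final prompt NLL: {out.loss.item():.3f}")
```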
+
+
+
+
+ + ♻ ☆ A Formal Perspective on Byte-Pair Encoding ACL 2023 + + +
+ Byte-Pair Encoding (BPE) is a popular algorithm used for tokenizing data in +NLP, despite being devised initially as a compression method. BPE appears to be +a greedy algorithm at face value, but the underlying optimization problem that +BPE seeks to solve has not yet been laid down. We formalize BPE as a +combinatorial optimization problem. Via submodular functions, we prove that the +iterative greedy version is a +$\frac{1}{{\sigma(\boldsymbol{\mu}^\star)}}(1-e^{-{\sigma(\boldsymbol{\mu}^\star)}})$-approximation +of an optimal merge sequence, where ${\sigma(\boldsymbol{\mu}^\star)}$ is the +total backward curvature with respect to the optimal merge sequence +$\boldsymbol{\mu}^\star$. Empirically the lower bound of the approximation is +$\approx 0.37$. + We provide a faster implementation of BPE which improves the runtime +complexity from $\mathcal{O}\left(N M\right)$ to $\mathcal{O}\left(N \log +M\right)$, where $N$ is the sequence length and $M$ is the merge count. +Finally, we optimize the brute-force algorithm for optimal BPE using +memoization. + +
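For reference, a plain greedy BPE trainer that repeatedly merges the most frequent adjacent pair; this is the naive iterative procedure analyzed above, not the faster $\mathcal{O}(N \log M)$ implementation.

```python
# Plain greedy BPE training: repeatedly merge the most frequent adjacent pair.
from collections import Counter
from typing import List, Tuple

def train_bpe(text: str, merge_count: int) -> List[Tuple[str, str]]:
    seq = list(text)
    merges = []
    for _ in range(merge_count):
        pairs = Counter(zip(seq, seq[1:]))
        if not pairs:
            break
        best = max(pairs, key=pairs.get)  # greedy choice of merge
        merges.append(best)
        merged, i = [], 0
        while i < len(seq):
            if i + 1 < len(seq) and (seq[i], seq[i + 1]) == best:
                merged.append(seq[i] + seq[i + 1])
                i += 2
            else:
                merged.append(seq[i])
                i += 1
        seq = merged
    return merges

if __name__ == "__main__":
    print(train_bpe("low lower lowest low low", merge_count=4))
```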
+
+ comment: ACL 2023 +
+
+
+
+
+ + ♻ ☆ Pitfalls and Outlooks in Using COMET + + +
+ Since its introduction, the COMET metric has blazed a trail in the machine
+translation community, given its strong correlation with human judgements of
+translation quality. Its success stems from being a modified pre-trained
+multilingual model finetuned for quality assessment. However, it being a
+machine learning model also gives rise to a new set of pitfalls that may not be
+widely known. We investigate these unexpected behaviours from three aspects: 1)
+technical: obsolete software versions and compute precision; 2) data: empty
+content, language mismatch, and translationese at test time as well as
+distribution and domain biases in training; 3) usage and reporting:
+multi-reference support and model referencing in the literature. All of these
+problems imply that COMET scores are not comparable between papers or even
+technical setups, and we put forward our perspective on fixing each issue.
+Furthermore, we release the SacreCOMET package that can generate a signature
+for the software and model configuration as well as an appropriate citation.
+The goal of this work is to help the community make more sound use of the COMET
+metric.
+
+
+
+
+
+ + ♻ ☆ AudioBench: A Universal Benchmark for Audio Large Language Models + + +
+ We introduce AudioBench, a universal benchmark designed to evaluate Audio
+Large Language Models (AudioLLMs). It encompasses 8 distinct tasks and 26
+datasets, 7 of which are newly proposed. The evaluation targets three main
+aspects: speech understanding, audio scene understanding, and voice
+understanding (paralinguistic). Despite recent advancements, there is still no
+comprehensive benchmark for AudioLLMs' instruction-following capabilities
+conditioned on audio signals. AudioBench addresses this gap by setting up
+datasets as well as desired evaluation metrics. We also evaluated the
+capabilities of five popular models and found that no single model excels
+consistently across all tasks. We outline the research outlook for AudioLLMs
+and anticipate that our open-sourced evaluation toolkit, data, and leaderboard
+will offer a robust testbed for future model developments.
+
+
comment: v3 - Abundant updates to models and evaluation details; Code:
+ https://github.com/AudioLLMs/AudioBench
+
+
+
+
+ + ♻ ☆ Contrasting Linguistic Patterns in Human and LLM-Generated News Text + + +
+ We conduct a quantitative analysis contrasting human-written English news +text with comparable large language model (LLM) output from six different LLMs +that cover three different families and four sizes in total. Our analysis spans +several measurable linguistic dimensions, including morphological, syntactic, +psychometric, and sociolinguistic aspects. The results reveal various +measurable differences between human and AI-generated texts. Human texts +exhibit more scattered sentence length distributions, more variety of +vocabulary, a distinct use of dependency and constituent types, shorter +constituents, and more optimized dependency distances. Humans tend to exhibit +stronger negative emotions (such as fear and disgust) and less joy compared to +text generated by LLMs, with the toxicity of these models increasing as their +size grows. LLM outputs use more numbers, symbols and auxiliaries (suggesting +objective language) than human texts, as well as more pronouns. The sexist bias +prevalent in human text is also expressed by LLMs, and even magnified in all of +them but one. Differences between LLMs and humans are larger than between LLMs. + +
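Two of the surface measures mentioned above (sentence-length spread and vocabulary variety) can be approximated in a few lines; the whitespace and period splitting here is a crude assumption, not the NLP pipeline used for the actual analysis.

```python
# Rough sketch of two surface measures: sentence-length spread (population
# standard deviation of sentence lengths) and type-token ratio.
import statistics

def sentence_length_spread(text: str) -> float:
    lengths = [len(s.split()) for s in text.split(".") if s.strip()]
    return statistics.pstdev(lengths) if len(lengths) > 1 else 0.0

def type_token_ratio(text: str) -> float:
    tokens = text.lower().split()
    return len(set(tokens)) / len(tokens) if tokens else 0.0

human = "The storm hit overnight. Crews worked for hours. Power is back."
llm = "The storm hit the city. The storm caused damage. The storm has passed."
for name, t in [("human", human), ("llm", llm)]:
    print(name, round(sentence_length_spread(t), 2), round(type_token_ratio(t), 2))
```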
+
+ comment: Published at Artificial Intelligence Review vol. 57, 265 +
+
+
+
+
+ + ♻ ☆ LLM-as-a-tutor in EFL Writing Education: Focusing on Evaluation of + Student-LLM Interaction + + +
+ In the context of English as a Foreign Language (EFL) writing education, +LLM-as-a-tutor can assist students by providing real-time feedback on their +essays. However, challenges arise in assessing LLM-as-a-tutor due to differing +standards between educational and general use cases. To bridge this gap, we +integrate pedagogical principles to assess student-LLM interaction. First, we +explore how LLMs can function as English tutors, providing effective essay +feedback tailored to students. Second, we propose three metrics to evaluate +LLM-as-a-tutor specifically designed for EFL writing education, emphasizing +pedagogical aspects. In this process, EFL experts evaluate the feedback from +LLM-as-a-tutor regarding quality and characteristics. On the other hand, EFL +learners assess their learning outcomes from interaction with LLM-as-a-tutor. +This approach lays the groundwork for developing LLMs-as-a-tutor tailored to +the needs of EFL learners, advancing the effectiveness of writing education in +this context. + +
+
+
+
+
+ + ♻ ☆ MLR-Copilot: Autonomous Machine Learning Research based on Large + Language Models Agents + + +
+ Machine learning research, crucial for technological advancements and
+innovation, often faces significant challenges due to its inherent complexity,
+slow pace of experimentation, and the necessity for specialized expertise.
+Motivated by this, we present a new systematic framework, autonomous Machine
+Learning Research with large language models (MLR-Copilot), designed to enhance
+machine learning research productivity through the automatic generation and
+implementation of research ideas using Large Language Model (LLM) agents. The
+framework consists of three phases: research idea generation, experiment
+implementation, and implementation execution. First, existing research papers
+are used to generate hypotheses and experimental plans via IdeaAgent powered by
+LLMs. Next, the implementation generation phase translates these plans into
+executables with ExperimentAgent. This phase leverages retrieved prototype code
+and optionally retrieves candidate models and data. Finally, the execution
+phase, also managed by ExperimentAgent, involves running experiments with
+mechanisms for human feedback and iterative debugging to enhance the likelihood
+of achieving executable research outcomes. We evaluate our framework on five
+machine learning research tasks, and the experimental results show the
+framework's potential to facilitate research progress and innovation.
+
+
+
+
+
+ + ♻ ☆ Show Me the World in My Language: Establishing the First Baseline for + Scene-Text to Scene-Text Translation ICPR 2024 + + +
+ In this work, we study the task of ``visually'' translating scene text from a +source language (e.g., Hindi) to a target language (e.g., English). Visual +translation involves not just the recognition and translation of scene text but +also the generation of the translated image that preserves visual features of +the source scene text, such as font, size, and background. There are several +challenges associated with this task, such as translation with limited context, +deciding between translation and transliteration, accommodating varying text +lengths within fixed spatial boundaries, and preserving the font and background +styles of the source scene text in the target language. To address this +problem, we make the following contributions: (i) We study visual translation +as a standalone problem for the first time in the literature. (ii) We present a +cascaded framework for visual translation that combines state-of-the-art +modules for scene text recognition, machine translation, and scene text +synthesis as a baseline for the task. (iii) We propose a set of task-specific +design enhancements to design a variant of the baseline to obtain performance +improvements. (iv) Currently, the existing related literature lacks any +comprehensive performance evaluation for this novel task. To fill this gap, we +introduce several automatic and user-assisted evaluation metrics designed +explicitly for evaluating visual translation. Further, we evaluate presented +baselines for translating scene text between Hindi and English. Our experiments +demonstrate that although we can effectively perform visual translation over a +large collection of scene text images, the presented baseline only partially +addresses challenges posed by visual translation tasks. We firmly believe that +this new task and the limitations of existing models, as reported in this +paper, should encourage further research in visual translation. + +
+
+ comment: Accepted at ICPR 2024, Project Website: + https://vl2g.github.io/projects/visTrans/ +
+
+
+
+
+ + ♻ ☆ An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference + Acceleration for Large Vision-Language Models ECCV 2024 + + +
+ In this study, we identify the inefficient attention phenomena in Large
+Vision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,
+QwenVL-Chat and Video-LLaVA. We find that the attention computation over
+visual tokens is extremely inefficient in the deep layers of popular LVLMs,
+suggesting a need for a sparser approach compared to textual data handling. To
+this end, we introduce FastV, a versatile plug-and-play method designed to
+optimize computational efficiency by learning adaptive attention patterns in
+early layers and pruning visual tokens in subsequent ones. Our evaluations
+demonstrate FastV's ability to dramatically reduce computational costs (e.g., a
+45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a
+wide range of image and video understanding tasks. The computational efficiency
+and performance trade-off of FastV is highly customizable and
+Pareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve
+a lower budget than that of a 7B-parameter model, while still maintaining
+superior performance. We believe FastV has practical value for the deployment
+of LVLMs on edge devices and in commercial models. Code is released at
+https://github.com/pkunlp-icler/FastV.
+
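A simplified sketch of the pruning idea, assuming access to an early layer's hidden states and attention weights: rank visual tokens by the attention they receive and keep only the top fraction. The tensor shapes and the static keep ratio are illustrative assumptions; the released FastV hooks into specific LVLMs and learns adaptive patterns rather than applying a fixed rule.

```python
# Attention-guided visual token pruning: keep the visual tokens that receive
# the most attention after an early layer, plus all text tokens.
import torch

def prune_visual_tokens(hidden: torch.Tensor,
                        attn: torch.Tensor,
                        visual_slice: slice,
                        keep_ratio: float = 0.5) -> torch.Tensor:
    # hidden: (batch, seq, dim); attn: (batch, heads, seq, seq)
    scores = attn.mean(dim=1).mean(dim=1)          # avg attention received per token
    vis_scores = scores[:, visual_slice]           # restrict to visual tokens
    k = max(1, int(vis_scores.shape[1] * keep_ratio))
    keep_idx = vis_scores.topk(k, dim=1).indices + visual_slice.start
    pruned = []
    for b in range(hidden.shape[0]):
        text_idx = torch.cat([torch.arange(0, visual_slice.start),
                              torch.arange(visual_slice.stop, hidden.shape[1])])
        keep = torch.cat([keep_idx[b], text_idx])
        pruned.append(hidden[b, keep.sort().values])
    return torch.stack(pruned)

hidden = torch.randn(1, 16, 8)                     # 16 tokens, 8-dim toy states
attn = torch.softmax(torch.randn(1, 2, 16, 16), dim=-1)
out = prune_visual_tokens(hidden, attn, visual_slice=slice(2, 12))
print(out.shape)  # visual tokens 2..11 reduced to 5, the 6 text tokens kept
```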
+
+ comment: Accepted to ECCV 2024 (Oral), code is released at + https://github.com/pkunlp-icler/FastV, +
+
+
+
+
+ + ♻ ☆ CHiSafetyBench: A Chinese Hierarchical Safety Benchmark for Large + Language Models + + +
+ With the profound development of large language models (LLMs), their safety
+concerns have garnered increasing attention. However, there is a scarcity of
+Chinese safety benchmarks for LLMs, and the existing safety taxonomies are
+inadequate, lacking comprehensive safety detection capabilities in authentic
+Chinese scenarios. In this work, we introduce CHiSafetyBench, a dedicated
+safety benchmark for evaluating LLMs' capabilities in identifying risky content
+and refusing to answer risky questions in Chinese contexts. CHiSafetyBench
+incorporates a dataset that covers a hierarchical Chinese safety taxonomy
+consisting of 5 risk areas and 31 categories. This dataset comprises two types
+of tasks: multiple-choice questions and question-answering, evaluating LLMs
+from the perspectives of risk content identification and the ability to refuse
+to answer risky questions, respectively. Utilizing this benchmark, we validate
+the feasibility of automatic evaluation as a substitute for human evaluation
+and conduct comprehensive automatic safety assessments on mainstream Chinese
+LLMs. Our experiments reveal the varying performance of different models across
+various safety domains, indicating that all models possess considerable
+potential for improvement in Chinese safety capabilities. Our dataset is
+publicly available at
+https://github.com/UnicomAI/UnicomBenchmark/tree/main/CHiSafetyBench.
+
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ ACORN: Aspect-wise Commonsense Reasoning Explanation Evaluation + + +
+ Evaluating the quality of free-text explanations is a multifaceted, +subjective, and labor-intensive task. Large language models (LLMs) present an +appealing alternative due to their potential for consistency, scalability, and +cost-efficiency. In this work, we present ACORN, a new dataset of 3,500 +free-text explanations and aspect-wise quality ratings, and use it to evaluate +how LLMs rate explanations. We observed that larger models outputted labels +that maintained or increased the inter-annotator agreement, suggesting that +they are within the expected variance between human raters. However, their +correlation with majority-voted human ratings varied across different quality +aspects, indicating that they are not a complete replacement. In turn, using +LLMs as a supplement to a smaller group of human raters in some cases improved +the correlation with the original majority labels. However, the effect was +limited to cases where human raters were scarce, and an additional human rater +had a more pronounced effect in all cases. Overall, we recommend against using +LLMs as a complete replacement for human raters but encourage using them in +configurations that end with targeted human involvement. Data available here: +https://github.com/a-brassard/ACORN + +
+
+ comment: 18 pages, 7 figures, accepted to COLM 2024. Data available here: + https://github.com/a-brassard/ACORN +
+
+
+
+
+ + ♻ ☆ MM-Soc: Benchmarking Multimodal Large Language Models in Social Media + Platforms ACL 2024 + + +
+ Social media platforms are hubs for multimodal information exchange, +encompassing text, images, and videos, making it challenging for machines to +comprehend the information or emotions associated with interactions in online +spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising +solution to these challenges, yet they struggle to accurately interpret human +emotions and complex content such as misinformation. This paper introduces +MM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of +multimodal social media content. MM-Soc compiles prominent multimodal datasets +and incorporates a novel large-scale YouTube tagging dataset, targeting a range +of tasks from misinformation detection, hate speech detection, and social +context generation. Through our exhaustive evaluation on ten size-variants of +four open-source MLLMs, we have identified significant performance disparities, +highlighting the need for advancements in models' social understanding +capabilities. Our analysis reveals that, in a zero-shot setting, various types +of MLLMs generally exhibit difficulties in handling social media tasks. +However, MLLMs demonstrate performance improvements post fine-tuning, +suggesting potential pathways for improvement. Our code and data are available +at https://github.com/claws-lab/MMSoc.git. + +
+
+ comment: In Proceedings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Persuasion Games using Large Language Models + + +
+ Large Language Models (LLMs) have emerged as formidable instruments capable
+of comprehending and producing human-like text. This paper explores the
+potential of LLMs to shape user perspectives and subsequently influence their
+decisions on particular tasks. This capability finds applications in diverse
+domains such as investment, credit cards, and insurance, wherein they assist
+users in selecting appropriate insurance policies, investment plans, credit
+cards, and retail products, as well as in Behavioral Change Support Systems
+(BCSS).
+ We present a sophisticated multi-agent framework wherein a consortium of
+agents operates in a collaborative manner. The primary agent engages directly
+with user agents through persuasive dialogue, while the auxiliary agents
+perform tasks such as information retrieval, response analysis, development of
+persuasion strategies, and validation of facts. Empirical evidence from our
+experiments demonstrates that this collaborative methodology significantly
+enhances the persuasive efficacy of the LLM. We continuously analyze the
+resistance of the user agent to persuasive efforts and counteract it by
+employing a combination of rule-based and LLM-based resistance-persuasion
+mapping techniques.
+ We employ simulated personas and generate conversations in the insurance,
+banking, and retail domains to evaluate the proficiency of large language
+models (LLMs) in recognizing, adjusting to, and influencing various personality
+types. Concurrently, we examine the resistance mechanisms employed by
+LLM-simulated personas. Persuasion is quantified via measurable surveys before
+and after interaction, LLM-generated scores on the conversation, and user
+decisions (purchase or non-purchase).
+
+
+
+
+
+ + ♻ ☆ Cultural Compass: Predicting Transfer Learning Success in Offensive + Language Detection with Cultural Features EMNLP 2023 + + +
+ The increasing ubiquity of language technology necessitates a shift towards +considering cultural diversity in the machine learning realm, particularly for +subjective tasks that rely heavily on cultural nuances, such as Offensive +Language Detection (OLD). Current understanding underscores that these tasks +are substantially influenced by cultural values, however, a notable gap exists +in determining if cultural features can accurately predict the success of +cross-cultural transfer learning for such subjective tasks. Addressing this, +our study delves into the intersection of cultural features and transfer +learning effectiveness. The findings reveal that cultural value surveys indeed +possess a predictive power for cross-cultural transfer learning success in OLD +tasks and that it can be further improved using offensive word distance. Based +on these results, we advocate for the integration of cultural information into +datasets. Additionally, we recommend leveraging data sources rich in cultural +information, such as surveys, to enhance cultural adaptability. Our research +signifies a step forward in the quest for more inclusive, culturally sensitive +language technologies. + +
+
+ comment: Findings of EMNLP 2023 (update) +
+
+
+
+
+ + ♻ ☆ From Wide to Deep: Dimension Lifting Network for Parameter-efficient + Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE), which maps entities and relations into vector
+representations, is essential for downstream applications. Conventional KGE
+methods require high-dimensional representations to learn the complex structure
+of a knowledge graph, but this leads to oversized model parameters. Recent
+advances reduce parameters via low-dimensional entity representations, while
+developing techniques (e.g., knowledge distillation or reinvented
+representation forms) to compensate for the reduced dimension. However, such
+operations introduce complicated computations and model designs that may not
+benefit large knowledge graphs. To seek a simple strategy to improve the
+parameter efficiency of conventional KGE models, we take inspiration from the
+observation that deeper neural networks require exponentially fewer parameters
+to achieve expressiveness comparable to wider networks for compositional
+structures. We view all entity representations as a single-layer embedding
+network, and conventional KGE methods that adopt high-dimensional entity
+representations amount to widening the embedding network to gain
+expressiveness. To achieve parameter efficiency, we instead propose a deeper
+embedding network for entity representations, i.e., a narrow entity embedding
+layer plus a multi-layer dimension lifting network (LiftNet). Experiments on
+three public datasets show that by integrating LiftNet, four conventional KGE
+methods with 16-dimensional representations achieve link prediction accuracy
+comparable to the original models that adopt 512-dimensional representations,
+saving 68.4% to 96.9% of the parameters.
+
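A sketch of the narrow-embedding-plus-lifting idea: a 16-dimensional entity table followed by a small multi-layer network that lifts vectors to the 512-dimensional space a conventional scorer expects. Hidden widths and depth are illustrative assumptions, not the exact LiftNet configuration.

```python
# Narrow entity embedding plus a multi-layer dimension lifting network.
import torch
import torch.nn as nn

class LiftedEntityEmbedding(nn.Module):
    def __init__(self, num_entities: int, narrow_dim: int = 16,
                 lifted_dim: int = 512, hidden_dim: int = 64, depth: int = 3):
        super().__init__()
        self.embed = nn.Embedding(num_entities, narrow_dim)  # few parameters
        dims = [narrow_dim] + [hidden_dim] * (depth - 1) + [lifted_dim]
        layers = []
        for d_in, d_out in zip(dims[:-1], dims[1:]):
            layers += [nn.Linear(d_in, d_out), nn.ReLU()]
        self.lift = nn.Sequential(*layers[:-1])  # drop the final activation

    def forward(self, entity_ids: torch.Tensor) -> torch.Tensor:
        return self.lift(self.embed(entity_ids))

model = LiftedEntityEmbedding(num_entities=10_000)
print(model(torch.tensor([3, 7])).shape)      # torch.Size([2, 512])
# Embedding table: 10,000 x 16 parameters versus 10,000 x 512 for a wide table.
print(10_000 * 16, "vs", 10_000 * 512)
```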
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 44 + +
+
+
+ + ♻ ☆ Inter-Frame Compression for Dynamic Point Cloud Geometry Coding + + +
+ Efficient point cloud compression is essential for applications like virtual +and mixed reality, autonomous driving, and cultural heritage. This paper +proposes a deep learning-based inter-frame encoding scheme for dynamic point +cloud geometry compression. We propose a lossy geometry compression scheme that +predicts the latent representation of the current frame using the previous +frame by employing a novel feature space inter-prediction network. The proposed +network utilizes sparse convolutions with hierarchical multiscale 3D feature +learning to encode the current frame using the previous frame. The proposed +method introduces a novel predictor network for motion compensation in the +feature domain to map the latent representation of the previous frame to the +coordinates of the current frame to predict the current frame's feature +embedding. The framework transmits the residual of the predicted features and +the actual features by compressing them using a learned probabilistic +factorized entropy model. At the receiver, the decoder hierarchically +reconstructs the current frame by progressively rescaling the feature +embedding. The proposed framework is compared to the state-of-the-art +Video-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud +Compression (G-PCC) schemes standardized by the Moving Picture Experts Group +(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta +Rate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against +G-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame +encoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based +inter-frame encoding mode using HEVC. These significant performance gains are +cross-checked and verified in the MPEG working group. + +
+
+
+
+
+ + ♻ ☆ SEDMamba: Enhancing Selective State Space Modelling with Bottleneck + Mechanism and Fine-to-Coarse Temporal Fusion for Efficient Error Detection in + Robot-Assisted Surgery + + +
+ Automated detection of surgical errors can improve robotic-assisted surgery. +Despite promising progress, existing methods still face challenges in capturing +rich temporal context to establish long-term dependencies while maintaining +computational efficiency. In this paper, we propose a novel hierarchical model +named SEDMamba, which incorporates the selective state space model (SSM) into +surgical error detection, facilitating efficient long sequence modelling with +linear complexity. SEDMamba enhances selective SSM with a bottleneck mechanism +and fine-to-coarse temporal fusion (FCTF) to detect and temporally localize +surgical errors in long videos. The bottleneck mechanism compresses and +restores features within their spatial dimension, thereby reducing +computational complexity. FCTF utilizes multiple dilated 1D convolutional +layers to merge temporal information across diverse scale ranges, accommodating +errors of varying duration. Our work also contributes the first-of-its-kind, +frame-level, in-vivo surgical error dataset to support error detection in real +surgical cases. Specifically, we deploy the clinically validated observational +clinical human reliability assessment tool (OCHRA) to annotate the errors +during suturing tasks in an open-source radical prostatectomy dataset +(SAR-RARP50). Experimental results demonstrate that our SEDMamba outperforms +state-of-the-art methods with at least 1.82% AUC and 3.80% AP performance gains +with significantly reduced computational complexity. The corresponding error +annotations, code and models will be released at +https://github.com/wzjialang/SEDMamba. + +
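A sketch of the fine-to-coarse temporal fusion idea: parallel dilated 1D convolutions with growing dilation cover progressively longer temporal ranges, and their outputs are summed into one signal. The channel width and dilation schedule are assumptions, not the released SEDMamba configuration.

```python
# Fine-to-coarse temporal fusion via stacked dilated 1D convolutions whose
# outputs (same temporal length) are summed across dilation scales.
import torch
import torch.nn as nn

class FineToCoarseTemporalFusion(nn.Module):
    def __init__(self, channels: int = 64, dilations=(1, 2, 4, 8)):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=3,
                      padding=d, dilation=d)     # padding=d keeps the length
            for d in dilations
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, num_frames)
        return torch.stack([branch(x) for branch in self.branches]).sum(dim=0)

frames = torch.randn(2, 64, 300)     # 2 clips, 64-d frame features, 300 frames
fused = FineToCoarseTemporalFusion()(frames)
print(fused.shape)                   # torch.Size([2, 64, 300])
```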
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Efficient Video Object Segmentation via Modulated Cross-Attention Memory WACV 2025 + + +
+ Recently, transformer-based approaches have shown promising results for +semi-supervised video object segmentation. However, these approaches typically +struggle on long videos due to increased GPU memory demands, as they frequently +expand the memory bank every few frames. We propose a transformer-based +approach, named MAVOS, that introduces an optimized and dynamic long-term +modulated cross-attention (MCA) memory to model temporal smoothness without +requiring frequent memory expansion. The proposed MCA effectively encodes both +local and global features at various levels of granularity while efficiently +maintaining consistent speed regardless of the video length. Extensive +experiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017, +demonstrate the effectiveness of our proposed contributions leading to +real-time inference and markedly reduced memory demands without any degradation +in segmentation accuracy on long videos. Compared to the best existing +transformer-based approach, our MAVOS increases the speed by 7.6x, while +significantly reducing the GPU memory by 87% with comparable segmentation +performance on short and long video datasets. Notably on the LVOS dataset, our +MAVOS achieves a J&F score of 63.3% while operating at 37 frames per second +(FPS) on a single V100 GPU. Our code and models will be publicly available at: +https://github.com/Amshaker/MAVOS. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation + and Retrieval-Guidance + + +
+ Diffusion-based models demonstrate impressive generation capabilities.
+However, they also have a massive number of parameters, resulting in enormous
+model sizes, thus making them unsuitable for deployment on resource-constrained
+devices. Block-wise generation can be a promising alternative for designing
+compact-sized (parameter-efficient) deep generative models since the model can
+generate one block at a time instead of generating the whole image at once.
+However, block-wise generation is also considerably challenging because
+ensuring coherence across generated blocks can be non-trivial. To this end, we
+design a retrieval-augmented generation (RAG) approach and leverage the
+corresponding blocks of the images retrieved by the RAG module to condition the
+training and generation stages of a block-wise denoising diffusion model. Our
+conditioning schemes ensure coherence across the different blocks during
+training and, consequently, during generation. While we showcase our approach
+using the latent diffusion model (LDM) as the base model, it can be used with
+other variants of denoising diffusion models. We validate that the proposed
+approach solves the coherence problem through substantive experiments that
+demonstrate its effectiveness in achieving a compact model size and excellent
+generation quality.
+
+
+
+
+
+ + ♻ ☆ Multi-Visual-Inertial System: Analysis, Calibration and Estimation + + +
+ In this paper, we study state estimation of multi-visual-inertial systems
+(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary
+number of asynchronous inertial measurement units (IMUs) or gyroscopes and
+global and/or rolling shutter cameras. We are especially interested in the
+full calibration of the associated visual-inertial sensors, including the IMU
+or camera intrinsics and the IMU-IMU (or camera) spatiotemporal extrinsics as
+well as the image readout time of rolling-shutter cameras (if used). To this
+end, we develop a new analytic combined IMU integration with intrinsics, termed
+ACI3, to preintegrate IMU measurements, which is leveraged to fuse auxiliary
+IMUs and/or gyroscopes alongside a base IMU. We model the multi-inertial
+measurements to include all the necessary inertial intrinsic and IMU-IMU
+spatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body
+constraints to eliminate the necessity of auxiliary inertial poses and thus
+reduce computational complexity. By performing observability analysis of
+MVIS, we prove that the standard four unobservable directions remain, no
+matter how many inertial sensors are used, and also identify, for the first
+time, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary
+inertial intrinsics. In addition to the extensive simulations that validate our
+analysis and algorithms, we have built our own MVIS sensor rig and collected
+over 25 real-world datasets to experimentally verify the proposed calibration
+against state-of-the-art calibration methods such as Kalibr. We show that
+the proposed MVIS calibration achieves competitive accuracy with improved
+convergence and repeatability, and it is open sourced to better benefit the
+community.
+
+
+
+
+
+ + ♻ ☆ On Evaluating Adversarial Robustness of Volumetric Medical Segmentation + Models + + +
+ Volumetric medical segmentation models have achieved significant success on +organ and tumor-based segmentation tasks in recent years. However, their +vulnerability to adversarial attacks remains largely unexplored, raising +serious concerns regarding the real-world deployment of tools employing such +models in the healthcare sector. This underscores the importance of +investigating the robustness of existing models. In this context, our work aims +to empirically examine the adversarial robustness across current volumetric +segmentation architectures, encompassing Convolutional, Transformer, and +Mamba-based models. We extend this investigation across four volumetric +segmentation datasets, evaluating robustness under both white box and black box +adversarial attacks. Overall, we observe that while both pixel and +frequency-based attacks perform reasonably well under \emph{white box} setting, +the latter performs significantly better under transfer-based black box +attacks. Across our experiments, we observe transformer-based models show +higher robustness than convolution-based models with Mamba-based models being +the most vulnerable. Additionally, we show that large-scale training of +volumetric segmentation models improves the model's robustness against +adversarial attacks. The code and robust models are available at +https://github.com/HashmatShadab/Robustness-of-Volumetric-Medical-Segmentation-Models. + +
+
+ comment: Accepted at British Machine Vision Conference 2024 +
+
+
+
+
+ + ♻ ☆ Training-free Long Video Generation with Chain of Diffusion Model + Experts + + +
+ Video generation models hold substantial potential in areas such as
+filmmaking. However, current video diffusion models incur high computational
+costs and produce suboptimal results due to the high complexity of the video
+generation task. In this paper, we propose \textbf{ConFiner}, an efficient
+high-quality video generation framework that decouples video generation into
+easier subtasks: structure \textbf{con}trol and spatial-temporal
+re\textbf{fine}ment. It can generate high-quality videos with a chain of
+off-the-shelf diffusion model experts, each expert responsible for a decoupled
+subtask. During the refinement, we introduce coordinated denoising, which can
+merge multiple diffusion experts' capabilities into a single sampling process.
+Furthermore, we design the ConFiner-Long framework, which can generate long,
+coherent videos with three constraint strategies on ConFiner. Experimental
+results indicate that with only 10\% of the inference cost, our ConFiner
+surpasses representative models like Lavie and Modelscope across all objective
+and subjective metrics, and ConFiner-Long can generate high-quality and
+coherent videos with up to 600 frames.
+
+
+
+
+
+ + ♻ ☆ TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos + + +
+ We propose TRAM, a two-stage method to reconstruct a human's global +trajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover +the camera motion in the presence of dynamic humans and uses the scene +background to derive the motion scale. Using the recovered camera as a +metric-scale reference frame, we introduce a video transformer model (VIMO) to +regress the kinematic body motion of a human. By composing the two motions, we +achieve accurate recovery of 3D humans in the world space, reducing global +motion errors by a large margin from prior work. +https://yufu-wang.github.io/tram4d/ + +
+
+ comment: The project website: https://yufu-wang.github.io/tram4d/ +
+
+
+
+
+ + ♻ ☆ Privacy-Aware Document Visual Question Answering ICDAR 2024 + + +
+ Document Visual Question Answering (DocVQA) has quickly grown into a central
+task of document understanding. But despite the fact that documents contain
+sensitive or copyrighted information, none of the current DocVQA methods offers
+strong privacy guarantees. In this work, we explore privacy in the domain of
+DocVQA for the first time, highlighting privacy issues in state-of-the-art
+multi-modal LLMs used for DocVQA, and explore possible solutions.
+Specifically, we focus on invoice processing as a realistic document
+understanding scenario, and propose a large-scale DocVQA dataset comprising
+invoice documents and associated questions and answers. We employ a federated
+learning scheme that reflects the real-life distribution of documents in
+different businesses, and we explore the use case where the data of the invoice
+provider is the sensitive information to be protected. We demonstrate that
+non-private models tend to memorise, a behaviour that can lead to exposing
+private information. We then evaluate baseline training schemes employing
+federated learning and differential privacy in this multi-modal scenario, where
+the sensitive information might be exposed through either or both of the two
+input modalities: vision (document image) or language (OCR tokens). Finally, we
+design attacks exploiting the memorisation effect of the model, and demonstrate
+their effectiveness in probing a representative DocVQA model.
+
+
+ comment: 35 pages, 12 figures, accepted for publication at the 18th + International Conference on Document Analysis and Recognition, ICDAR 2024 +
+
+
+
+
+ + ♻ ☆ Does Data-Efficient Generalization Exacerbate Bias in Foundation Models? ECCV 2024 + + +
+ Foundation models have emerged as robust models with label efficiency in +diverse domains. In medical imaging, these models contribute to the advancement +of medical diagnoses due to the difficulty in obtaining labeled data. However, +it is unclear whether using a large amount of unlabeled data, biased by the +presence of sensitive attributes during pre-training, influences the fairness +of the model. This research examines the bias in the Foundation model +(RetFound) when it is applied to fine-tune the Brazilian Multilabel +Ophthalmological Dataset (BRSET), which has a different population than the +pre-training dataset. The model evaluation, in comparison with supervised +learning, shows that the Foundation Model has the potential to reduce the gap +between the maximum AUC and minimum AUC evaluations across gender and age +groups. However, in a data-efficient generalization, the model increases the +bias when the data amount decreases. These findings suggest that when deploying +a Foundation Model in real-life scenarios with limited data, the possibility of +fairness issues should be considered. + +
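The fairness summary used above (the gap between the best and worst subgroup AUC) can be computed directly; the toy labels, scores, and group names below are purely illustrative.

```python
# Per-group AUCs and the max-min AUC gap across demographic groups.
import numpy as np
from sklearn.metrics import roc_auc_score

def auc_gap(y_true, y_score, groups):
    y_true, y_score, groups = map(np.asarray, (y_true, y_score, groups))
    aucs = {g: roc_auc_score(y_true[groups == g], y_score[groups == g])
            for g in np.unique(groups)}
    return aucs, max(aucs.values()) - min(aucs.values())

y_true  = [0, 1, 1, 0, 1, 0, 1, 0]
y_score = [0.2, 0.9, 0.7, 0.4, 0.6, 0.3, 0.45, 0.5]
groups  = ["F", "F", "F", "F", "M", "M", "M", "M"]
per_group, gap = auc_gap(y_true, y_score, groups)
print(per_group, f"max-min AUC gap = {gap:.3f}")
```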
+
+ comment: Preprint of paper to be presented at Fairness and Ethics Towards + Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during + ECCV 2024 +
+
+
+
+
+ + ♻ ☆ LSMS: Language-guided Scale-aware MedSegmentor for Medical Image + Referring Segmentation + + +
+ Conventional medical image segmentation methods have been found inadequate in
+facilitating physicians with the identification of specific lesions for
+diagnosis and treatment. Given the utility of text as an instructional format,
+we introduce a novel task termed Medical Image Referring Segmentation (MIRS),
+which requires segmenting specified lesions in images based on the given
+language expressions. Due to the varying object scales in medical images, MIRS
+demands robust vision-language modeling and comprehensive multi-scale
+interaction for precise localization and segmentation under linguistic
+guidance. However, existing medical image segmentation methods fall short in
+meeting these demands, resulting in insufficient segmentation accuracy. In
+response, we propose an approach named Language-guided Scale-aware MedSegmentor
+(LSMS), incorporating two appealing designs: (1) a Scale-aware Vision-Language
+Attention module that leverages diverse convolutional kernels to acquire rich
+visual knowledge and interact closely with linguistic features, thereby
+enhancing lesion localization capability; (2) a Full-Scale Decoder that
+globally models multi-modal features across various scales, capturing
+complementary information between scales to accurately outline lesion
+boundaries. Addressing the lack of suitable datasets for MIRS, we constructed a
+vision-language medical dataset called Reference Hepatic Lesion Segmentation
+(RefHL-Seg). This dataset comprises 2,283 abdominal CT slices from 231 cases,
+with corresponding textual annotations and segmentation masks for various liver
+lesions in images. We validated the performance of LSMS for MIRS and
+conventional medical image segmentation tasks across various datasets. Our LSMS
+consistently outperforms prior methods on all datasets with lower computational
+costs. The code and datasets will be released.
+
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Implicit Concept Removal of Diffusion Models ECCV2024 + + +
+ Text-to-image (T2I) diffusion models often inadvertently generate unwanted
+concepts such as watermarks and unsafe images. These concepts, termed
+"implicit concepts", can be unintentionally learned during training and then
+be generated uncontrollably during inference. Existing removal methods still
+struggle to eliminate implicit concepts, primarily due to their dependency on
+the model's ability to recognize concepts it actually cannot discern. To
+address this, we utilize the intrinsic geometric characteristics of implicit
+concepts and present Geom-Erasing, a novel concept removal method based on
+geometric-driven control. Specifically, once an unwanted implicit concept
+is identified, we integrate the existence and geometric information of the
+concept into the text prompts with the help of an accessible classifier or
+detector model. Subsequently, the model is optimized to identify and
+disentangle this information, which is then adopted as negative prompts during
+generation. Moreover, we introduce the Implicit Concept Dataset (ICD), a novel
+image-text dataset imbued with three typical implicit concepts (i.e., QR codes,
+watermarks, and text), reflecting real-life situations where implicit concepts
+are easily injected. Geom-Erasing effectively mitigates the generation of
+implicit concepts, achieving state-of-the-art results on the Inappropriate
+Image Prompts (I2P) and our challenging Implicit Concept Dataset (ICD)
+benchmarks.
+
+
+ comment: Accepted by ECCV2024. Project Page: + https://kaichen1998.github.io/projects/geom-erasing/ +
+
+
+
+
+ + ♻ ☆ GUing: A Mobile GUI Search Engine using a Vision-Language Model + + +
+ App developers use the Graphical User Interface (GUI) of other apps as a
+source of inspiration for designing and improving their own apps. Recent
+research has thus suggested retrieving relevant GUI designs that match a
+certain text query from screenshot datasets acquired through crowdsourced or
+automated exploration of GUIs. However, such text-to-GUI retrieval approaches
+only leverage the textual information of the GUI elements, neglecting visual
+information such as icons or background images. In addition, retrieved
+screenshots are not steered by app developers and often lack important app
+features that require particular input data.
+ To overcome these limitations, this paper proposes GUing, a GUI search engine
+based on a vision-language model called GUIClip, which we trained specifically
+for the problem of designing app GUIs. For this, we first collected app
+introduction images from Google Play, which usually display the most
+representative screenshots and are often captioned (i.e., labeled) by app
+vendors. Then, we developed an automated pipeline to classify, crop, and
+extract the captions from these images. This resulted in a large dataset, which
+we share with this paper, including 303k app screenshots, of which 135k have
+captions. We used this dataset to train a novel vision-language model, which
+is, to the best of our knowledge, the first of its kind in GUI retrieval. We
+evaluated our approach on various datasets from related work and in a manual
+experiment. The results demonstrate that our model outperforms previous
+approaches in text-to-GUI retrieval, achieving a Recall@10 of up to 0.69 and a
+HIT@10 of 0.91. We also explored the performance of GUIClip for other GUI tasks
+including GUI classification and sketch-to-GUI retrieval, with encouraging
+results.
+
+
+
+
+
+ + ♻ ☆ An open dataset for oracle bone script recognition and decipherment + + +
+ Oracle bone script, one of the earliest known forms of ancient Chinese +writing, presents invaluable research materials for scholars studying the +humanities and geography of the Shang Dynasty, dating back 3,000 years. The +immense historical and cultural significance of these writings cannot be +overstated. However, the passage of time has obscured much of their meaning, +presenting a significant challenge in deciphering these ancient texts. With the +advent of Artificial Intelligence (AI), employing AI to assist in deciphering +Oracle Bone Characters (OBCs) has become a feasible option. Yet, progress in +this area has been hindered by a lack of high-quality datasets. To address this +issue, this paper details the creation of the HUST-OBC dataset. This dataset +encompasses 77,064 images of 1,588 individual deciphered characters and 62,989 +images of 9,411 undeciphered characters, with a total of 140,053 images, +compiled from diverse sources. The hope is that this dataset could inspire and +assist future research in deciphering those unknown OBCs. All the codes and +datasets are available at https://github.com/Yuliang-Liu/Open-Oracle. + +
+
+
+
+
+ + ♻ ☆ SABER-6D: Shape Representation Based Implicit Object Pose Estimation ECCV 2024 + + +
+ In this paper, we propose a novel encoder-decoder architecture, named SABER, +to learn the 6D pose of the object in the embedding space by learning shape +representation at a given pose. This model enables us to learn pose by +performing shape representation at a target pose from RGB image input. We +perform shape representation as an auxiliary task which helps us in learning +rotations space for an object based on 2D images. An image encoder predicts the +rotation in the embedding space and the DeepSDF based decoder learns to +represent the object's shape at the given pose. As our approach is shape based, +the pipeline is suitable for any type of object irrespective of the symmetry. +Moreover, we need only a CAD model of the objects to train SABER. Our pipeline +is synthetic data based and can also handle symmetric objects without symmetry +labels and, thus, no additional labeled training data is needed. The +experimental evaluation shows that our method achieves close to benchmark +results for both symmetric objects and asymmetric objects on Occlusion-LineMOD, +and T-LESS datasets. + +
+
+ comment: ECCV 2024 R6D workshop +
+
+
+
+
+ + ♻ ☆ A Deep-Learning-Based Label-free No-Reference Image Quality Assessment + Metric: Application in Sodium MRI Denoising + + +
+ New multinuclear MRI techniques, such as sodium MRI, generally suffer from
+low image quality due to an inherently low signal. Postprocessing methods, such
+as image denoising, have been developed for image enhancement. However, the
+assessment of these enhanced images is challenging, especially when
+high-resolution, high-signal images are unavailable as references, as is the
+case in sodium MRI. No-reference Image Quality Assessment (NR-IQA) metrics are
+approaches to solve this problem. Existing learning-based NR-IQA metrics rely
+on labels derived from subjective human opinions or metrics like
+Signal-to-Noise Ratio (SNR), which are either time-consuming or lack accurate
+ground truths, resulting in unreliable assessment. We note that deep learning
+(DL) models have a unique characteristic in that they are specialized to their
+training set, meaning that deviations of the input testing data from the
+training data will reduce prediction accuracy. Therefore, we propose a novel
+DL-based NR-IQA metric, the Model Specialization Metric (MSM), which does not
+depend on ground-truth images or labels. MSM measures the difference between
+the input image and the model's prediction to evaluate the quality of the input
+image. Experiments conducted on both simulated distorted proton T1-weighted MR
+images and denoised sodium MR images demonstrate that MSM exhibits superior
+evaluation performance on various simulated noises and distortions. MSM also
+shows substantial agreement with expert evaluations, achieving an average
+Cohen's Kappa coefficient of 0.6528, outperforming existing NR-IQA metrics.
+
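A rough sketch of the underlying idea: run the image through a model specialized to clean training data and use the input-versus-prediction discrepancy as a no-reference quality score. The tiny untrained autoencoder below is purely a placeholder, not the paper's model.

```python
# Specialization-based no-reference scoring: larger input-vs-prediction
# discrepancy means the image deviates more from the model's training data,
# which is read as lower quality.
import torch
import torch.nn as nn

class TinyDenoiser(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(),
            nn.Conv2d(8, 1, 3, padding=1),
        )

    def forward(self, x):
        return self.net(x)

@torch.no_grad()
def specialization_score(model: nn.Module, image: torch.Tensor) -> float:
    return torch.mean((model(image) - image) ** 2).item()

model = TinyDenoiser().eval()
clean = torch.rand(1, 1, 32, 32)
noisy = clean + 0.5 * torch.randn_like(clean)
print(specialization_score(model, clean), specialization_score(model, noisy))
```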
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State + Space Model ACM MM 2024 + + +
+ Existing Transformer-based models for point cloud analysis suffer from +quadratic complexity, leading to compromised point cloud resolution and +information loss. In contrast, the newly proposed Mamba model, based on state +space models (SSM), outperforms Transformer in multiple areas with only linear +complexity. However, the straightforward adoption of Mamba does not achieve +satisfactory performance on point cloud tasks. In this work, we present +Mamba3D, a state space model tailored for point cloud learning to enhance local +feature extraction, achieving superior performance, high efficiency, and +scalability potential. Specifically, we propose a simple yet effective Local +Norm Pooling (LNP) block to extract local geometric features. Additionally, to +obtain better global features, we introduce a bidirectional SSM (bi-SSM) with +both a token forward SSM and a novel backward SSM that operates on the feature +channel. Extensive experimental results show that Mamba3D surpasses +Transformer-based counterparts and concurrent works in multiple tasks, with or +without pre-training. Notably, Mamba3D achieves multiple SoTA, including an +overall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1% +(with single-modal pre-training) on the ModelNet40 classification task, with +only linear complexity. Our code and weights are available at +https://github.com/xhanxu/Mamba3D. + +
+
+ comment: ACM MM 2024. Code and weights are available at + https://github.com/xhanxu/Mamba3D +
+
+
+
+
+ + ♻ ☆ Enabling Local Editing in Diffusion Models by Joint and Individual + Component Analysis BMVC2024 + + +
+ Recent advances in Diffusion Models (DMs) have led to significant progress in
+visual synthesis and editing tasks, establishing them as a strong competitor to
+Generative Adversarial Networks (GANs). However, the latent space of DMs is not
+as well understood as that of GANs. Recent research has focused on unsupervised
+semantic discovery in the latent space of DMs by leveraging the bottleneck
+layer of the denoising network, which has been shown to exhibit properties of a
+semantic latent space. However, these approaches are limited to discovering
+global attributes. In this paper, we address the challenge of local image
+manipulation in DMs and introduce an unsupervised method to factorize the
+latent semantics learned by the denoising network of pre-trained DMs. Given an
+arbitrary image and defined regions of interest, we utilize the Jacobian of the
+denoising network to establish a relation between the regions of interest and
+their corresponding subspaces in the latent space. Furthermore, we disentangle
+the joint and individual components of these subspaces to identify latent
+directions that enable local image manipulation. Once discovered, these
+directions can be applied to different images to produce semantically
+consistent edits, making our method suitable for practical applications.
+Experimental results on various datasets demonstrate that our method can
+produce semantic edits that are more localized and have better fidelity
+compared to the state-of-the-art.
+
+
+ comment: Accepted at BMVC2024 +
+
+
+
+
+ + ♻ ☆ Structured Generative Models for Scene Understanding + + +
+ This position paper argues for the use of \emph{structured generative models}
+(SGMs) for the understanding of static scenes. This requires the reconstruction
+of a 3D scene from an input image (or a set of multi-view images), whereby the
+contents of the image(s) are causally explained in terms of models of
+instantiated objects, each with their own type, shape, appearance and pose,
+along with global variables like scene lighting and camera parameters. This
+approach also requires scene models which account for the co-occurrences and
+inter-relationships of objects in a scene. The SGM approach has the merits that
+it is compositional and generative, which lead to interpretability and
+editability.
+ To pursue the SGM agenda, we need models for objects and scenes, and
+approaches to carry out inference. We first review models for objects, which
+include ``things'' (object categories that have a well defined shape), and
+``stuff'' (categories which have amorphous spatial extent). We then move on to
+review \emph{scene models} which describe the inter-relationships of objects.
+Perhaps the most challenging problem for SGMs is \emph{inference} of the
+objects, lighting and camera parameters, and scene inter-relationships from
+input consisting of a single or multiple images. We conclude with a discussion
+of issues that need addressing to advance the SGM agenda.
+
+
+ comment: 32 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ MAPL: Memory Augmentation and Pseudo-Labeling for Semi-Supervised + Anomaly Detection + + +
+ Large amounts of unlabeled data and difficult-to-identify anomalies are urgent
+issues that need to be overcome in most industrial scenes. To address this
+issue, a new methodology for detecting surface defects in industrial settings
+is introduced, referred to as Memory Augmentation and Pseudo-Labeling (MAPL).
+The methodology first introduces an anomaly simulation strategy, which
+significantly improves the model's ability to recognize rare or unknown anomaly
+types by generating simulated anomaly samples. To cope with the lack of labels
+for the simulated anomalous samples, a pseudo-labeler method based on a
+one-classifier ensemble is employed in this study, which enhances the
+robustness of the model in the case of limited labeled data by automatically
+selecting key pseudo-labeling hyperparameters. Meanwhile, a memory-enhanced
+learning mechanism is introduced to effectively predict abnormal regions by
+analyzing the difference between the input samples and the normal samples in
+the memory pool. An end-to-end learning framework is employed by MAPL to
+identify the abnormal regions directly from the input data, which optimizes the
+efficiency and real-time performance of detection. By conducting extensive
+trials on the recently developed BHAD dataset (including MVTec AD [1], Visa
+[2], and MDPP [3]), MAPL achieves an average image-level AUROC score of 86.2%,
+demonstrating a 5.1% improvement compared to the original MemSeg [4] model. The
+source code is available at https://github.com/jzc777/MAPL.
+
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+ + ♻ ☆ UniUSNet: A Promptable Framework for Universal Ultrasound Disease + Prediction and Tissue Segmentation + + +
+ Ultrasound is widely used in clinical practice due to its affordability, +portability, and safety. However, current AI research often overlooks combined +disease prediction and tissue segmentation. We propose UniUSNet, a universal +framework for ultrasound image classification and segmentation. This model +handles various ultrasound types, anatomical positions, and input formats, +excelling in both segmentation and classification tasks. Trained on a +comprehensive dataset with over 9.7K annotations from 7 distinct anatomical +positions, our model matches state-of-the-art performance and surpasses +single-dataset and ablated models. Zero-shot and fine-tuning experiments show +strong generalization and adaptability with minimal fine-tuning. We plan to +expand our dataset and refine the prompting mechanism, with model weights and +code available at (https://github.com/Zehui-Lin/UniUSNet). + +
+
+ comment: Accepted to BIBM 2024 +
+
+
+
+
+ + ♻ ☆ GuidedNet: Semi-Supervised Multi-Organ Segmentation via Labeled Data + Guide Unlabeled Data ACM MM2024 + + +
+ Semi-supervised multi-organ medical image segmentation aids physicians in +improving disease diagnosis and treatment planning and reduces the time and +effort required for organ annotation. Existing state-of-the-art methods train +the labeled data with ground truths and train the unlabeled data with +pseudo-labels. However, the two training flows are separate, which does not +reflect the interrelationship between labeled and unlabeled data. To address +this issue, we propose a semi-supervised multi-organ segmentation method called +GuidedNet, which leverages the knowledge from labeled data to guide the +training of unlabeled data. The primary goals of this study are to improve the +quality of pseudo-labels for unlabeled data and to enhance the network's +learning capability for both small and complex organs. A key concept is that +voxel features from labeled and unlabeled data that are close to each other in +the feature space are more likely to belong to the same class. On this basis, a +3D Consistent Gaussian Mixture Model (3D-CGMM) is designed to leverage the +feature distributions from labeled data to rectify the generated +pseudo-labels. Furthermore, we introduce a Knowledge Transfer Cross Pseudo +Supervision (KT-CPS) strategy, which leverages the prior knowledge obtained +from the labeled data to guide the training of the unlabeled data, thereby +improving the segmentation accuracy for both small and complex organs. +Extensive experiments on two public datasets, FLARE22 and AMOS, demonstrated +that GuidedNet is capable of achieving state-of-the-art performance. The source +code and our proposed model are available at +https://github.com/kimjisoo12/GuidedNet. + +
+
+ comment: Accepted by ACM MM2024, 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ PD-APE: A Parallel Decoding Framework with Adaptive Position Encoding + for 3D Visual Grounding + + +
+ 3D visual grounding aims to identify objects in 3D point cloud scenes that +match specific natural language descriptions. This requires the model to not +only focus on the target object itself but also to consider the surrounding +environment to determine whether the descriptions are met. Most previous works +attempt to accomplish both tasks within the same module, which can easily lead +to a distraction of attention. To this end, we propose PD-APE, a dual-branch +decoding framework that separately decodes target object attributes and +surrounding layouts. Specifically, in the target object branch, the decoder +processes text tokens that describe features of the target object (e.g., +category and color), guiding the queries to pay attention to the target object +itself. In the surrounding branch, the queries align with other text tokens +that carry surrounding environment information, making the attention maps +accurately capture the layout described in the text. Benefiting from the +proposed dual-branch design, the queries are allowed to focus on points +relevant to each branch's specific objective. Moreover, we design an adaptive +position encoding method for each branch respectively. In the target object +branch, the position encoding relies on the relative positions between seed +points and predicted 3D boxes. In the surrounding branch, the attention map is +additionally guided by the confidence between visual and text features, +enabling the queries to focus on points that have valuable layout information. +Extensive experiments demonstrate that we surpass the state-of-the-art on two +widely adopted 3D visual grounding datasets, ScanRefer and Nr3D. + +
+
+
+
+
+ + ♻ ☆ DORec: Decomposed Object Reconstruction and Segmentation Utilizing 2D + Self-Supervised Features + + +
+ Recovering 3D geometry and textures of individual objects is crucial for many +robotics applications, such as manipulation, pose estimation, and autonomous +driving. However, decomposing a target object from a complex background is +challenging. Most existing approaches rely on costly manual labels to acquire +object instance perception. Recent advancements in 2D self-supervised learning +offer new prospects for identifying objects of interest, yet leveraging such +noisy 2D features for clean decomposition remains difficult. In this paper, we +propose a Decomposed Object Reconstruction (DORec) network based on neural +implicit representations. Our key idea is to use 2D self-supervised features to +create two levels of masks for supervision: a binary mask for foreground +regions and a K-cluster mask for semantically similar regions. These +complementary masks result in robust decomposition. Experimental results on +different datasets show DORec's superiority in segmenting and reconstructing +diverse foreground objects from varied backgrounds enabling downstream tasks +such as pose estimation. + +
+
+
+
+
+ + ♻ ☆ FRDiff : Feature Reuse for Universal Training-free Acceleration of + Diffusion Models ECCV 2024 + + +
+ The substantial computational costs of diffusion models, especially due to +the repeated denoising steps necessary for high-quality image generation, +present a major obstacle to their widespread adoption. While several studies +have attempted to address this issue by reducing the number of score function +evaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased +number of denoising iterations misses the opportunity to update fine details, +resulting in noticeable quality degradation. In our work, we introduce an +advanced acceleration technique that leverages the temporal redundancy inherent +in diffusion models. Reusing feature maps with high temporal similarity opens +up a new opportunity to save computation resources without compromising output +quality. To realize the practical benefits of this intuition, we conduct an +extensive analysis and propose a novel method, FRDiff. FRDiff is designed to +harness the advantages of both reduced NFE and feature reuse, achieving a +Pareto frontier that balances fidelity and latency trade-offs in various +generative tasks. + +
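+ A hedged toy sketch of the feature-reuse principle, not the official FRDiff implementation: a block's output is cached and returned again at the next denoising step whenever its input is nearly unchanged, so the expensive computation is skipped.
+
+ import torch
+ import torch.nn.functional as F
+
+ class ReusableBlock(torch.nn.Module):
+     """Toy wrapper that reuses the previous step's output when the input barely changes."""
+     def __init__(self, block, sim_threshold=0.995):
+         super().__init__()
+         self.block = block
+         self.sim_threshold = sim_threshold
+         self.cached_in = None
+         self.cached_out = None
+
+     def forward(self, x):
+         if self.cached_in is not None:
+             sim = F.cosine_similarity(x.flatten(), self.cached_in.flatten(), dim=0)
+             if sim > self.sim_threshold:
+                 return self.cached_out          # reuse: skip the expensive block
+         out = self.block(x)
+         self.cached_in, self.cached_out = x.detach(), out.detach()
+         return out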
+
+ comment: Accepted at ECCV 2024. Code : + https://github.com/ECoLab-POSTECH/FRDiff +
+
+
+
+
+ + ♻ ☆ Evolution-aware VAriance (EVA) Coreset Selection for Medical Image + Classification + + +
+ In the medical field, managing high-dimensional massive medical imaging data +and performing reliable medical analysis from it is a critical challenge, +especially in resource-limited environments such as remote medical facilities +and mobile devices. This necessitates effective dataset compression techniques +to reduce storage, transmission, and computational cost. However, existing +coreset selection methods are primarily designed for natural image datasets, +and exhibit doubtful effectiveness when applied to medical image datasets due +to challenges such as intra-class variation and inter-class similarity. In this +paper, we propose a novel coreset selection strategy termed as Evolution-aware +VAriance (EVA), which captures the evolutionary process of model training +through a dual-window approach and reflects the fluctuation of sample +importance more precisely through variance measurement. Extensive experiments +on medical image datasets demonstrate the effectiveness of our strategy over +previous SOTA methods, especially at high compression rates. EVA achieves +98.27% accuracy with only 10% training data, compared to 97.20% for the full +training set. None of the compared baseline methods can exceed Random at 5% +selection rate, while EVA outperforms Random by 5.61%, showcasing its potential +for efficient medical image analysis. + +
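+ A hedged sketch of variance-based coreset scoring under assumed window sizes and an assumed per-sample loss signal (the paper's exact dual-window recipe may differ): samples whose loss fluctuates most across training windows are kept.
+
+ import numpy as np
+
+ def eva_style_selection(loss_history, budget, early=(0, 5), late=(5, 10)):
+     """loss_history: (num_epochs, num_samples) per-sample losses recorded during training.
+     Scores each sample by the variance of its loss inside two training windows
+     (capturing how its importance fluctuates as the model evolves) and keeps the
+     `budget` highest-scoring samples as the coreset."""
+     early_var = loss_history[early[0]:early[1]].var(axis=0)
+     late_var = loss_history[late[0]:late[1]].var(axis=0)
+     score = early_var + late_var
+     return np.argsort(-score)[:budget]
+
+ # usage: idx = eva_style_selection(np.random.rand(10, 1000), budget=100)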
+
+ comment: Accepted by ACM Multimedia 2024 (oral), see: + https://openreview.net/forum?id=m1qrB9KSYD +
+
+
+
+
+ + ♻ ☆ Biometrics and Behavior Analysis for Detecting Distractions in + e-Learning + + +
+ In this article, we explore computer vision approaches to detect abnormal +head pose during e-learning sessions and we introduce a study on the effects of +mobile phone usage during these sessions. We utilize behavioral data collected +from 120 learners monitored while participating in MOOC learning sessions. +Our study focuses on the influence of phone-usage events on behavior and +physiological responses, specifically attention, heart rate, and meditation, +before, during, and after phone usage. Additionally, we propose an approach for +estimating head pose events using images taken by the webcam during the MOOC +learning sessions to detect phone-usage events. Our hypothesis suggests that +head posture undergoes significant changes when learners interact with a mobile +phone, contrasting with the typical behavior seen when learners face a computer +during e-learning sessions. We propose an approach designed to detect +deviations in head posture from the average observed during a learner's +session, operating as a semi-supervised method. This system flags events +indicating alterations in head posture for subsequent human review, selecting +mobile phone usage occurrences with a sensitivity over 90%. + +
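+ A hedged sketch of the deviation-flagging step under assumed pitch/yaw estimates and thresholds (illustrative only, not the authors' pipeline): frames whose head pose deviates strongly from the session average are grouped into candidate phone-usage events for human review.
+
+ import numpy as np
+
+ def flag_phone_usage_events(head_poses, z_threshold=2.5, min_consecutive=5):
+     """head_poses: (num_frames, 2) estimated pitch/yaw per webcam frame for one session.
+     Flags frames whose pose deviates strongly from the session average, then keeps
+     only runs of at least `min_consecutive` frames as candidate phone-usage events."""
+     mean, std = head_poses.mean(axis=0), head_poses.std(axis=0) + 1e-8
+     z = np.abs((head_poses - mean) / std).max(axis=1)
+     flagged = z > z_threshold
+
+     events, start = [], None
+     for i, f in enumerate(flagged):
+         if f and start is None:
+             start = i
+         elif not f and start is not None:
+             if i - start >= min_consecutive:
+                 events.append((start, i))
+             start = None
+     if start is not None and len(flagged) - start >= min_consecutive:
+         events.append((start, len(flagged)))
+     return events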
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ VAAD: Visual Attention Analysis Dashboard applied to e-Learning + + +
+ In this paper, we present an approach in the Multimodal Learning Analytics +field. Within this approach, we have developed a tool to visualize and analyze +eye movement data collected during learning sessions in online courses. The +tool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These +eye movement data have been gathered using an eye-tracker and subsequently +processed and visualized for interpretation. The purpose of the tool is to +conduct a descriptive analysis of the data by facilitating its visualization, +enabling the identification of differences and learning patterns among various +learner populations. Additionally, it integrates a predictive module capable of +anticipating learner activities during a learning session. Consequently, VAAD +holds the potential to offer valuable insights into online learning behaviors +from both descriptive and predictive perspectives. + +
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ Dual-scale Enhanced and Cross-generative Consistency Learning for + Semi-supervised Medical Image Segmentation + + +
+ Medical image segmentation plays a crucial role in computer-aided diagnosis. +However, existing methods heavily rely on fully supervised training, which +requires a large amount of labeled data with time-consuming pixel-wise +annotations. Moreover, accurately segmenting lesions poses challenges due to +variations in shape, size, and location. To address these issues, we propose a +novel Dual-scale Enhanced and Cross-generative consistency learning framework +for semi-supervised medical image Segmentation (DEC-Seg). First, we propose a +Cross-level Feature Aggregation (CFA) module that integrates cross-level +adjacent layers to enhance the feature representation ability across different +resolutions. To address scale variation, we present a scale-enhanced +consistency constraint, which ensures consistency in the segmentation maps +generated from the same input image at different scales. This constraint helps +handle variations in lesion sizes and improves the robustness of the model. +Furthermore, we propose a cross-generative consistency scheme, in which the +original and perturbed images can be reconstructed using cross-segmentation +maps. This consistency constraint allows us to mine effective feature +representations and boost the segmentation performance. To further exploit the +scale information, we propose a Dual-scale Complementary Fusion (DCF) module +that integrates features from two scale-specific decoders operating at +different scales to help produce more accurate segmentation maps. Extensive +experimental results on multiple medical segmentation tasks (polyp, skin +lesion, and brain glioma) demonstrate the effectiveness of our DEC-Seg against +other state-of-the-art semi-supervised segmentation approaches. The +implementation code will be released at https://github.com/taozh2017/DECSeg. + +
+
+ comment: 12 pages 10 figures +
+
+
+
+
+ + ♻ ☆ Pedestrian Attribute Recognition via CLIP based Prompt Vision-Language + Fusion + + +
+ Existing pedestrian attribute recognition (PAR) algorithms adopt pre-trained +CNN (e.g., ResNet) as their backbone network for visual feature learning, which +might obtain sub-optimal results due to the insufficient employment of the +relations between pedestrian images and attribute labels. In this paper, we +formulate PAR as a vision-language fusion problem and fully exploit the +relations between pedestrian images and attribute labels. Specifically, the +attribute phrases are first expanded into sentences, and then the pre-trained +vision-language model CLIP is adopted as our backbone for feature embedding of +visual images and attribute descriptions. The contrastive learning objective +connects the vision and language modalities well in the CLIP-based feature +space, and the Transformer layers used in CLIP can capture the long-range +relations between pixels. Then, a multi-modal Transformer is adopted to fuse +the dual features effectively and feed-forward network is used to predict +attributes. To optimize our network efficiently, we propose the region-aware +prompt tuning technique to adjust very few parameters (i.e., only the prompt +vectors and classification heads) and fix both the pre-trained VL model and +multi-modal Transformer. Our proposed PAR algorithm only adjusts 0.75% +learnable parameters compared with the fine-tuning strategy. It also achieves +new state-of-the-art performance on both standard and zero-shot settings for +PAR, including RAPv1, RAPv2, WIDER, PA100K, and PETA-ZS, RAP-ZS datasets. The +source code and pre-trained models will be released on +https://github.com/Event-AHU/OpenPAR. + +
+
+ comment: Accepted by IEEE TCSVT 2024, Camera Ready Version +
+
+
+
+
+ + ♻ ☆ Disease Classification and Impact of Pretrained Deep Convolution Neural + Networks on Diverse Medical Imaging Datasets across Imaging Modalities + + +
+ Imaging techniques such as Chest X-rays, whole slide images, and optical +coherence tomography serve as initial screening and detection tools for a wide +variety of pulmonary and ophthalmic conditions, respectively. This paper +investigates the intricacies of using pretrained deep convolutional neural +networks with transfer learning across diverse medical imaging datasets with +varying modalities for binary and multiclass classification. We conducted a +comprehensive performance analysis with ten network architectures and model +families, each with pretraining and random initialization. Our findings show +that the use of pretrained models as fixed feature extractors yields poor +performance irrespective of the dataset. In contrast, histopathology microscopy +whole slide images achieve better performance. It is also found that deeper and +more complex architectures did not necessarily result in the best performance. +This observation implies that improvements on ImageNet do not carry over in +parallel to medical imaging tasks. Within a medical domain, the performance of +the network architectures varies within model families with shifts in datasets. +This indicates that the performance of models within a specific modality may +not be conclusive for another modality within the same domain. This study +provides a deeper understanding of the applications of deep learning techniques +in medical imaging and highlights the impact of pretrained networks across +different medical imaging datasets under five different experimental settings. + +
+
+ comment: 15 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Instant Adversarial Purification with Adversarial Consistency + Distillation + + +
+ Neural networks, despite their remarkable performance in widespread +applications, including image classification, are also known to be vulnerable +to subtle adversarial noise. Although some diffusion-based purification methods +have been proposed, for example, DiffPure, those methods are time-consuming. In +this paper, we propose One Step Control Purification (OSCP), a diffusion-based +purification model that can purify an adversarial image in a single Neural +Function Evaluation (NFE) of the diffusion model. We use a Latent Consistency +Model (LCM) and ControlNet for our one-step purification. OSCP is +computationally friendly and time efficient compared to other diffusion-based +purification methods; we achieve a defense success rate of 74.19% on ImageNet, +requiring only 0.1s per purification. Moreover, there is a fundamental +incongruence between consistency distillation and adversarial perturbation. To +address this ontological dissonance, we propose Gaussian Adversarial Noise +Distillation (GAND), a novel consistency distillation framework that +facilitates a more nuanced reconciliation of the latent space dynamics, +effectively bridging the natural and adversarial manifolds. Our experiments +show that GAND does not require full fine-tuning (FFT); parameter-efficient +fine-tuning (PEFT), e.g., LoRA, is sufficient. + +
+
+
+
+
+ + ♻ ☆ Show Me the World in My Language: Establishing the First Baseline for + Scene-Text to Scene-Text Translation ICPR 2024 + + +
+ In this work, we study the task of ``visually'' translating scene text from a +source language (e.g., Hindi) to a target language (e.g., English). Visual +translation involves not just the recognition and translation of scene text but +also the generation of the translated image that preserves visual features of +the source scene text, such as font, size, and background. There are several +challenges associated with this task, such as translation with limited context, +deciding between translation and transliteration, accommodating varying text +lengths within fixed spatial boundaries, and preserving the font and background +styles of the source scene text in the target language. To address this +problem, we make the following contributions: (i) We study visual translation +as a standalone problem for the first time in the literature. (ii) We present a +cascaded framework for visual translation that combines state-of-the-art +modules for scene text recognition, machine translation, and scene text +synthesis as a baseline for the task. (iii) We propose a set of task-specific +design enhancements to design a variant of the baseline to obtain performance +improvements. (iv) Currently, the existing related literature lacks any +comprehensive performance evaluation for this novel task. To fill this gap, we +introduce several automatic and user-assisted evaluation metrics designed +explicitly for evaluating visual translation. Further, we evaluate presented +baselines for translating scene text between Hindi and English. Our experiments +demonstrate that although we can effectively perform visual translation over a +large collection of scene text images, the presented baseline only partially +addresses challenges posed by visual translation tasks. We firmly believe that +this new task and the limitations of existing models, as reported in this +paper, should encourage further research in visual translation. + +
+
+ comment: Accepted at ICPR 2024, Project Website: + https://vl2g.github.io/projects/visTrans/ +
+
+
+
+
+ + ♻ ☆ An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference + Acceleration for Large Vision-Language Models ECCV 2024 + + +
+ In this study, we identify the inefficient attention phenomenon in Large +Vision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5, +QwenVL-Chat and Video-LLaVA. We find that the attention computation over +visual tokens is extremely inefficient in the deep layers of popular LVLMs, +suggesting a need for a sparser approach compared to textual data handling. To +this end, we introduce FastV, a versatile plug-and-play method designed to +optimize computational efficiency by learning adaptive attention patterns in +early layers and pruning visual tokens in subsequent ones. Our evaluations +demonstrate FastV's ability to dramatically reduce computational costs (e.g., a +45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a +wide range of image and video understanding tasks. The computational efficiency +and performance trade-off of FastV is highly customizable and +Pareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve +a lower budget than that of a 7B-parameter model, while still maintaining +superior performance. We believe FastV has practical value for the deployment +of LVLMs on edge devices and in commercial models. Code is released at +https://github.com/pkunlp-icler/FastV. + +
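+ A simplified, hedged sketch of attention-based visual-token pruning (the released FastV code should be treated as authoritative; the keep ratio, layer choice, and tensor layout here are placeholders): visual tokens that receive little attention in an early layer are dropped for the remaining layers.
+
+ import torch
+
+ def prune_visual_tokens(hidden, attn, visual_slice, keep_ratio=0.5):
+     """hidden: (batch, seq, dim) hidden states after an early layer.
+     attn: (batch, heads, seq, seq) attention weights from that layer.
+     visual_slice: slice covering the visual tokens in the sequence.
+     Keeps only the visual tokens that receive the most attention."""
+     v_start, v_stop = visual_slice.start, visual_slice.stop
+     # Average attention received by each visual token (over heads and queries)
+     received = attn.mean(dim=1).mean(dim=1)[:, v_start:v_stop]   # (batch, num_visual)
+     k = max(1, int(received.shape[1] * keep_ratio))
+     keep = received.topk(k, dim=1).indices.sort(dim=1).values + v_start
+
+     batch_idx = torch.arange(hidden.shape[0]).unsqueeze(1)
+     kept_visual = hidden[batch_idx, keep]                        # (batch, k, dim)
+     return torch.cat([hidden[:, :v_start], kept_visual, hidden[:, v_stop:]], dim=1)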
+
+ comment: Accepted to ECCV 2024 (Oral), code is released at + https://github.com/pkunlp-icler/FastV, +
+
+
+
+
+ + ♻ ☆ Video Diffusion Models are Strong Video Inpainter + + +
+ Propagation-based video inpainting using optical flow at the pixel or feature +level has recently garnered significant attention. However, it has limitations +such as the inaccuracy of optical flow prediction and the propagation of noise +over time. These issues result in non-uniform noise and time consistency +problems throughout the video, which are particularly pronounced when the +removed area is large and involves substantial movement. To address these +issues, we propose a novel First Frame Filling Video Diffusion Inpainting model +(FFF-VDI). We design FFF-VDI inspired by the capabilities of pre-trained +image-to-video diffusion models that can transform the first frame image into a +highly natural video. To apply this to the video inpainting task, we propagate +the noise latent information of future frames to fill the masked areas of the +first frame's noise latent code. Next, we fine-tune the pre-trained +image-to-video diffusion model to generate the inpainted video. The proposed +model addresses the limitations of existing methods that rely on optical flow +quality, producing much more natural and temporally consistent videos. This +proposed approach is the first to effectively integrate image-to-video +diffusion models into video inpainting tasks. Through various comparative +experiments, we demonstrate that the proposed model can robustly handle diverse +inpainting types with high quality. + +
+
+
+
+
+ + ♻ ☆ A Grey-box Attack against Latent Diffusion Model-based Image Editing by + Posterior Collapse + + +
+ Recent advancements in generative AI, particularly Latent Diffusion Models +(LDMs), have revolutionized image synthesis and manipulation. However, these +generative techniques raise concerns about data misappropriation and +intellectual property infringement. Adversarial attacks on machine learning +models have been extensively studied, and a well-established body of research +has extended these techniques as a benign metric to prevent the underlying +misuse of generative AI. Current approaches to safeguarding images from +manipulation by LDMs are limited by their reliance on model-specific knowledge +and their inability to significantly degrade the semantic quality of generated +images. In response to these shortcomings, we propose the Posterior Collapse +Attack (PCA) based on the observation that VAEs suffer from posterior collapse +during training. Our method minimizes dependence on the white-box information +of target models to get rid of the implicit reliance on model-specific +knowledge. By accessing only a small subset of LDM parameters, specifically the +VAE encoder, our method causes a substantial semantic collapse in generation +quality, particularly in perceptual consistency, and demonstrates strong +transferability across various model architectures. Experimental results show +that PCA achieves superior perturbation effects on image generation of LDMs +with lower runtime and VRAM. Our method outperforms existing techniques, +offering a more robust and generalizable solution that is helpful in +alleviating the socio-technical challenges posed by the rapidly evolving +landscape of generative AI. + +
+
+ comment: 21 pages, 7 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Event Voxel Set Transformer for Spatiotemporal Representation Learning + on Event Streams + + +
+ Event cameras are neuromorphic vision sensors that record a scene as sparse +and asynchronous event streams. Most event-based methods project events into +dense frames and process them using conventional vision models, resulting in +high computational complexity. A recent trend is to develop point-based +networks that achieve efficient event processing by learning sparse +representations. However, existing works may lack robust local information +aggregators and effective feature interaction operations, thus limiting their +modeling capabilities. To this end, we propose an attention-aware model named +Event Voxel Set Transformer (EVSTr) for efficient spatiotemporal representation +learning on event streams. It first converts the event stream into voxel sets +and then hierarchically aggregates voxel features to obtain robust +representations. The core of EVSTr is an event voxel transformer encoder that +consists of two well-designed components, including the Multi-Scale Neighbor +Embedding Layer (MNEL) for local information aggregation and the Voxel +Self-Attention Layer (VSAL) for global feature interaction. Enabling the +network to incorporate a long-range temporal structure, we introduce a segment +modeling strategy (S$^{2}$TM) to learn motion patterns from a sequence of +segmented voxel sets. The proposed model is evaluated on two recognition tasks, +including object classification and action recognition. To provide a convincing +model evaluation, we present a new event-based action recognition dataset +(NeuroHAR) recorded in challenging scenarios. Comprehensive experiments show +that EVSTr achieves state-of-the-art performance while maintaining low model +complexity. + +
+
+ comment: Accepted by IEEE Transactions on Circuits and Systems for Video + Technology (TCSVT) +
+
+
+
+
+ + ♻ ☆ MM-Soc: Benchmarking Multimodal Large Language Models in Social Media + Platforms ACL 2024 + + +
+ Social media platforms are hubs for multimodal information exchange, +encompassing text, images, and videos, making it challenging for machines to +comprehend the information or emotions associated with interactions in online +spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising +solution to these challenges, yet they struggle to accurately interpret human +emotions and complex content such as misinformation. This paper introduces +MM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of +multimodal social media content. MM-Soc compiles prominent multimodal datasets +and incorporates a novel large-scale YouTube tagging dataset, targeting a range +of tasks from misinformation detection, hate speech detection, and social +context generation. Through our exhaustive evaluation on ten size-variants of +four open-source MLLMs, we have identified significant performance disparities, +highlighting the need for advancements in models' social understanding +capabilities. Our analysis reveals that, in a zero-shot setting, various types +of MLLMs generally exhibit difficulties in handling social media tasks. +However, MLLMs demonstrate performance improvements post fine-tuning, +suggesting potential pathways for improvement. Our code and data are available +at https://github.com/claws-lab/MMSoc.git. + +
+
+ comment: In Proceedings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Adapting Segment Anything Model to Multi-modal Salient Object Detection + with Semantic Feature Fusion Guidance + + +
+ Although most existing multi-modal salient object detection (SOD) methods +demonstrate effectiveness through training models from scratch, the limited +multi-modal data hinders these methods from reaching optimality. In this paper, +we propose a novel framework to explore and exploit the powerful feature +representation and zero-shot generalization ability of the pre-trained Segment +Anything Model (SAM) for multi-modal SOD. Despite serving as a recent +fundamental vision model, driving the class-agnostic SAM to comprehend and +detect salient objects accurately is non-trivial, especially in challenging +scenes. To this end, we develop SAM with semantic feature fusion guidance +(Sammese), which incorporates multi-modal saliency-specific knowledge into SAM +to adapt SAM to multi-modal SOD tasks. However, it is difficult for SAM trained +on single-modal data to directly mine the complementary benefits of multi-modal +inputs and comprehensively utilize them to achieve accurate saliency +prediction. To address these issues, we first design a multi-modal +complementary fusion module to extract robust multi-modal semantic features by +integrating information from visible and thermal or depth image pairs. Then, we +feed the extracted multi-modal semantic features into both the SAM image +encoder and mask decoder for fine-tuning and prompting, respectively. +Specifically, in the image encoder, a multi-modal adapter is proposed to adapt +the single-modal SAM to multi-modal information. In the mask decoder, a +semantic-geometric prompt generation strategy is proposed to produce +corresponding embeddings with various saliency cues. Extensive experiments on +both RGB-D and RGB-T SOD benchmarks show the effectiveness of the proposed +framework. The code will be available at \url{https://github.com/Angknpng/Sammese}. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ S-NeRF++: Autonomous Driving Simulation via Neural Reconstruction and + Generation + + +
+ Autonomous driving simulation systems play a crucial role in enhancing +self-driving data and simulating complex and rare traffic scenarios, ensuring +navigation safety. However, traditional simulation systems, which often heavily +rely on manual modeling and 2D image editing, struggle to scale to extensive +scenes and to generate realistic simulation data. In this study, we present +S-NeRF++, an innovative autonomous driving simulation system based on neural +reconstruction. Trained on widely-used self-driving datasets such as nuScenes +and Waymo, S-NeRF++ can generate a large number of realistic street scenes and +foreground objects with high rendering quality as well as offering considerable +flexibility in manipulation and simulation. Specifically, S-NeRF++ is an +enhanced neural radiance field for synthesizing large-scale scenes and moving +vehicles, with improved scene parameterization and camera pose learning. The +system effectively utilizes noisy and sparse LiDAR data to refine training and +address depth outliers, ensuring high-quality reconstruction and novel-view +rendering. It also provides a diverse foreground asset bank by reconstructing +and generating different foreground vehicles to support comprehensive scenario +creation. Moreover, we have developed an advanced foreground-background fusion +pipeline that skillfully integrates illumination and shadow effects, further +enhancing the realism of our simulations. With the high-quality simulated data +provided by our S-NeRF++, we find that perception methods enjoy performance +boosts on several autonomous driving downstream tasks, further demonstrating +our proposed simulator's effectiveness. + +
+
+
+
+
+ + ♻ ☆ NutritionVerse: Empirical Study of Various Dietary Intake Estimation + Approaches + + +
+ Accurate dietary intake estimation is critical for informing policies and +programs to support healthy eating, as malnutrition has been directly linked to +decreased quality of life. However self-reporting methods such as food diaries +suffer from substantial bias. Other conventional dietary assessment techniques +and emerging alternative approaches such as mobile applications incur high time +costs and may necessitate trained personnel. Recent work has focused on using +computer vision and machine learning to automatically estimate dietary intake +from food images, but the lack of comprehensive datasets with diverse +viewpoints, modalities and food annotations hinders the accuracy and realism of +such methods. To address this limitation, we introduce NutritionVerse-Synth, +the first large-scale dataset of 84,984 photorealistic synthetic 2D food images +with associated dietary information and multimodal annotations (including depth +images, instance masks, and semantic masks). Additionally, we collect a real +image dataset, NutritionVerse-Real, containing 889 images of 251 dishes to +evaluate realism. Leveraging these novel datasets, we develop and benchmark +NutritionVerse, an empirical study of various dietary intake estimation +approaches, including indirect segmentation-based and direct prediction +networks. We further fine-tune models pretrained on synthetic data with real +images to provide insights into the fusion of synthetic and real data. Finally, +we release both datasets (NutritionVerse-Synth, NutritionVerse-Real) on +https://www.kaggle.com/nutritionverse/datasets as part of an open initiative to +accelerate machine learning for dietary sensing. + +
+
+ comment: Corrections made to Tables 6, 7, and 8, and corrections made to + Experiments Part C. Additional clarification made in Section 4 +
+
+
+
+
+ + ♻ ☆ DarkGS: Learning Neural Illumination and 3D Gaussians Relighting for + Robotic Exploration in the Dark + + +
+ Humans have the remarkable ability to construct consistent mental models of +an environment, even under limited or varying levels of illumination. We wish +to endow robots with this same capability. In this paper, we tackle the +challenge of constructing a photorealistic scene representation under poorly +illuminated conditions and with a moving light source. We approach the task of +modeling illumination as a learning problem, and utilize the developed +illumination model to aid in scene reconstruction. We introduce an innovative +framework that uses a data-driven approach, Neural Light Simulators (NeLiS), to +model and calibrate the camera-light system. Furthermore, we present DarkGS, a +method that applies NeLiS to create a relightable 3D Gaussian scene model +capable of real-time, photorealistic rendering from novel viewpoints. We show +the applicability and robustness of our proposed simulator and system in a +variety of real-world environments. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Towards Non-invasive and Personalized Management of Breast Cancer + Patients from Multiparametric MRI via A Large Mixture-of-Modality-Experts + Model + + +
+ Breast magnetic resonance imaging (MRI) is the imaging technique with the +highest sensitivity for detecting breast cancer and is routinely used for women +at high risk. Despite the comprehensive multiparametric protocol of breast MRI, +existing artificial intelligence-based studies predominantly rely on single +sequences and have limited validation. Here we report a large +mixture-of-modality-experts model (MOME) that integrates multiparametric MRI +information within a unified structure, offering a noninvasive method for +personalized breast cancer management. We have curated the largest +multiparametric breast MRI dataset, involving 5,205 patients from three +hospitals in the north, southeast, and southwest of China, for the development +and extensive evaluation of our model. MOME demonstrated accurate and robust +identification of breast cancer. It achieved comparable performance for +malignancy recognition to that of four senior radiologists and significantly +outperformed a junior radiologist, with 0.913 AUROC, 0.948 AUPRC, 0.905 F1 +score, and 0.723 MCC. Our findings suggest that MOME could reduce the need for +biopsies in BI-RADS 4 patients with a ratio of 7.3%, classify triple-negative +breast cancer with an AUROC of 0.709, and predict pathological complete +response to neoadjuvant chemotherapy with an AUROC of 0.694. The model further +supports scalable and interpretable inference, adapting to missing modalities +and providing decision explanations by highlighting lesions and measuring +modality contributions. MOME exemplifies a discriminative, robust, scalable, +and interpretable multimodal model, paving the way for noninvasive, +personalized management of breast cancer patients based on multiparametric +breast imaging data. + +
+
+ comment: 27 pages, 8 figures, 10 tables +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ Sync from the Sea: Retrieving Alignable Videos from Large-Scale Datasets ECCV 2024 + + +
+ Temporal video alignment aims to synchronize the key events like object +interactions or action phase transitions in two videos. Such methods could +benefit various video editing, processing, and understanding tasks. However, +existing approaches operate under the restrictive assumption that a suitable +video pair for alignment is given, significantly limiting their broader +applicability. To address this, we re-pose temporal alignment as a search +problem and introduce the task of Alignable Video Retrieval (AVR). Given a +query video, our approach can identify well-alignable videos from a large +collection of clips and temporally synchronize them to the query. To achieve +this, we make three key contributions: 1) we introduce DRAQ, a video +alignability indicator to identify and re-rank the best alignable video from a +set of candidates; 2) we propose an effective and generalizable frame-level +video feature design to improve the alignment performance of several +off-the-shelf feature representations, and 3) we propose a novel benchmark and +evaluation protocol for AVR using cycle-consistency metrics. Our experiments on +3 datasets, including large-scale Kinetics700, demonstrate the effectiveness of +our approach in identifying alignable video pairs from diverse datasets. +Project Page: https://daveishan.github.io/avr-webpage/. + +
+
+ comment: ECCV 2024 Oral +
+
+
+
+
+ + ☆ Know When to Fuse: Investigating Non-English Hybrid Retrieval in the + Legal Domain + + +
+ Hybrid search has emerged as an effective strategy to offset the limitations +of different matching paradigms, especially in out-of-domain contexts where +notable improvements in retrieval quality have been observed. However, existing +research predominantly focuses on a limited set of retrieval methods, evaluated +in pairs on domain-general datasets exclusively in English. In this work, we +study the efficacy of hybrid search across a variety of prominent retrieval +models within the unexplored field of law in the French language, assessing +both zero-shot and in-domain scenarios. Our findings reveal that in a zero-shot +context, fusing different domain-general models consistently enhances +performance compared to using a standalone model, regardless of the fusion +method. Surprisingly, when models are trained in-domain, we find that fusion +generally diminishes performance relative to using the best single system, +unless fusing scores with carefully tuned weights. These novel insights, among +others, expand the applicability of prior findings across a new field and +language, and contribute to a deeper understanding of hybrid search in +non-English specialized domains. + +
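+ For concreteness, a minimal hedged sketch of weighted score fusion of the kind studied here, assuming min-max normalization and a tunable interpolation weight (the paper evaluates several fusion methods; this is only one of them):
+
+ def fuse_scores(lexical, dense, alpha=0.5):
+     """lexical, dense: dicts mapping doc_id -> score from two retrievers.
+     Min-max normalizes each score set, then linearly interpolates with weight alpha."""
+     def normalize(scores):
+         lo, hi = min(scores.values()), max(scores.values())
+         span = (hi - lo) or 1.0
+         return {d: (s - lo) / span for d, s in scores.items()}
+
+     lex, den = normalize(lexical), normalize(dense)
+     docs = set(lex) | set(den)
+     fused = {d: alpha * lex.get(d, 0.0) + (1 - alpha) * den.get(d, 0.0) for d in docs}
+     return sorted(fused.items(), key=lambda kv: -kv[1])
+
+ # usage: fuse_scores({"d1": 12.3, "d2": 7.1}, {"d1": 0.52, "d3": 0.71}, alpha=0.3)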
+
+ comment: Under review +
+
+
+
+
+ + ☆ SSD4Rec: A Structured State Space Duality Model for Efficient Sequential + Recommendation + + +
+ Sequential recommendation methods are crucial in modern recommender systems +for their remarkable capability to understand a user's changing interests based +on past interactions. However, a significant challenge faced by current methods +(e.g., RNN- or Transformer-based models) is to effectively and efficiently +capture users' preferences by modeling long behavior sequences, which impedes +their various applications like short video platforms where user interactions +are numerous. Recently, an emerging architecture named Mamba, built on state +space models (SSM) with efficient hardware-aware designs, has showcased the +tremendous potential for sequence modeling, presenting a compelling avenue for +addressing the challenge effectively. Inspired by this, we propose a novel +generic and efficient sequential recommendation backbone, SSD4Rec, which +explores the seamless adaptation of Mamba for sequential recommendations. +Specifically, SSD4Rec marks the variable- and long-length item sequences with +sequence registers and processes the item representations with bidirectional +Structured State Space Duality (SSD) blocks. This not only allows for +hardware-aware matrix multiplication but also empowers outstanding capabilities +in variable-length and long-range sequence modeling. Extensive evaluations on +four benchmark datasets demonstrate that the proposed model achieves +state-of-the-art performance while maintaining near-linear scalability with +user sequence length. Our code is publicly available at +https://github.com/ZhangYifeng1995/SSD4Rec. + +
+
+
+
+
+ + ☆ Real World Conversational Entity Linking Requires More Than Zeroshots + + +
+ Entity linking (EL) in conversations faces notable challenges in practical +applications, primarily due to the scarcity of entity-annotated conversational +datasets and sparse knowledge bases (KB) containing domain-specific, long-tail +entities. We designed targeted evaluation scenarios to measure the efficacy of +EL models under resource constraints. Our evaluation employs two KBs: Fandom, +exemplifying real-world EL complexities, and the widely used Wikipedia. First, +we assess EL models' ability to generalize to a new unfamiliar KB using Fandom +and a novel zero-shot conversational entity linking dataset that we curated +based on Reddit discussions on Fandom entities. We then evaluate the +adaptability of EL models to conversational settings without prior training. +Our results indicate that current zero-shot EL models falter when introduced to +new, domain-specific KBs without prior training, significantly dropping in +performance. Our findings reveal that previous evaluation approaches fall short +of capturing real-world complexities for zero-shot EL, highlighting the +necessity for new approaches to design and assess conversational EL models to +adapt to limited resources. The evaluation setup and the dataset proposed in +this research are made publicly available. + +
+
+
+
+
+ + ☆ LLM-PQA: LLM-enhanced Prediction Query Answering CIKM 2024 + + +
+ The advent of Large Language Models (LLMs) provides an opportunity to change +the way queries are processed, moving beyond the constraints of conventional +SQL-based database systems. However, using an LLM to answer a prediction query +is still challenging, since an external ML model has to be employed and +inference has to be performed in order to provide an answer. This paper +introduces LLM-PQA, a novel tool that addresses prediction queries formulated +in natural language. LLM-PQA is the first to combine the capabilities of LLMs +and a retrieval-augmented mechanism to serve prediction queries by integrating +data lakes and model zoos. This integration provides users with access to a +vast spectrum of heterogeneous data and diverse ML models, facilitating dynamic +prediction query answering. In addition, LLM-PQA can dynamically train models +on demand, based on specific query requirements, ensuring reliable and relevant +results even when no pre-trained model is available in the model zoo for the +task. + +
+
+ comment: This paper is accepted as a demo at CIKM 2024 +
+
+
+
+
+ + ☆ Evidential Transformers for Improved Image Retrieval ECCV 2024 + + +
+ We introduce the Evidential Transformer, an uncertainty-driven transformer +model for improved and robust image retrieval. In this paper, we make several +contributions to content-based image retrieval (CBIR). We incorporate +probabilistic methods into image retrieval, achieving robust and reliable +results, with evidential classification surpassing traditional training based +on multiclass classification as a baseline for deep metric learning. +Furthermore, we improve the state-of-the-art retrieval results on several +datasets by leveraging the Global Context Vision Transformer (GC ViT) +architecture. Our experimental results consistently demonstrate the reliability +of our approach, setting a new benchmark in CBIR in all test settings on the +Stanford Online Products (SOP) and CUB-200-2011 datasets. + +
+
+ comment: 6 pages, 6 figures, To be presented at the 3rd Workshop on + Uncertainty Quantification for Computer Vision, at the ECCV 2024 conference + in Milan, Italy +
+
+
+
+
+ + ☆ Improved Diversity-Promoting Collaborative Metric Learning for + Recommendation + + +
+ Collaborative Metric Learning (CML) has recently emerged as a popular method +in recommendation systems (RS), closing the gap between metric learning and +collaborative filtering. Following the convention of RS, existing practices +exploit unique user representation in their model design. This paper focuses on +a challenging scenario where a user has multiple categories of interests. Under +this setting, the unique user representation might induce preference bias, +especially when the item category distribution is imbalanced. To address this +issue, we propose a novel method called \textit{Diversity-Promoting +Collaborative Metric Learning} (DPCML), with the hope of considering the +commonly ignored minority interests of the user. The key idea behind DPCML is to +introduce a set of multiple representations for each user in the system where +users' preference toward an item is aggregated by taking the minimum item-user +distance among their embedding set. Specifically, we instantiate two effective +assignment strategies to explore a proper quantity of vectors for each user. +Meanwhile, a \textit{Diversity Control Regularization Scheme} (DCRS) is +developed to accommodate the multi-vector representation strategy better. +Theoretically, we show that DPCML could induce a smaller generalization error +than traditional CML. Furthermore, we notice that CML-based approaches usually +require \textit{negative sampling} to reduce the heavy computational burden +caused by the pairwise objective therein. In this paper, we reveal the +fundamental limitation of the widely adopted hard-aware sampling from the +One-Way Partial AUC (OPAUC) perspective and then develop an effective sampling +alternative for the CML-based paradigm. Finally, comprehensive experiments over +a range of benchmark datasets speak to the efficacy of DPCML. Code is +available at \url{https://github.com/statusrank/LibCML}. + +
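+ A hedged sketch of the multi-vector user representation with min-distance aggregation (the embedding dimension and number of vectors per user are illustrative, and the paper's training losses, regularizer, and sampling scheme are omitted):
+
+ import torch
+
+ class MultiVectorCML(torch.nn.Module):
+     """Each user owns C embedding vectors; preference for an item is the minimum
+     user-vector-to-item distance, so minority interests keep their own vector."""
+     def __init__(self, num_users, num_items, dim=64, vectors_per_user=4):
+         super().__init__()
+         self.user_emb = torch.nn.Embedding(num_users * vectors_per_user, dim)
+         self.item_emb = torch.nn.Embedding(num_items, dim)
+         self.C = vectors_per_user
+
+     def distance(self, users, items):
+         dim = self.item_emb.embedding_dim
+         u = self.user_emb.weight.view(-1, self.C, dim)[users]   # (batch, C, dim)
+         i = self.item_emb(items).unsqueeze(1)                   # (batch, 1, dim)
+         d = torch.norm(u - i, dim=-1)                           # (batch, C)
+         return d.min(dim=-1).values                             # min over the user's vectors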
+
+ comment: arXiv admin note: text overlap with arXiv:2209.15292 +
+
+
+
+
+ + ☆ Towards Investigating Biases in Spoken Conversational Search + + +
+ Voice-based systems like Amazon Alexa, Google Assistant, and Apple Siri, +along with the growing popularity of OpenAI's ChatGPT and Microsoft's Copilot, +serve diverse populations, including visually impaired and low-literacy +communities. This reflects a shift in user expectations from traditional search +to more interactive question-answering models. However, presenting information +effectively in voice-only channels remains challenging due to their linear +nature. This limitation can impact the presentation of complex queries +involving controversial topics with multiple perspectives. Failing to present +diverse viewpoints may perpetuate or introduce biases and affect user +attitudes. Balancing information load and addressing biases is crucial in +designing a fair and effective voice-based system. To address this, we (i) +review how biases and user attitude changes have been studied in screen-based +web search, (ii) address challenges in studying these changes in voice-based +settings like SCS, (iii) outline research questions, and (iv) propose an +experimental setup with variables, data, and instruments to explore biases in a +voice-based setting like Spoken Conversational Search. + +
+
+ comment: Accepted Late-Breaking Results at ACM ICMI Companion 2024 +
+
+
+
+
+ + ♻ ☆ Manipulating Large Language Models to Increase Product Visibility + + +
+ Large language models (LLMs) are increasingly being integrated into search +engines to provide natural language responses tailored to user queries. +Customers and end-users are also becoming more dependent on these models for +quick and easy purchase decisions. In this work, we investigate whether +recommendations from LLMs can be manipulated to enhance a product's visibility. +We demonstrate that adding a strategic text sequence (STS) -- a carefully +crafted message -- to a product's information page can significantly increase +its likelihood of being listed as the LLM's top recommendation. To understand +the impact of STS, we use a catalog of fictitious coffee machines and analyze +its effect on two target products: one that seldom appears in the LLM's +recommendations and another that usually ranks second. We observe that the +strategic text sequence significantly enhances the visibility of both products +by increasing their chances of appearing as the top recommendation. This +ability to manipulate LLM-generated search responses provides vendors with a +considerable competitive advantage and has the potential to disrupt fair market +competition. Just as search engine optimization (SEO) revolutionized how +webpages are customized to rank higher in search engine results, influencing +LLM recommendations could profoundly impact content optimization for AI-driven +search services. Code for our experiments is available at +https://github.com/aounon/llm-rank-optimizer. + +
+
+
+
+
+ + ♻ ☆ A multi-language toolkit for supporting automated checking of research + outputs + + +
+ This article presents the automatic checking of research outputs package +acro, which assists researchers and data governance teams by automatically +applying best-practice principles-based statistical disclosure control (SDC) +techniques on-the-fly as researchers conduct their analyses. acro distinguishes +between: research output that is safe to publish; output that requires further +analysis; and output that cannot be published because it creates substantial +risk of disclosing private data. This is achieved through the use of a +lightweight Python wrapper that sits over well-known analysis tools that +produce outputs such as tables, plots, and statistical models. This adds +functionality to (i) identify potentially disclosive outputs against a range of +commonly used disclosure tests; (ii) apply disclosure mitigation strategies +where required; (iii) report reasons for applying SDC; and (iv) produce simple +summary documents trusted research environment staff can use to streamline +their workflow. The major analytical programming languages used by researchers +are supported: Python, R, and Stata. The acro code and documentation are +available under an MIT license at https://github.com/AI-SDC/ACRO + +
+
+
+
+
+ + ♻ ☆ VM-Rec: A Variational Mapping Approach for Cold-start User + Recommendation + + +
+ The cold-start problem is a common challenge for most recommender systems. +The practical application of most cold-start methods is hindered by the +deficiency in auxiliary content information for users. Moreover, most methods +necessitate simultaneous updates to the extensive parameters of recommender +models, leading to significant training costs, particularly in large-scale +industrial scenarios. We observe that the model can generate expressive +embeddings for warm users with relatively more interactions. Initially, these +users were cold-start users, and after transitioning to warm users, they +exhibit clustering patterns in their embeddings with consistent initial +interactions. Based on this motivation, we propose a Variational Mapping +approach for cold-start user Recommendation (VM-Rec), mapping from few initial +interactions to expressive embeddings for cold-start users. Specifically, we +encode the initial interactions into a latent representation, where each +dimension disentangledly signifies the degree of association with each warm +user. Subsequently, we utilize this latent representation as the parameters for +the mapping function, mapping (decoding) it into an expressive embedding, which +can be integrated into a pre-trained recommender model directly. Our method is +evaluated on three datasets using the same base model, demonstrating superior +performance compared to other popular cold-start methods. + +
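+ A loose, hedged sketch of the mapping intuition only: here the cold user's few interactions are turned into soft weights over warm-user embeddings, which stands in for, but does not reproduce, the paper's learned variational mapping.
+
+ import torch
+
+ def cold_user_embedding(initial_item_ids, item_emb, warm_user_emb, temperature=0.1):
+     """Maps a cold user's few initial interactions to an embedding by softly
+     weighting the embeddings of warm users, using interaction-profile similarity
+     as a stand-in for learned latent association weights."""
+     profile = item_emb[initial_item_ids].mean(dim=0)            # (dim,)
+     sims = warm_user_emb @ profile / temperature                # (num_warm,)
+     weights = torch.softmax(sims, dim=0)
+     return weights @ warm_user_emb                              # (dim,)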
+
+
+
+
+ + ♻ ☆ A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning KDD + + +
+ Retrieval-augmented generation (RAG) is a framework enabling large language +models (LLMs) to enhance their accuracy and reduce hallucinations by +integrating external knowledge bases. In this paper, we introduce a hybrid RAG +system enhanced through a comprehensive suite of optimizations that +significantly improve retrieval quality, augment reasoning capabilities, and +refine numerical computation ability. We refined the text chunks and tables in +web pages, added attribute predictors to reduce hallucinations, built an LLM +Knowledge Extractor and a Knowledge Graph Extractor, and finally constructed a +reasoning strategy over all the references. We evaluated our system on the CRAG +dataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and +online evaluations demonstrate that our system significantly enhances complex +reasoning capabilities. In local evaluations, we significantly improved +accuracy and reduced error rates compared to the baseline model, achieving a +notable increase in scores. Meanwhile, we attained outstanding results in +online assessments, demonstrating the performance and generalization +capabilities of the proposed system. The source code for our system is released +at \url{https://gitlab.aicrowd.com/shizueyy/crag-new}. + +
+
+ comment: Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024 +
+
+
+
+
+ + ♻ ☆ PEPT: Expert Finding Meets Personalized Pre-training + + +
+ Finding experts is essential in Community Question Answering (CQA) platforms, as it enables the effective routing of questions to potential users who can provide relevant answers. The key is to learn personalized expert representations from their historically answered questions and to match them accurately with target questions. There have been some preliminary works exploring the usability of pre-trained language models (PLMs) in expert finding, such as pre-training expert or question representations. However, these models usually learn pure text representations of experts from their histories, disregarding personalized and fine-grained expert modeling. To alleviate this, we present a personalized pre-training and fine-tuning paradigm, which can effectively learn expert interest and expertise simultaneously. Specifically, in our pre-training framework, we integrate the historically answered questions of one expert with one target question and regard this as a candidate-aware, expert-level input unit. Then, we fuse expert IDs into the pre-training to guide the model toward personalized expert representations, which helps capture the unique characteristics and expertise of each individual expert. Additionally, in our pre-training task, we design: 1) a question-level masked language model task to learn the relatedness between histories, enabling the modeling of question-level expert interest; 2) a vote-oriented task to capture question-level expert expertise by predicting the vote score the expert would receive. Through our pre-training framework and tasks, our approach can holistically learn expert representations including interests and expertise. Our method has been extensively evaluated on six real-world CQA datasets, and the experimental results consistently demonstrate the superiority of our approach over competitive baseline methods.
+
+
+
+
+
+
+
+ + Machine Learning 52 + +
+
+
+ + ♻ ☆ Advanced Predictive Modeling for Enhanced Mortality Prediction in ICU + Stroke Patients Using Clinical Data + + +
+ Background: Stroke is the second-leading cause of disability and death among adults. Approximately 17 million people suffer a stroke annually, with about 85% being ischemic strokes. Predicting the mortality of ischemic stroke patients in the intensive care unit (ICU) is crucial for optimizing treatment strategies, allocating resources, and improving survival rates. Methods: We acquired data on ICU ischemic stroke patients from the MIMIC-IV database, including diagnoses, vital signs, laboratory tests, medications, procedures, treatments, and clinical notes. Stroke patients were randomly divided into training (70%, n=2441), test (15%, n=523), and validation (15%, n=523) sets. To address data imbalance, we applied the Synthetic Minority Over-sampling Technique (SMOTE). We selected 30 features for model development, significantly reducing the number of features from the 1095 used in the best previous study. We developed a deep learning model to assess mortality risk and implemented several baseline machine learning models for comparison. Results: The XGB-DL model, which combines XGBoost for feature selection with deep learning, effectively minimized false positives. The model's AUROC improved from 0.865 (95% CI: 0.821 - 0.905) on the first day to 0.903 (95% CI: 0.868 - 0.936) by the fourth day, using data from 3,646 ICU mortality patients in the MIMIC-IV database, with an AUROC of 0.945 (95% CI: 0.944 - 0.947) during training. Although other ML models also performed well in terms of AUROC, we chose deep learning for its higher specificity. Conclusions: Through enhanced feature selection and data cleaning, the proposed model demonstrates a 13% AUROC improvement compared to existing models while reducing the number of features from the 1095 used in previous studies to 30.
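<p>The following minimal Python sketch mirrors the described preprocessing pipeline at a toy scale: SMOTE to rebalance the classes, XGBoost feature importances to cut a wide feature set down to 30, and a small neural classifier on the reduced features. The synthetic data, library choices (imbalanced-learn, xgboost, scikit-learn), and hyperparameters are assumptions for illustration, not the authors' configuration.</p>
<pre><code>
# Hedged, toy-scale sketch of the described pipeline (SMOTE + XGBoost-based
# feature selection + a neural classifier); synthetic data, illustrative only.
import numpy as np
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 200))            # stand-in for a wide clinical feature set
y = (rng.random(2000) < 0.15).astype(int)   # imbalanced mortality label

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
X_bal, y_bal = SMOTE(random_state=0).fit_resample(X_tr, y_tr)

# Rank features with XGBoost and keep the 30 most important ones
xgb = XGBClassifier(n_estimators=200, max_depth=3, random_state=0).fit(X_bal, y_bal)
top30 = np.argsort(xgb.feature_importances_)[::-1][:30]

# Small neural network on the reduced feature set (stand-in for the DL model)
clf = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=0)
clf.fit(X_bal[:, top30], y_bal)
print("held-out accuracy:", round(clf.score(X_te[:, top30], y_te), 3))
</code></pre>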
+
+
+
+
+ + ♻ ☆ AlphaFold Meets Flow Matching for Generating Protein Ensembles ICML 2024 + + +
+ The biological functions of proteins often depend on dynamic structural ensembles. In this work, we develop a flow-based generative modeling approach for learning and sampling the conformational landscapes of proteins. We repurpose highly accurate single-state predictors such as AlphaFold and ESMFold and fine-tune them under a custom flow matching framework to obtain sequence-conditioned generative models of protein structure called AlphaFlow and ESMFlow. When trained and evaluated on the PDB, our method provides a superior combination of precision and diversity compared to AlphaFold with MSA subsampling. When further trained on ensembles from all-atom MD, our method accurately captures conformational flexibility, positional distributions, and higher-order ensemble observables for unseen proteins. Moreover, our method can diversify a static PDB structure with faster wall-clock convergence to certain equilibrium properties than replicate MD trajectories, demonstrating its potential as a proxy for expensive physics-based simulations. Code is available at https://github.com/bjing2016/alphaflow.
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Automatic Differentiation is Essential in Training Neural Networks for + Solving Differential Equations + + +
+ Neural network-based approaches have recently shown significant promise in solving partial differential equations (PDEs) in science and engineering, especially in scenarios featuring complex domains or the incorporation of empirical data. One advantage of neural network methods for PDEs lies in their use of automatic differentiation (AD), which requires only the sample points themselves, unlike traditional finite difference (FD) approximations that require nearby local points to compute derivatives. In this paper, we quantitatively demonstrate the advantage of AD in training neural networks. The concept of truncated entropy is introduced to characterize the training property. Specifically, through comprehensive experimental and theoretical analyses conducted on random feature models and two-layer neural networks, we discover that the defined truncated entropy serves as a reliable metric for quantifying the residual loss of random feature models and the training speed of neural networks for both AD and FD methods. Our experimental and theoretical analyses demonstrate that, from a training perspective, AD outperforms FD in solving PDEs.
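<p>A small, hedged PyTorch illustration of the contrast drawn above: automatic differentiation returns the derivative of a network output at the sample points themselves, while a finite-difference estimate needs extra nearby evaluations. The tiny network u(x) and the step size h are arbitrary choices for the example, not the paper's setup.</p>
<pre><code>
# Hedged illustration: derivative of a tiny network u(x) via automatic
# differentiation versus a central finite-difference approximation.
import torch

def u(x, params):
    w1, b1, w2 = params
    return torch.tanh(x @ w1 + b1) @ w2    # scalar-output network u(x)

torch.manual_seed(0)
params = [torch.randn(1, 8), torch.randn(8), torch.randn(8, 1)]

x = torch.linspace(0.0, 1.0, 11).reshape(-1, 1).requires_grad_(True)
y = u(x, params)

# Automatic differentiation: du/dx at the sample points, no extra points needed
du_ad = torch.autograd.grad(y.sum(), x)[0]

# Finite differences: requires additional nearby evaluation points
h = 1e-3
du_fd = (u(x + h, params) - u(x - h, params)) / (2 * h)

print("max |AD - FD| over sample points:", (du_ad - du_fd).abs().max().item())
</code></pre>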
+
+
+
+
+ + ♻ ☆ On the limits of neural network explainability via descrambling + + +
+ We characterize the exact solutions to neural network descrambling--a +mathematical model for explaining the fully connected layers of trained neural +networks (NNs). By reformulating the problem to the minimization of the +Brockett function arising in graph matching and complexity theory we show that +the principal components of the hidden layer preactivations can be +characterized as the optimal explainers or descramblers for the layer weights, +leading to descrambled weight matrices. We show that in typical deep learning +contexts these descramblers take diverse and interesting forms including (1) +matching largest principal components with the lowest frequency modes of the +Fourier basis for isotropic hidden data, (2) discovering the semantic +development in two-layer linear NNs for signal recovery problems, and (3) +explaining CNNs by optimally permuting the neurons. Our numerical experiments +indicate that the eigendecompositions of the hidden layer data--now understood +as the descramblers--can also reveal the layer's underlying transformation. +These results illustrate that the SVD is more directly related to the +explainability of NNs than previously thought and offers a promising avenue for +discovering interpretable motifs for the hidden action of NNs, especially in +contexts of operator learning or physics-informed NNs, where the input/output +data has limited human readability. + +
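<p>The following NumPy sketch illustrates the core computation as we read it: take the principal components of a hidden layer's preactivations and use them as an orthogonal transformation ("descrambler") of the layer's output coordinates. Matrix sizes and the random data are placeholders; the paper's exact formulation may differ.</p>
<pre><code>
# Hedged sketch: principal components of hidden preactivations used as an
# orthogonal "descrambler" of a layer's weights; sizes are arbitrary.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 32))               # inputs reaching the layer
W = rng.normal(size=(32, 16)) / np.sqrt(32)   # layer weight matrix
H = X @ W                                     # hidden preactivations

# Principal directions of the preactivations (right singular vectors)
_, _, Vt = np.linalg.svd(H - H.mean(axis=0), full_matrices=False)
P = Vt                                        # rows = principal directions

W_descrambled = W @ P.T                       # rotate the layer's output coordinates
variances = np.var((H - H.mean(axis=0)) @ P.T, axis=0)
print("projected variances (descending):", variances.round(3))
</code></pre>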
+
+
+
+
+ + ♻ MAP: Low-compute Model Merging with Amortized Pareto Fronts via + Quadratic Approximation + + +
+ Model merging has emerged as an effective approach to combine multiple +single-task models, fine-tuned from the same pre-trained model, into a +multitask model. This process typically involves computing a weighted average +of the model parameters without any additional training. Existing model-merging +methods focus on enhancing average task accuracy. However, interference and +conflicts between the objectives of different tasks can lead to trade-offs +during model merging. In real-world applications, a set of solutions with +various trade-offs can be more informative, helping practitioners make +decisions based on diverse preferences. In this paper, we introduce a novel +low-compute algorithm, Model Merging with Amortized Pareto Front (MAP). MAP +identifies a Pareto set of scaling coefficients for merging multiple models to +reflect the trade-offs. The core component of MAP is approximating the +evaluation metrics of the various tasks using a quadratic approximation +surrogate model derived from a pre-selected set of scaling coefficients, +enabling amortized inference. Experimental results on vision and natural +language processing tasks show that MAP can accurately identify the Pareto +front. To further reduce the required computation of MAP, we propose (1) a +Bayesian adaptive sampling algorithm and (2) a nested merging scheme with +multiple stages. + +
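<p>Below is a hedged sketch of the amortization idea with synthetic stand-ins for the task metrics: evaluate a small, pre-selected set of merging coefficients, fit a quadratic surrogate to each task metric by least squares, and then scan the cheap surrogates over a dense grid to read off an approximate Pareto set. Function names and the toy metrics are assumptions, not the MAP implementation.</p>
<pre><code>
# Hedged sketch of fitting quadratic surrogates of task metrics over merging
# coefficients and scanning them cheaply for an approximate Pareto set.
import numpy as np
from itertools import combinations_with_replacement

def quad_features(C):
    """Map coefficient vectors to [1, c_i, c_i*c_j] features for least squares."""
    n, k = C.shape
    cols = [np.ones(n)] + [C[:, i] for i in range(k)]
    cols += [C[:, i] * C[:, j] for i, j in combinations_with_replacement(range(k), 2)]
    return np.stack(cols, axis=1)

rng = np.random.default_rng(0)
k = 2                                    # two fine-tuned models being merged
C_eval = rng.random((30, k))             # pre-selected scaling coefficients
# Stand-ins for expensive evaluations of two task metrics at those coefficients
metric_a = 1.0 - (C_eval[:, 0] - 0.8) ** 2 + 0.05 * rng.normal(size=30)
metric_b = 1.0 - (C_eval[:, 1] - 0.7) ** 2 + 0.05 * rng.normal(size=30)

Phi = quad_features(C_eval)
wa, *_ = np.linalg.lstsq(Phi, metric_a, rcond=None)
wb, *_ = np.linalg.lstsq(Phi, metric_b, rcond=None)

# Amortized step: predict both metrics on a dense grid and keep the Pareto set
grid = np.stack(np.meshgrid(np.linspace(0, 1, 50), np.linspace(0, 1, 50)), -1).reshape(-1, k)
pred = np.stack([quad_features(grid) @ wa, quad_features(grid) @ wb], axis=1)
pareto = [i for i, p in enumerate(pred) if not np.any(np.all(pred > p, axis=1))]
print(f"{len(pareto)} Pareto-optimal coefficient settings found on the grid")
</code></pre>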
+
+
+
+
+ + ♻ ☆ RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation + and Retrieval-Guidance + + +
+ Diffusion-based models demonstrate impressive generation capabilities. However, they also have a massive number of parameters, resulting in enormous model sizes, which makes them unsuitable for deployment on resource-constrained devices. Block-wise generation can be a promising alternative for designing compact-sized (parameter-efficient) deep generative models, since the model can generate one block at a time instead of generating the whole image at once. However, block-wise generation is also considerably challenging because ensuring coherence across generated blocks can be non-trivial. To this end, we design a retrieval-augmented generation (RAG) approach and leverage the corresponding blocks of the images retrieved by the RAG module to condition the training and generation stages of a block-wise denoising diffusion model. Our conditioning schemes ensure coherence across the different blocks during training and, consequently, during generation. While we showcase our approach using the latent diffusion model (LDM) as the base model, it can be used with other variants of denoising diffusion models. We validate that the proposed approach solves the coherence problem through substantive experiments demonstrating its effectiveness in terms of compact model size and excellent generation quality.
+
+
+
+
+ + ♻ ☆ Uplift Modeling Under Limited Supervision + + +
+ Estimating causal effects in e-commerce tends to involve costly treatment +assignments which can be impractical in large-scale settings. Leveraging +machine learning to predict such treatment effects without actual intervention +is a standard practice to diminish the risk. However, existing methods for +treatment effect prediction tend to rely on training sets of substantial size, +which are built from real experiments and are thus inherently risky to create. +In this work we propose a graph neural network to diminish the required +training set size, relying on graphs that are common in e-commerce data. +Specifically, we view the problem as node regression with a restricted number +of labeled instances, develop a two-model neural architecture akin to previous +causal effect estimators, and test varying message-passing layers for encoding. +Furthermore, as an extra step, we combine the model with an acquisition +function to guide the creation of the training set in settings with extremely +low experimental budget. The framework is flexible since each step can be used +separately with other models or treatment policies. The experiments on real +large-scale networks indicate a clear advantage of our methodology over the +state of the art, which in many cases performs close to random, underlining the +need for models that can generalize with limited supervision to reduce +experimental risks. + +
+
+
+
+
+ + ♻ ☆ Globally Stable Neural Imitation Policies + + +
+ Imitation learning presents an effective approach to alleviating the resource-intensive and time-consuming nature of policy learning from scratch in the solution space. Even though the resulting policy can mimic expert demonstrations reliably, it often lacks predictability in unexplored regions of the state space, giving rise to significant safety concerns in the face of perturbations. To address these challenges, we introduce the Stable Neural Dynamical System (SNDS), an imitation learning regime that produces a policy with formal stability guarantees. We deploy a neural policy architecture that facilitates the representation of stability based on Lyapunov theory, and we jointly train the policy and its corresponding Lyapunov candidate to ensure global stability. We validate our approach by conducting extensive experiments in simulation and successfully deploying the trained policies on a real-world manipulator arm. The experimental results demonstrate that our method overcomes the instability, accuracy, and computational intensity problems associated with previous imitation learning methods, making it a promising solution for stable policy learning in complex planning scenarios.
+
+
+
+
+ + ♻ AI-Assisted Generation of Difficult Math Questions + + +
+ Current LLM training positions mathematical reasoning as a core capability. With publicly available sources fully tapped, there is unmet demand for diverse and challenging math questions. Relying solely on human experts is both time-consuming and costly, while LLM-generated questions often lack the requisite diversity and difficulty. We present a design framework that combines the strengths of LLMs with a human-in-the-loop approach to generate a diverse array of challenging math questions. We leverage the metacognition skills [Didolkar et al., 2024] of a strong LLM to extract core "skills" from existing math datasets. These skills serve as the basis for generating novel and difficult questions by prompting the LLM with random pairs of core skills. The use of two different skills within each question makes finding such questions an "out of distribution" task for both LLMs and humans. Our pipeline employs LLMs to iteratively generate and refine questions and solutions through multi-turn prompting. Human annotators then verify and further refine the questions, with their efficiency enhanced via further LLM interactions. Applying this pipeline to skills extracted from the MATH dataset [Hendrycks et al., 2021] resulted in MATH$^2$, a dataset of higher-quality math questions, as evidenced by: (a) lower performance of all models on MATH$^2$ than on MATH; and (b) higher performance on MATH when using MATH$^2$ questions as in-context examples. Although focused on mathematics, our methodology seems applicable to other domains requiring structured reasoning, and potentially as a component of scalable oversight. Also of interest is a striking relationship observed between models' performance on the new dataset and on the original: the success rate on MATH$^2$ is the square of the success rate on MATH, suggesting that successfully solving a question in MATH$^2$ requires a nontrivial combination of two distinct math skills.
+
+
+
+
+ + ♻ ☆ Online Detection of Anomalies in Temporal Knowledge Graphs with + Interpretability SIGMOD 2025 + + +
+ Temporal knowledge graphs (TKGs) are valuable resources for capturing evolving relationships among entities, yet they are often plagued by noise, necessitating robust anomaly detection mechanisms. Existing dynamic graph anomaly detection approaches struggle to capture the rich semantics introduced by node and edge categories within TKGs, while TKG embedding methods lack interpretability, undermining the credibility of anomaly detection. Moreover, these methods falter in adapting to pattern changes and semantic drifts resulting from knowledge updates. To tackle these challenges, we introduce AnoT, an efficient TKG summarization method tailored for interpretable online anomaly detection in TKGs. AnoT begins by summarizing a TKG into a novel rule graph, enabling flexible inference of complex patterns in TKGs. When new knowledge emerges, AnoT maps it onto a node in the rule graph and traverses the rule graph recursively to derive the anomaly score of the knowledge. The traversal yields reachable nodes that furnish interpretable evidence for the validity or anomalousness of the new knowledge. Overall, AnoT embodies a detector-updater-monitor architecture, encompassing a detector for offline TKG summarization and online scoring, an updater for real-time rule graph updates based on emerging knowledge, and a monitor for estimating the approximation error of the rule graph. Experimental results on four real-world datasets demonstrate that AnoT surpasses existing methods significantly in terms of accuracy and interpretability. All of the raw datasets and the implementation of AnoT are provided at https://github.com/zjs123/ANoT.
+
+ comment: 26 pages, 10 figures. Accepted by SIGMOD 2025 +
+
+
+
+
+ + ♻ ☆ NeuFair: Neural Network Fairness Repair with Dropout ISSTA 2024 + + +
+ This paper investigates neuron dropout as a post-processing bias mitigation +for deep neural networks (DNNs). Neural-driven software solutions are +increasingly applied in socially critical domains with significant fairness +implications. While neural networks are exceptionally good at finding +statistical patterns from data, they may encode and amplify existing biases +from the historical data. Existing bias mitigation algorithms often require +modifying the input dataset or the learning algorithms. We posit that the +prevalent dropout methods that prevent over-fitting during training by randomly +dropping neurons may be an effective and less intrusive approach to improve the +fairness of pre-trained DNNs. However, finding the ideal set of neurons to drop +is a combinatorial problem. We propose NeuFair, a family of post-processing +randomized algorithms that mitigate unfairness in pre-trained DNNs via dropouts +during inference after training. Our randomized search is guided by an +objective to minimize discrimination while maintaining the model's utility. We +show that our design of randomized algorithms is effective and efficient in +improving fairness (up to 69%) with minimal or no model performance +degradation. We provide intuitive explanations of these phenomena and carefully +examine the influence of various hyperparameters of search algorithms on the +results. Finally, we empirically and conceptually compare NeuFair to different +state-of-the-art bias mitigators. + +
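<p>To make the search procedure concrete, the toy Python sketch below randomly samples inference-time neuron masks for a fixed (randomly initialised) network and keeps the mask with the best trade-off between accuracy and a demographic-parity gap. The model, data, scoring rule, and sample budget are all placeholders; NeuFair's actual randomized algorithms are more sophisticated than this sketch.</p>
<pre><code>
# Hedged toy sketch of post-processing dropout search: sample neuron masks at
# inference time and keep the one that best trades fairness against utility.
import numpy as np

rng = np.random.default_rng(0)

def forward(X, W1, W2, mask):
    h = np.maximum(X @ W1, 0.0) * mask       # dropped neurons output zero
    return (h @ W2 > 0.0).astype(int)

def fairness_gap(pred, group):
    """Demographic-parity gap: |P(pred=1|g=0) - P(pred=1|g=1)|."""
    return abs(pred[group == 0].mean() - pred[group == 1].mean())

# Pretend these are a pre-trained model and held-out data
X = rng.normal(size=(500, 10)); group = rng.integers(0, 2, 500)
y = (X[:, 0] + 0.5 * group + rng.normal(scale=0.5, size=500) > 0).astype(int)
W1 = rng.normal(size=(10, 32)); W2 = rng.normal(size=32)

best_mask, best_score = np.ones(32), -np.inf
for _ in range(200):                          # randomized search over masks
    mask = (rng.random(32) > 0.2).astype(float)
    pred = forward(X, W1, W2, mask)
    acc = (pred == y).mean()
    score = acc - 2.0 * fairness_gap(pred, group)  # utility minus discrimination
    if score > best_score:
        best_mask, best_score = mask, score
print("best combined score:", round(best_score, 3))
</code></pre>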
+
+ comment: Paper accepted at ACM ISSTA 2024 +
+
+
+
+
+ + ♻ ☆ Privacy-Aware Document Visual Question Answering ICDAR 2024 + + +
+ Document Visual Question Answering (DocVQA) has quickly grown into a central task of document understanding. But despite the fact that documents contain sensitive or copyrighted information, none of the current DocVQA methods offers strong privacy guarantees. In this work, we explore privacy in the domain of DocVQA for the first time, highlighting privacy issues in state-of-the-art multi-modal LLMs used for DocVQA, and exploring possible solutions. Specifically, we focus on invoice processing as a realistic document understanding scenario, and propose a large-scale DocVQA dataset comprising invoice documents and associated questions and answers. We employ a federated learning scheme that reflects the real-life distribution of documents across different businesses, and we explore the use case where the data of the invoice provider is the sensitive information to be protected. We demonstrate that non-private models tend to memorise, a behaviour that can lead to exposing private information. We then evaluate baseline training schemes employing federated learning and differential privacy in this multi-modal scenario, where the sensitive information might be exposed through either or both of the two input modalities: vision (document image) or language (OCR tokens). Finally, we design attacks exploiting the memorisation effect of the model, and demonstrate their effectiveness in probing representative DocVQA models.
+
+ comment: 35 pages, 12 figures, accepted for publication at the 18th + International Conference on Document Analysis and Recognition, ICDAR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Bias and Prediction Metrics to Characterise the Fairness of + Machine Learning for Equity-Centered Public Health Decision-Making: A + Narrative Review + + +
+ Background: The rapid advancement of Machine Learning (ML) represents novel opportunities to enhance public health research, surveillance, and decision-making. However, there is a lack of comprehensive understanding of algorithmic bias, that is, systematic errors in predicted population health outcomes resulting from the public health application of ML. The objective of this narrative review is to explore the types of bias generated by ML and the quantitative metrics used to assess these biases. Methods: We performed a search on PubMed, MEDLINE, IEEE (Institute of Electrical and Electronics Engineers), ACM (Association for Computing Machinery) Digital Library, Science Direct, and Springer Nature. We used keywords to identify studies describing types of bias and metrics to measure these in the domain of ML and public and population health, published in English between 2008 and 2023, inclusive. Results: A total of 72 articles met the inclusion criteria. Our review identified the commonly described types of bias and quantitative metrics to assess these biases from an equity perspective. Conclusion: The review will help formalize the evaluation framework for ML in public health from an equity perspective.
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Does Data-Efficient Generalization Exacerbate Bias in Foundation Models? ECCV 2024 + + +
+ Foundation models have emerged as robust models with label efficiency in +diverse domains. In medical imaging, these models contribute to the advancement +of medical diagnoses due to the difficulty in obtaining labeled data. However, +it is unclear whether using a large amount of unlabeled data, biased by the +presence of sensitive attributes during pre-training, influences the fairness +of the model. This research examines the bias in the Foundation model +(RetFound) when it is applied to fine-tune the Brazilian Multilabel +Ophthalmological Dataset (BRSET), which has a different population than the +pre-training dataset. The model evaluation, in comparison with supervised +learning, shows that the Foundation Model has the potential to reduce the gap +between the maximum AUC and minimum AUC evaluations across gender and age +groups. However, in a data-efficient generalization, the model increases the +bias when the data amount decreases. These findings suggest that when deploying +a Foundation Model in real-life scenarios with limited data, the possibility of +fairness issues should be considered. + +
+
+ comment: Preprint of paper to be presented at Fairness and Ethics Towards + Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during + ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Ancestral Reinforcement Learning: Unifying Zeroth-Order Optimization and + Genetic Algorithms for Reinforcement Learning + + +
+ Reinforcement Learning (RL) offers a fundamental framework for discovering optimal action strategies through interactions within unknown environments. Recent advances have shown that the performance and applicability of RL can be significantly enhanced by exploiting a population of agents in various ways. Zeroth-Order Optimization (ZOO) leverages an agent population to estimate the gradient of the objective function, enabling robust policy refinement even in non-differentiable scenarios. As another application, Genetic Algorithms (GA) boost the exploration of policy landscapes through the mutational generation of policy diversity in an agent population and its refinement by selection. A natural question is whether we can obtain the best of both worlds from an agent population. In this work, we propose Ancestral Reinforcement Learning (ARL), which synergistically combines the robust gradient estimation of ZOO with the exploratory power of GA. The key idea in ARL is that each agent within a population infers a gradient by exploiting the history of its ancestors, i.e., the ancestor population in the past, while maintaining the diversity of policies in the current population, as in GA. We also theoretically reveal that the populational search in ARL implicitly induces a KL-regularization of the objective function, resulting in enhanced exploration. Our results extend the applicability of populational algorithms to RL.
+
+ comment: 16pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Analysis of Failures and Risks in Deep Learning Model Converters: A Case + Study in the ONNX Ecosystem ISSTA'24 + + +
+ Software engineers develop, fine-tune, and deploy deep learning (DL) models using a variety of development frameworks and runtime environments. DL model converters move models between frameworks and to runtime environments. Conversion errors compromise model quality and disrupt deployment. However, the failure characteristics of DL model converters are unknown, adding risk when using DL interoperability technologies. This paper analyzes failures in DL model converters. We survey software engineers about DL interoperability tools, use cases, and pain points (N=92). Then, we characterize failures in model converters associated with the main interoperability tool, ONNX (N=200 issues in PyTorch and TensorFlow). Finally, we formulate and test two hypotheses about structural causes for the failures we studied. We find that the node conversion stage of a model converter accounts for ~75% of the defects and that 33% of reported failures are related to semantically incorrect models. The cause of semantically incorrect models is elusive, but models with behaviour inconsistencies share operator sequences. Our results motivate future research on making DL interoperability software simpler to maintain, extend, and validate. Research into behavioural tolerances and architectural coverage metrics could be fruitful.
+
+ comment: [ISSTA'24] Proceedings of the 33rd ACM SIGSOFT International + Symposium on Software Testing and Analysis (ISSTA) 2024 +
+
+
+
+
+ + ♻ ☆ HyperInterval: Hypernetwork approach to training weight interval regions + in continual learning + + +
+ Recently, a new Continual Learning (CL) paradigm was presented to control catastrophic forgetting, called Interval Continual Learning (InterContiNet), which relies on enforcing interval constraints on the neural network parameter space. Unfortunately, InterContiNet training is challenging due to the high dimensionality of the weight space, making intervals difficult to manage. To address this issue, we introduce HyperInterval (the source code is available at https://github.com/gmum/HyperInterval), a technique that employs interval arithmetic within the embedding space and utilizes a hypernetwork to map these intervals to the target network parameter space. We train interval embeddings for consecutive tasks and train a hypernetwork to transform these embeddings into weights of the target network. An embedding for a given task is trained along with the hypernetwork, preserving the response of the target network for the previous task embeddings. Interval arithmetic works with a more manageable, lower-dimensional embedding space rather than directly preparing intervals in a high-dimensional weight space. Our model allows faster and more efficient training. Furthermore, HyperInterval maintains the guarantee of not forgetting. At the end of training, we can choose one universal embedding to produce a single network dedicated to all tasks. In such a framework, the hypernetwork is used only for training and, finally, we can utilize one set of weights. HyperInterval obtains significantly better results than InterContiNet and gives SOTA results on several benchmarks.
+
+
+
+
+ + ♻ ☆ ResQuNNs:Towards Enabling Deep Learning in Quantum Convolution Neural + Networks + + +
+ In this paper, we present a novel framework for enhancing the performance of Quanvolutional Neural Networks (QuNNs) by introducing trainable quanvolutional layers and addressing the critical challenges associated with them. Traditional quanvolutional layers, although beneficial for feature extraction, have largely been static, offering limited adaptability. Unlike the state of the art, our research overcomes this limitation by enabling training within these layers, significantly increasing the flexibility and potential of QuNNs. However, the introduction of multiple trainable quanvolutional layers induces complexities in gradient-based optimization, primarily due to the difficulty in accessing gradients across these layers. To resolve this, we propose a novel architecture, Residual Quanvolutional Neural Networks (ResQuNNs), which leverages the concept of residual learning and facilitates the flow of gradients by adding skip connections between layers. By inserting residual blocks between quanvolutional layers, we ensure enhanced gradient access throughout the network, leading to improved training performance. Moreover, we provide empirical evidence on the strategic placement of these residual blocks within QuNNs. Through extensive experimentation, we identify an efficient configuration of residual blocks, which enables gradient flow across all the layers in the network and ultimately results in efficient training. Our findings suggest that the precise location of residual blocks plays a crucial role in maximizing the performance gains of QuNNs. Our results mark a substantial step forward in the evolution of quantum deep learning, offering new avenues for both theoretical development and practical quantum computing applications.
+
+
+
+
+ + ♻ ☆ Stabilizing Extreme Q-learning by Maclaurin Expansion + + +
+ In offline reinforcement learning, in-sample learning methods have been +widely used to prevent performance degradation caused by evaluating +out-of-distribution actions from the dataset. Extreme Q-learning (XQL) employs +a loss function based on the assumption that Bellman error follows a Gumbel +distribution, enabling it to model the soft optimal value function in an +in-sample manner. It has demonstrated strong performance in both offline and +online reinforcement learning settings. However, issues remain, such as the +instability caused by the exponential term in the loss function and the risk of +the error distribution deviating from the Gumbel distribution. Therefore, we +propose Maclaurin Expanded Extreme Q-learning to enhance stability. In this +method, applying Maclaurin expansion to the loss function in XQL enhances +stability against large errors. This approach involves adjusting the modeled +value function between the value function under the behavior policy and the +soft optimal value function, thus achieving a trade-off between stability and +optimality depending on the order of expansion. It also enables adjustment of +the error distribution assumption from a normal distribution to a Gumbel +distribution. Our method significantly stabilizes learning in online RL tasks +from DM Control, where XQL was previously unstable. Additionally, it improves +performance in several offline RL tasks from D4RL. + +
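<p>The stabilisation idea can be illustrated with a few lines of PyTorch: replace the exponential in a Gumbel-style regression loss exp(z) - z - 1 with its truncated Maclaurin series, which reduces to a squared loss at order 2 and approaches the exponential loss as the order grows. This is a schematic reading of the abstract, not the paper's code; the loss form and the orders shown are assumptions.</p>
<pre><code>
# Hedged sketch: truncated Maclaurin expansion of the exponential inside a
# Gumbel-style regression loss; order 2 recovers a squared loss.
import torch
from math import factorial

def gumbel_loss(z):
    return torch.exp(z) - z - 1.0

def maclaurin_gumbel_loss(z, order=4):
    """Replace exp(z) by sum_{k=0..order} z^k / k!, then subtract z + 1."""
    exp_approx = sum(z ** k / factorial(k) for k in range(order + 1))
    return exp_approx - z - 1.0

z = torch.linspace(-3.0, 3.0, 7)
print("exact  :", [round(v, 2) for v in gumbel_loss(z).tolist()])
print("order 2:", [round(v, 2) for v in maclaurin_gumbel_loss(z, 2).tolist()])  # = z^2 / 2
print("order 6:", [round(v, 2) for v in maclaurin_gumbel_loss(z, 6).tolist()])
</code></pre>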
+
+ comment: Accepted at RLC 2024: The first Reinforcement Learning Conference +
+
+
+
+
+ + ♻ ☆ An Effective Information Theoretic Framework for Channel Pruning + + +
+ Channel pruning is a promising method for accelerating and compressing convolutional neural networks. However, two problems remain unsolved for current pruning algorithms: how to assign layer-wise pruning ratios properly, and how to discard the least important channels according to a convincing criterion. In this paper, we present a novel channel pruning approach based on information theory and the interpretability of neural networks. Specifically, we regard information entropy as the expected amount of information for convolutional layers. In addition, if we regard a matrix as a system of linear equations, a higher-rank matrix indicates that more solutions exist for it, which indicates more uncertainty. From the point of view of information theory, the rank can also describe the amount of information. In a neural network, considering the rank and entropy as two information indicators of convolutional layers, we propose a fusion function to reach a compromise between them, where the fusion results are defined as "information concentration". When pre-defining layer-wise pruning ratios, we employ the information concentration as a reference instead of heuristic and engineering tuning to provide a more interpretable solution. Moreover, we leverage Shapley values, which are a potent tool in the interpretability of neural networks, to evaluate the channel contributions and discard the least important channels for model compression while maintaining its performance. Extensive experiments demonstrate the effectiveness and promising performance of our method. For example, our method improves the accuracy by 0.21% while reducing FLOPs by 45.5% and removing 40.3% of the parameters for ResNet-56 on CIFAR-10. Moreover, our method incurs Top-1/Top-5 accuracy losses of only 0.43%/0.11% while reducing FLOPs by 41.6% and removing 35.0% of the parameters for ResNet-50 on ImageNet.
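<p>A hedged sketch of the two information indicators and their fusion is given below: an entropy estimate over channel activations, a rank estimate of the per-sample feature maps, and a simple normalised average standing in for the paper's fusion function. All names, normalisations, and the fusion rule are illustrative assumptions, not the method's actual definitions.</p>
<pre><code>
# Hedged sketch of combining entropy and rank indicators of a convolutional
# layer's feature maps into a single "information concentration" score.
import torch

def layer_entropy(feat):
    """Shannon entropy of the (softmax-normalised) mean channel activations."""
    p = torch.softmax(feat.mean(dim=(0, 2, 3)), dim=0)   # feat: (N, C, H, W)
    return float(-(p * torch.log(p + 1e-12)).sum())

def layer_rank(feat):
    """Average matrix rank of per-sample feature maps reshaped to (C, H*W)."""
    mats = feat.flatten(2)                                # (N, C, H*W)
    return float(torch.linalg.matrix_rank(mats).float().mean())

def information_concentration(feat, alpha=0.5):
    e, r = layer_entropy(feat), layer_rank(feat)
    c = feat.shape[1]
    max_entropy = torch.log(torch.tensor(float(c))).item()
    return alpha * e / max_entropy + (1 - alpha) * r / c  # simple normalised fusion

feat = torch.randn(8, 16, 14, 14)                         # fake feature maps of one layer
print("information concentration:", round(information_concentration(feat), 3))
</code></pre>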
+
+
+
+
+ + ♻ ☆ Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State + Space Model ACM MM 2024 + + +
+ Existing Transformer-based models for point cloud analysis suffer from +quadratic complexity, leading to compromised point cloud resolution and +information loss. In contrast, the newly proposed Mamba model, based on state +space models (SSM), outperforms Transformer in multiple areas with only linear +complexity. However, the straightforward adoption of Mamba does not achieve +satisfactory performance on point cloud tasks. In this work, we present +Mamba3D, a state space model tailored for point cloud learning to enhance local +feature extraction, achieving superior performance, high efficiency, and +scalability potential. Specifically, we propose a simple yet effective Local +Norm Pooling (LNP) block to extract local geometric features. Additionally, to +obtain better global features, we introduce a bidirectional SSM (bi-SSM) with +both a token forward SSM and a novel backward SSM that operates on the feature +channel. Extensive experimental results show that Mamba3D surpasses +Transformer-based counterparts and concurrent works in multiple tasks, with or +without pre-training. Notably, Mamba3D achieves multiple SoTA, including an +overall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1% +(with single-modal pre-training) on the ModelNet40 classification task, with +only linear complexity. Our code and weights are available at +https://github.com/xhanxu/Mamba3D. + +
+
+ comment: ACM MM 2024. Code and weights are available at + https://github.com/xhanxu/Mamba3D +
+
+
+
+
+ + ♻ ☆ Identifying Weight-Variant Latent Causal Models + + +
+ The task of causal representation learning aims to uncover latent higher-level causal representations that affect lower-level observations. Identifying true latent causal representations from observed data, while allowing instantaneous causal relations among latent variables, remains a challenge, however. To this end, we start from an analysis of three intrinsic properties in identifying latent space from observations: transitivity, permutation indeterminacy, and scaling indeterminacy. We find that transitivity plays a key role in impeding the identifiability of latent causal representations. To address the unidentifiability caused by transitivity, we introduce a novel identifiability condition in which the underlying latent causal model satisfies a linear-Gaussian model, where the causal coefficients and the distribution of the Gaussian noise are modulated by an additional observed variable. Under some mild assumptions, we show that the latent causal representations can be identified up to trivial permutation and scaling. Furthermore, based on this theoretical result, we propose a novel method, termed Structural caUsAl Variational autoEncoder, which directly learns latent causal representations and the causal relationships among them, together with the mapping from the latent causal variables to the observed ones. We show that the proposed method learns the true parameters asymptotically. Experimental results on synthetic and real data demonstrate the identifiability and consistency results and the efficacy of the proposed method in learning latent causal representations.
+
+
+
+
+ + ♻ ☆ Deep Learning-based Target-To-User Association in Integrated Sensing and + Communication Systems + + +
+ In Integrated Sensing and Communication (ISAC) systems, matching radar targets with communication user equipments (UEs) is instrumental for several communication tasks, such as proactive handover and beam prediction. In this paper, we consider a radar-assisted communication system in which a base station (BS) is equipped with a multiple-input-multiple-output (MIMO) radar that has a double aim: (i) to associate vehicular radar targets with vehicular equipments (VEs) in the communication beamspace and (ii) to predict the beamforming vector for each VE from radar data. The proposed target-to-user (T2U) association consists of two stages. First, vehicular radar targets are detected from range-angle images and, for each, a beamforming vector is estimated. Then, the inferred per-target beamforming vectors are matched with the ones utilized at the BS for communication to perform the T2U association. Joint multi-target detection and beam inference is obtained by modifying the you only look once (YOLO) model, which is trained over simulated range-angle radar images. Simulation results over different urban vehicular mobility scenarios show that the proposed T2U method provides a probability of correct association that increases with the size of the BS antenna array, highlighting the corresponding increase in the separability of the VEs in the beamspace. Moreover, we show that the modified YOLO architecture can effectively perform both beam prediction and radar target detection, with similar mean average precision on the latter across different antenna array sizes.
+
+
+
+
+ + ♻ ☆ Barlow Twins Deep Neural Network for Advanced 1D Drug-Target Interaction + Prediction + + +
+ Accurate prediction of drug-target interactions is critical for advancing +drug discovery. By reducing time and cost, machine learning and deep learning +can accelerate this laborious discovery process. In a novel approach, +BarlowDTI, we utilise the powerful Barlow Twins architecture for +feature-extraction while considering the structure of the target protein. Our +method achieves state-of-the-art predictive performance against multiple +established benchmarks using only one-dimensional input. The use of gradient +boosting machine as the underlying predictor ensures fast and efficient +predictions without the need for substantial computational resources. We also +investigate how the model reaches its decision based on individual training +samples. By comparing co-crystal structures, we find that BarlowDTI effectively +exploits catalytically active and stabilising residues, highlighting the +model's ability to generalise from one-dimensional input data. In addition, we +further benchmark new baselines against existing methods. Together, these +innovations improve the efficiency and effectiveness of drug-target interaction +predictions, providing robust tools for accelerating drug development and +deepening the understanding of molecular interactions. Therefore, we provide an +easy-to-use web interface that can be freely accessed at +https://www.bio.nat.tum.de/oc2/barlowdti . + +
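<p>For readers unfamiliar with the feature-extraction objective mentioned above, the short PyTorch sketch below implements a standard Barlow Twins loss: standardise two embedding views, form their cross-correlation matrix, and push it towards the identity. Embedding sizes and the off-diagonal weight are illustrative, not BarlowDTI's settings.</p>
<pre><code>
# Hedged sketch of the Barlow Twins objective: drive the cross-correlation
# matrix of two embedding views towards the identity matrix.
import torch

def barlow_twins_loss(z1, z2, lam=5e-3):
    n, d = z1.shape
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)   # standardise each dimension
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
    c = (z1.T @ z2) / n                            # cross-correlation matrix (d, d)
    on_diag = ((torch.diagonal(c) - 1.0) ** 2).sum()
    off_diag = (c ** 2).sum() - (torch.diagonal(c) ** 2).sum()
    return on_diag + lam * off_diag

torch.manual_seed(0)
z1, z2 = torch.randn(64, 128), torch.randn(64, 128)   # embeddings of two views
print("loss:", barlow_twins_loss(z1, z2).item())
</code></pre>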
+
+ comment: Refined model architecture, additional results added +
+
+
+
+
+ + ♻ ☆ TRNet: Two-level Refinement Network leveraging Speech Enhancement for + Noise Robust Speech Emotion Recognition + + +
+ One persistent challenge in Speech Emotion Recognition (SER) is the +ubiquitous environmental noise, which frequently results in deteriorating SER +performance in practice. In this paper, we introduce a Two-level Refinement +Network, dubbed TRNet, to address this challenge. Specifically, a pre-trained +speech enhancement module is employed for front-end noise reduction and noise +level estimation. Later, we utilize clean speech spectrograms and their +corresponding deep representations as reference signals to refine the +spectrogram distortion and representation shift of enhanced speech during model +training. Experimental results validate that the proposed TRNet substantially +promotes the robustness of the proposed system in both matched and unmatched +noisy environments, without compromising its performance in noise-free +environments. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ DiffLoad: Uncertainty Quantification in Electrical Load Forecasting with + the Diffusion Model + + +
+ Electrical load forecasting plays a crucial role in decision-making for power +systems, including unit commitment and economic dispatch. The integration of +renewable energy sources and the occurrence of external events, such as the +COVID-19 pandemic, have rapidly increased uncertainties in load forecasting. +The uncertainties in load forecasting can be divided into two types: epistemic +uncertainty and aleatoric uncertainty. Separating these types of uncertainties +can help decision-makers better understand where and to what extent the +uncertainty is, thereby enhancing their confidence in the following +decision-making. This paper proposes a diffusion-based Seq2Seq structure to +estimate epistemic uncertainty and employs the robust additive Cauchy +distribution to estimate aleatoric uncertainty. Our method not only ensures the +accuracy of load forecasting but also demonstrates the ability to separate the +two types of uncertainties and be applicable to different levels of loads. The +relevant code can be found at +\url{https://anonymous.4open.science/r/DiffLoad-4714/}. + +
+
+ comment: Accepted by IEEE Transactions on Power Systems, 2024 +
+
+
+
+
+ + ♻ ☆ Near-Optimal Policy Identification in Robust Constrained Markov Decision + Processes via Epigraph Form + + +
+ Designing a safe policy for uncertain environments is crucial in real-world +control applications. However, this challenge remains inadequately addressed +within the Markov decision process (MDP) framework. This paper presents the +first algorithm capable of identifying a near-optimal policy in a robust +constrained MDP (RCMDP), where an optimal policy minimizes cumulative cost +while satisfying constraints in the worst-case scenario across a set of +environments. We first prove that the conventional Lagrangian max-min +formulation with policy gradient methods can become trapped in suboptimal +solutions by encountering a sum of conflicting gradients from the objective and +constraint functions during its inner minimization problem. To address this, we +leverage the epigraph form of the RCMDP problem, which resolves the conflict by +selecting a single gradient from either the objective or the constraints. +Building on the epigraph form, we propose a binary search algorithm with a +policy gradient subroutine and prove that it identifies an +$\varepsilon$-optimal policy in an RCMDP with +$\tilde{\mathcal{O}}(\varepsilon^{-4})$ policy evaluations. + +
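<p>A schematic, hedged skeleton of the epigraph-form idea follows: binary-search a cost threshold b and, at each step, run an inner solver that drives max(objective - b, constraint) down; if the inner solver reaches a non-positive value, b is achievable and the search moves down. The finite-difference inner solver and the toy constrained problem are placeholders for the paper's RCMDP setting and policy gradient subroutine.</p>
<pre><code>
# Hedged skeleton of the epigraph-form search on a toy constrained problem:
# minimise (x - 2)^2 subject to x <= 1. Not an RCMDP implementation.
import numpy as np

def inner_solver(b, theta0, objective, constraint, steps=300, lr=0.02):
    """Finite-difference descent on f(theta) = max(objective - b, constraint)."""
    theta, eps = theta0.astype(float).copy(), 1e-4
    f = lambda t: max(objective(t) - b, constraint(t))
    best_val, best_theta = f(theta), theta.copy()
    for _ in range(steps):
        grad = np.array([(f(theta + eps * e) - f(theta - eps * e)) / (2 * eps)
                         for e in np.eye(theta.size)])
        theta = theta - lr * grad
        if f(theta) < best_val:
            best_val, best_theta = f(theta), theta.copy()
    return best_theta, best_val

def epigraph_search(objective, constraint, lo, hi, theta0, iters=25):
    theta_best = theta0
    for _ in range(iters):
        b = 0.5 * (lo + hi)
        theta, val = inner_solver(b, theta0, objective, constraint)
        if val <= 0:      # threshold b achievable while satisfying the constraint
            hi, theta_best = b, theta
        else:
            lo = b
    return hi, theta_best

obj = lambda t: float((t[0] - 2.0) ** 2)
con = lambda t: float(t[0] - 1.0)
b_star, theta = epigraph_search(obj, con, lo=0.0, hi=10.0, theta0=np.array([0.0]))
print("near-optimal cost threshold:", round(b_star, 3), "at x =", round(float(theta[0]), 3))
</code></pre>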
+
+
+
+
+ + ♻ ☆ EuroPED-NN: Uncertainty aware surrogate model + + +
+ This work successfully generates an uncertainty-aware surrogate model of the EuroPED plasma pedestal model using the Bayesian neural network with noise contrastive prior (BNN-NCP) technique. The model is trained using data from the JET-ILW pedestal database and subsequent model evaluations, yielding EuroPED-NN. The BNN-NCP technique has been proven to be a suitable method for generating uncertainty-aware surrogate models. It matches the output results of a regular neural network while providing confidence estimates for predictions as uncertainties. Additionally, it highlights out-of-distribution (OOD) regions using the surrogate model uncertainties. This provides critical insights into model robustness and reliability. EuroPED-NN has been physically validated, first by analyzing the electron density $n_e\!\left(\psi_{\text{pol}}=0.94\right)$ with respect to increasing plasma current, $I_p$, and second by validating the $\Delta-\beta_{p,ped}$ relation associated with the EuroPED model. This affirms the robustness of the underlying physics learned by the surrogate model. On top of that, the method was used to develop a EuroPED-like model fed with experimental data, i.e., an uncertainty-aware experimental model, which is functional on the JET database. Both models have also been tested on $\sim 50$ AUG shots.
+
+
+
+
+ + ♻ ☆ The Initial Screening Order Problem + + +
+ We investigate the role of the initial screening order (ISO) in candidate screening tasks, such as employee hiring and academic admissions, in which a screener is tasked with selecting $k$ candidates from a candidate pool. The ISO refers to the order in which the screener searches the candidate pool. Today, it is common for the ISO to be the product of an information access system, such as an online platform or a database query. The ISO has been largely overlooked in the literature, despite its potential impact on the optimality and fairness of the chosen $k$ candidates, especially under a human screener. We define two problem formulations describing the search behavior of the screener under the ISO: the best-$k$, where the screener selects the $k$ best candidates; and the good-$k$, where the screener selects the $k$ first good-enough candidates. To study the impact of the ISO, we introduce a human-like screener and compare it to its algorithmic counterpart, where the human-like screener is conceived to be inconsistent over time due to fatigue. In particular, our analysis shows that the ISO, under a human-like screener solving the good-$k$ problem, hinders individual fairness despite meeting group-level fairness, and hampers the optimality of the selected $k$ candidates. This is due to position bias, where a candidate's evaluation is affected by its position within the ISO. We report extensive simulated experiments exploring the parameters of the best-$k$ and good-$k$ problems for the algorithmic and human-like screeners. The simulation framework is flexible enough to account for multiple screening settings, offering an alternative to running real-world candidate screening procedures. This work is motivated by a real-world candidate screening problem studied in collaboration with a European company.
+
+
+
+
+ + ♻ ☆ Enabling Local Editing in Diffusion Models by Joint and Individual + Component Analysis BMVC2024 + + +
+ Recent advances in Diffusion Models (DMs) have led to significant progress in visual synthesis and editing tasks, establishing them as a strong competitor to Generative Adversarial Networks (GANs). However, the latent space of DMs is not as well understood as that of GANs. Recent research has focused on unsupervised semantic discovery in the latent space of DMs by leveraging the bottleneck layer of the denoising network, which has been shown to exhibit properties of a semantic latent space. However, these approaches are limited to discovering global attributes. In this paper, we address the challenge of local image manipulation in DMs and introduce an unsupervised method to factorize the latent semantics learned by the denoising network of pre-trained DMs. Given an arbitrary image and defined regions of interest, we utilize the Jacobian of the denoising network to establish a relation between the regions of interest and their corresponding subspaces in the latent space. Furthermore, we disentangle the joint and individual components of these subspaces to identify latent directions that enable local image manipulation. Once discovered, these directions can be applied to different images to produce semantically consistent edits, making our method suitable for practical applications. Experimental results on various datasets demonstrate that our method can produce semantic edits that are more localized and have better fidelity compared to the state of the art.
+
+ comment: Accepted at BMVC2024 +
+
+
+
+
+ + ♻ ☆ Autonomous Payload Thermal Control SP + + +
+ In small satellites there is less room for heat control equipment, scientific +instruments, and electronic components. Furthermore, the near proximity of +electronic components makes power dissipation difficult, with the risk of not +being able to control the temperature appropriately, reducing component +lifetime and mission performance. To address this challenge, taking advantage +of the advent of increasing intelligence on board satellites, an autonomous +thermal control tool that uses deep reinforcement learning is proposed for +learning the thermal control policy onboard. The tool was evaluated in a real +space edge processing computer that will be used in a demonstration payload +hosted in the International Space Station (ISS). The experiment results show +that the proposed framework is able to learn to control the payload processing +power to maintain the temperature under operational ranges, complementing +traditional thermal control systems. + +
+
+ comment: To be included in the proceedings of ESA's SPAICE conference at + ECSAT, UK, 2024 +
+
+
+
+
+ + ♻ ☆ Fast Robust Kernel Regression through Sign Gradient Descent with Early + Stopping + + +
+ Kernel ridge regression, KRR, is a generalization of linear ridge regression that is non-linear in the data, but linear in the model parameters. Here, we introduce an equivalent formulation of the objective function of KRR, which opens up both for replacing the ridge penalty with the $\ell_\infty$ and $\ell_1$ penalties and for studying kernel ridge regression from the perspective of gradient descent. Using the $\ell_\infty$ and $\ell_1$ penalties, we obtain robust and sparse kernel regression, respectively. We further study the similarities between explicitly regularized kernel regression and the solutions obtained by early stopping of iterative gradient-based methods, where we connect $\ell_\infty$ regularization to sign gradient descent, $\ell_1$ regularization to forward stagewise regression (also known as coordinate descent), and $\ell_2$ regularization to gradient descent, and, in the last case, theoretically bound the differences. We exploit the close relations between $\ell_\infty$ regularization and sign gradient descent, and between $\ell_1$ regularization and coordinate descent, to propose computationally efficient methods for robust and sparse kernel regression. We finally compare robust kernel regression through sign gradient descent to existing methods for robust kernel regression on five real data sets, demonstrating that our method is one to two orders of magnitude faster, without compromising accuracy.
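<p>A minimal NumPy sketch of the robust variant discussed above: fit kernel regression coefficients with sign gradient descent (the sign of the gradient of the squared-error objective) and stop early on a validation set, so that a few gross outliers do not dominate the fit. The RBF kernel, learning rate, and stopping rule are illustrative choices, not the paper's algorithm as published.</p>
<pre><code>
# Hedged sketch: kernel regression fitted with sign gradient descent plus
# early stopping on a validation set; toy data with injected outliers.
import numpy as np

def rbf_kernel(A, B, gamma=1.0):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(80, 1))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=80)
y[::10] += 5.0                                   # a few gross outliers

K = rbf_kernel(X, X)
alpha, lr = np.zeros(80), 1e-3
X_val = rng.uniform(-3, 3, size=(40, 1))
y_val = np.sin(X_val[:, 0])
K_val = rbf_kernel(X_val, X)

best_err, best_alpha, patience = np.inf, alpha.copy(), 0
for step in range(5000):
    residual = y - K @ alpha
    alpha += lr * np.sign(K @ residual)          # sign of the descent direction
    err = np.mean((K_val @ alpha - y_val) ** 2)
    if err < best_err:
        best_err, best_alpha, patience = err, alpha.copy(), 0
    else:
        patience += 1
        if patience > 200:                       # early stopping
            break
print("validation MSE at early stop:", round(best_err, 4))
</code></pre>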
+
+ comment: Article arXiv:2306.16838v1 has been updated and split into two + articles: this article and arXiv:2311.01762. Thus, some of the content in + arXiv:2306.16838v1 is not a part of arXiv:2306.16838v2, but of + arXiv:2311.01762 +
+
+
+
+
+ + ♻ ☆ Simplifying the Theory on Over-Smoothing + + +
+ Graph convolutions have gained popularity due to their ability to efficiently operate on data with an irregular geometric structure. However, graph convolutions cause over-smoothing, which refers to representations becoming more similar with increased depth. Yet many different definitions and intuitions currently coexist, leading to research efforts focusing on incompatible directions. This paper attempts to align these directions by showing that over-smoothing is merely a special case of power iteration. This greatly simplifies the existing theory on over-smoothing, making it more accessible. Based on the theory, we provide a novel comprehensive definition of rank collapse as a generalized form of over-smoothing and introduce the rank-one distance as a corresponding metric. Our empirical evaluation of 14 commonly used methods shows that more models than were previously known suffer from this issue.
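<p>As a small, hedged illustration of the power-iteration view and the proposed metric as we read it, the NumPy snippet below measures how close propagated node features are to a rank-one matrix (normalised Frobenius distance to the best rank-one approximation); repeated propagation with a row-normalised adjacency drives this distance towards zero. The exact definition of the rank-one distance in the paper may differ from this sketch.</p>
<pre><code>
# Hedged sketch: repeated graph propagation as power iteration, tracked by a
# distance to the nearest rank-one matrix (an assumed reading of the metric).
import numpy as np

def rank_one_distance(X):
    U, S, Vt = np.linalg.svd(X, full_matrices=False)
    X1 = S[0] * np.outer(U[:, 0], Vt[0])         # best rank-one approximation
    return np.linalg.norm(X - X1) / (np.linalg.norm(X) + 1e-12)

rng = np.random.default_rng(0)
A = rng.random((50, 50)); A = (A + A.T) > 1.0    # random symmetric adjacency
A_hat = A / A.sum(1, keepdims=True)              # row-normalised propagation matrix
X = rng.normal(size=(50, 16))                    # initial node features

for layer in range(0, 31, 10):
    X_prop = np.linalg.matrix_power(A_hat, layer) @ X
    print(f"after {layer:2d} propagations: rank-one distance = "
          f"{rank_one_distance(X_prop):.4f}")
</code></pre>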
+
+
+
+
+ + ♻ ☆ BadMerging: Backdoor Attacks Against Model Merging CCS + + +
+ Fine-tuning pre-trained models for downstream tasks has led to a +proliferation of open-sourced task-specific models. Recently, Model Merging +(MM) has emerged as an effective approach to facilitate knowledge transfer +among these independently fine-tuned models. MM directly combines multiple +fine-tuned task-specific models into a merged model without additional +training, and the resulting model shows enhanced capabilities in multiple +tasks. Although MM provides great utility, it may come with security risks +because an adversary can exploit MM to affect multiple downstream tasks. +However, the security risks of MM have barely been studied. In this paper, we +first find that MM, as a new learning paradigm, introduces unique challenges +for existing backdoor attacks due to the merging process. To address these +challenges, we introduce BadMerging, the first backdoor attack specifically +designed for MM. Notably, BadMerging allows an adversary to compromise the +entire merged model by contributing as few as one backdoored task-specific +model. BadMerging comprises a two-stage attack mechanism and a novel +feature-interpolation-based loss to enhance the robustness of embedded +backdoors against the changes of different merging parameters. Considering that +a merged model may incorporate tasks from different domains, BadMerging can +jointly compromise the tasks provided by the adversary (on-task attack) and +other contributors (off-task attack) and solve the corresponding unique +challenges with novel attack designs. Extensive experiments show that +BadMerging achieves remarkable attacks against various MM algorithms. Our +ablation study demonstrates that the proposed attack designs can progressively +contribute to the attack performance. Finally, we show that prior defense +mechanisms fail to defend against our attacks, highlighting the need for more +advanced defense. + +
+
+ comment: To appear in ACM Conference on Computer and Communications Security + (CCS), 2024 +
+
+
+
+
+ + ♻ ☆ OriGen:Enhancing RTL Code Generation with Code-to-Code Augmentation and + Self-Reflection + + +
+ Recent studies have demonstrated the significant potential of Large Language Models (LLMs) in generating Register Transfer Level (RTL) code, with notable advancements showcased by commercial models such as GPT-4 and Claude3-Opus. However, these proprietary LLMs often raise concerns regarding privacy and security. While open-source LLMs offer solutions to these concerns, they typically underperform commercial models in RTL code generation tasks, primarily due to the scarcity of high-quality open-source RTL datasets. To address this challenge, we introduce OriGen, a fully open-source framework that incorporates self-reflection capabilities and a novel dataset augmentation methodology for generating high-quality, large-scale RTL code. Our approach employs a code-to-code augmentation technique to enhance the quality of open-source RTL code datasets. Furthermore, OriGen can rectify syntactic errors through a self-reflection process that leverages compiler feedback. Experimental results demonstrate that OriGen significantly outperforms other open-source alternatives in RTL code generation. It surpasses the previous best-performing open-source LLM by 12.8% and even exceeds GPT-4 Turbo in the pass@1 metric on the VerilogEval-Human benchmark. Moreover, OriGen exhibits superior capabilities in self-reflection and error correction, outperforming GPT-4 by 19.9% on a benchmark designed to evaluate self-reflection capabilities.
+
+
+
+
+ + ♻ ☆ Biometrics and Behavior Analysis for Detecting Distractions in + e-Learning + + +
+ In this article, we explore computer vision approaches to detect abnormal
+head pose during e-learning sessions, and we introduce a study on the effects of
+mobile phone usage during these sessions. We utilize behavioral data collected
+from 120 learners monitored while participating in MOOC learning sessions.
+Our study focuses on the influence of phone-usage events on behavior and
+physiological responses, specifically attention, heart rate, and meditation,
+before, during, and after phone usage. Additionally, we propose an approach for
+estimating head pose events using images taken by the webcam during the MOOC
+learning sessions to detect phone-usage events. Our hypothesis suggests that
+head posture undergoes significant changes when learners interact with a mobile
+phone, contrasting with the typical behavior seen when learners face a computer
+during e-learning sessions. We propose an approach designed to detect
+deviations in head posture from the average observed during a learner's
+session, operating as a semi-supervised method. This system flags events
+indicating alterations in head posture for subsequent human review and
+selection of mobile phone usage occurrences, with a sensitivity over 90%.
+
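A minimal sketch of the per-learner deviation idea (flag frames whose head-pose angles drift far from that learner's own session average) might look like the following; the pose representation and the 3-sigma threshold are assumptions for illustration, not the paper's exact setup.

```python
# Flag frames whose head pose deviates strongly from the learner's session
# average; the 3-sigma threshold and yaw/pitch/roll representation are
# illustrative assumptions.
import numpy as np

def flag_pose_events(yaw_pitch_roll, n_sigmas=3.0):
    """yaw_pitch_roll: (n_frames, 3) head-pose angles estimated from webcam images."""
    poses = np.asarray(yaw_pitch_roll, dtype=float)
    mean, std = poses.mean(axis=0), poses.std(axis=0) + 1e-8
    z = np.abs(poses - mean) / std
    return np.where((z > n_sigmas).any(axis=1))[0]  # candidate phone-usage frames

# Flagged frame indices would then go to a human reviewer, matching the
# semi-supervised workflow described above.
```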
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ VAAD: Visual Attention Analysis Dashboard applied to e-Learning + + +
+ In this paper, we present an approach in the Multimodal Learning Analytics +field. Within this approach, we have developed a tool to visualize and analyze +eye movement data collected during learning sessions in online courses. The +tool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These +eye movement data have been gathered using an eye-tracker and subsequently +processed and visualized for interpretation. The purpose of the tool is to +conduct a descriptive analysis of the data by facilitating its visualization, +enabling the identification of differences and learning patterns among various +learner populations. Additionally, it integrates a predictive module capable of +anticipating learner activities during a learning session. Consequently, VAAD +holds the potential to offer valuable insights into online learning behaviors +from both descriptive and predictive perspectives. + +
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ From Static to Dynamic Structures: Improving Binding Affinity Prediction + with Graph-Based Deep Learning + + +
+ Accurate prediction of protein-ligand binding affinities is an essential
+challenge in structure-based drug design. Despite recent advances in
+data-driven methods for affinity prediction, their accuracy is still limited,
+partially because they only take advantage of static crystal structures while
+the actual binding affinities are generally determined by the thermodynamic
+ensembles between proteins and ligands. One effective way to approximate such a
+thermodynamic ensemble is to use molecular dynamics (MD) simulation. Here, an
+MD dataset containing 3,218 different protein-ligand complexes is curated, and
+Dynaformer, a graph-based deep learning model, is further developed to predict
+the binding affinities by learning the geometric characteristics of the
+protein-ligand interactions from the MD trajectories. In silico experiments
+demonstrated that the model exhibits state-of-the-art scoring and ranking power
+on the CASF-2016 benchmark dataset, outperforming the methods hitherto
+reported. Moreover, in a virtual screening on heat shock protein 90 (HSP90)
+using Dynaformer, 20 candidates are identified and their binding affinities are
+further experimentally validated. Dynaformer displayed promising results in
+virtual drug screening, revealing 12 hit compounds (two are in the
+submicromolar range), including several novel scaffolds. Overall, these results
+demonstrated that the approach offers a promising avenue for accelerating the
+early drug discovery process.
+
+
+ comment: Update the content according to the published version on Advanced + Science (https://doi.org/10.1002/advs.202405404) +
+
+
+
+
+ + ♻ ☆ Directly Handling Missing Data in Linear Discriminant Analysis for + Enhancing Classification Accuracy and Interpretability + + +
+ As the adoption of Artificial Intelligence (AI) models expands into critical +real-world applications, ensuring the explainability of these models becomes +paramount, particularly in sensitive fields such as medicine and finance. +Linear Discriminant Analysis (LDA) remains a popular choice for classification +due to its interpretable nature, derived from its capacity to model class +distributions and enhance class separation through linear combinations of +features. However, real-world datasets often suffer from incomplete data, +posing substantial challenges for both classification accuracy and model +interpretability. In this paper, we introduce a novel and robust classification +method, termed Weighted missing Linear Discriminant Analysis (WLDA), which +extends LDA to handle datasets with missing values without the need for +imputation. Our approach innovatively incorporates a weight matrix that +penalizes missing entries, thereby refining parameter estimation directly on +incomplete data. This methodology not only preserves the interpretability of +LDA but also significantly enhances classification performance in scenarios +plagued by missing data. We conduct an in-depth theoretical analysis to +establish the properties of WLDA and thoroughly evaluate its explainability. +Experimental results across various datasets demonstrate that WLDA consistently +outperforms traditional methods, especially in challenging environments where +missing values are prevalent in both training and test datasets. This +advancement provides a critical tool for improving classification accuracy and +maintaining model transparency in the face of incomplete data. + +
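As a rough sketch of estimating class statistics directly on incomplete data (the spirit of the approach above, though the exact weight matrix used by WLDA is not given in the abstract), per-class means can be computed over observed entries only:

```python
# Illustration only: per-class feature means computed over observed (non-NaN)
# entries, i.e., weight 1 for observed values and 0 for missing ones. WLDA's
# actual weighting scheme may differ.
import numpy as np

def masked_class_means(X, y):
    """X: (n_samples, n_features) with NaN for missing entries; y: class labels."""
    means = {}
    for c in np.unique(y):
        Xc = X[y == c]
        obs = ~np.isnan(Xc)                        # observation mask
        counts = np.maximum(obs.sum(axis=0), 1)    # avoid division by zero
        means[c] = np.where(obs, Xc, 0.0).sum(axis=0) / counts
    return means
```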
+
+
+
+
+ + ♻ ☆ ERATTA: Extreme RAG for Table To Answers with Large Language Models + + +
+ Large language models (LLMs) with retrieval-augmented generation (RAG) have
+been the optimal choice for scalable generative AI solutions in the recent
+past. Although RAG implemented with AI agents (agentic-RAG) has been recently
+popularized, it suffers from unstable costs and unreliable performance for
+Enterprise-level data practices. Most existing use-cases that incorporate RAG
+with LLMs have been either generic or extremely domain specific, thereby
+questioning the scalability and generalizability of RAG-LLM approaches. In this
+work, we propose a unique LLM-based system where multiple LLMs can be invoked
+to enable data authentication, user-query routing, data-retrieval and custom
+prompting for question-answering capabilities from Enterprise-data tables. The
+source tables here are highly fluctuating and large in size, and the proposed
+framework enables structured responses in under 10 seconds per query.
+Additionally, we propose a five-metric scoring module that detects and reports
+hallucinations in the LLM responses. Our proposed system and scoring metrics
+achieve >90% confidence scores across hundreds of user queries in the
+sustainability, financial health and social media domains. Extensions to the
+proposed extreme RAG architectures can enable heterogeneous source querying
+using LLMs.
+
+
+ comment: 5 pages, 4 tables, IEEE Big Data, 2024 +
+
+
+
+
+ + ♻ ☆ Disease Classification and Impact of Pretrained Deep Convolution Neural + Networks on Diverse Medical Imaging Datasets across Imaging Modalities + + +
+ Imaging techniques such as Chest X-rays, whole slide images, and optical
+coherence tomography serve as initial screening and detection tools for a wide
+variety of pulmonary and ophthalmic conditions. This paper
+investigates the intricacies of using pretrained deep convolutional neural
+networks with transfer learning across diverse medical imaging datasets with
+varying modalities for binary and multiclass classification. We conducted a
+comprehensive performance analysis with ten network architectures and model
+families, each with pretraining and random initialization. Our findings showed
+that the use of pretrained models as fixed feature extractors yields poor
+performance irrespective of the datasets. In contrast, histopathology microscopy
+whole slide images have better performance. It is also found that deeper and
+more complex architectures did not necessarily result in the best performance.
+This observation implies that improvements on ImageNet do not transfer in
+parallel to the medical imaging tasks. Within a medical domain, the performance
+of the network architectures varies within model families with shifts in datasets.
+This indicates that the performance of models within a specific modality may
+not be conclusive for another modality within the same domain. This study
+provides a deeper understanding of the applications of deep learning techniques
+in medical imaging and highlights the impact of pretrained networks across
+different medical imaging datasets under five different experimental settings.
+
+
+ comment: 15 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator + + +
+ We present Piecewise Rectified Flow (PeRFlow), a flow-based method for +accelerating diffusion models. PeRFlow divides the sampling process of +generative flows into several time windows and straightens the trajectories in +each interval via the reflow operation, thereby approaching piecewise linear +flows. PeRFlow achieves superior performance in a few-step generation. +Moreover, through dedicated parameterizations, the PeRFlow models inherit +knowledge from the pretrained diffusion models. Thus, the training converges +fast and the obtained models show advantageous transfer ability, serving as +universal plug-and-play accelerators that are compatible with various workflows +based on the pre-trained diffusion models. Codes for training and inference are +publicly released. https://github.com/magic-research/piecewise-rectified-flow + +
+
+
+
+
+ + ♻ ☆ Instant Adversarial Purification with Adversarial Consistency + Distillation + + +
+ Neural networks, despite their remarkable performance in widespread
+applications, including image classification, are also known to be vulnerable
+to subtle adversarial noise. Although some diffusion-based purification methods
+have been proposed, for example, DiffPure, those methods are time-consuming. In
+this paper, we propose One Step Control Purification (OSCP), a diffusion-based
+purification model that can purify the adversarial image in one Neural Function
+Evaluation (NFE) in diffusion models. We use Latent Consistency Model (LCM) and
+ControlNet for our one-step purification. OSCP is computationally friendly and
+time-efficient compared to other diffusion-based purification methods; we
+achieve a defense success rate of 74.19\% on ImageNet, only requiring 0.1s for
+each purification. Moreover, there is a fundamental incongruence between
+consistency distillation and adversarial perturbation. To address this
+ontological dissonance, we propose Gaussian Adversarial Noise Distillation
+(GAND), a novel consistency distillation framework that facilitates a more
+nuanced reconciliation of the latent space dynamics, effectively bridging the
+natural and adversarial manifolds. Our experiments show that GAND does not
+need a Full Fine Tune (FFT); PEFT, e.g., LoRA, is sufficient.
+
+
+
+
+
+ + ♻ ☆ MLR-Copilot: Autonomous Machine Learning Research based on Large + Language Models Agents + + +
+ Machine learning research, crucial for technological advancements and
+innovation, often faces significant challenges due to its inherent complexity,
+slow pace of experimentation, and the necessity for specialized expertise.
+Motivated by this, we present a new systematic framework, autonomous Machine
+Learning Research with large language models (MLR-Copilot), designed to enhance
+machine learning research productivity through the automatic generation and
+implementation of research ideas using Large Language Model (LLM) agents. The
+framework consists of three phases: research idea generation, experiment
+implementation, and implementation execution. First, existing research papers
+are used to generate hypotheses and experimental plans via IdeaAgent powered by
+LLMs. Next, the implementation generation phase translates these plans into
+executables with ExperimentAgent. This phase leverages retrieved prototype code
+and optionally retrieves candidate models and data. Finally, the execution
+phase, also managed by ExperimentAgent, involves running experiments with
+mechanisms for human feedback and iterative debugging to enhance the likelihood
+of achieving executable research outcomes. We evaluate our framework on five
+machine learning research tasks and the experimental results show the
+framework's potential to facilitate research progress and innovation.
+
+
+
+
+
+ + ♻ ☆ A Grey-box Attack against Latent Diffusion Model-based Image Editing by + Posterior Collapse + + +
+ Recent advancements in generative AI, particularly Latent Diffusion Models
+(LDMs), have revolutionized image synthesis and manipulation. However, these
+generative techniques raise concerns about data misappropriation and
+intellectual property infringement. Adversarial attacks on machine learning
+models have been extensively studied, and a well-established body of research
+has extended these techniques as a benign metric to prevent the underlying
+misuse of generative AI. Current approaches to safeguarding images from
+manipulation by LDMs are limited by their reliance on model-specific knowledge
+and their inability to significantly degrade the semantic quality of generated
+images. In response to these shortcomings, we propose the Posterior Collapse
+Attack (PCA) based on the observation that VAEs suffer from posterior collapse
+during training. Our method minimizes dependence on the white-box information
+of target models to get rid of the implicit reliance on model-specific
+knowledge. By accessing only a small subset of the LDM parameters, specifically
+the VAE encoder of LDMs, our method causes a substantial semantic
+collapse in generation quality, particularly in perceptual consistency, and
+demonstrates strong transferability across various model architectures.
+Experimental results show that PCA achieves superior perturbation effects on
+image generation of LDMs with lower runtime and VRAM. Our method outperforms
+existing techniques, offering a more robust and generalizable solution that is
+helpful in alleviating the socio-technical challenges posed by the rapidly
+evolving landscape of generative AI.
+
+
+ comment: 21 pages, 7 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Pearl: A Production-ready Reinforcement Learning Agent + + +
+ Reinforcement learning (RL) is a versatile framework for optimizing long-term +goals. Although many real-world problems can be formalized with RL, learning +and deploying a performant RL policy requires a system designed to address +several important challenges, including the exploration-exploitation dilemma, +partial observability, dynamic action spaces, and safety concerns. While the +importance of these challenges has been well recognized, existing open-source +RL libraries do not explicitly address them. This paper introduces Pearl, a +Production-Ready RL software package designed to embrace these challenges in a +modular way. In addition to presenting benchmarking results, we also highlight +examples of Pearl's ongoing industry adoption to demonstrate its advantages for +production use cases. Pearl is open sourced on GitHub at +github.com/facebookresearch/pearl and its official website is +pearlagent.github.io. + +
+
+
+
+
+ + ♻ ☆ Fault Tolerant ML: Efficient Meta-Aggregation and Synchronous Training + + +
+ In this paper, we investigate the challenging framework of Byzantine-robust
+training in distributed machine learning (ML) systems, focusing on enhancing
+both efficiency and practicality. As distributed ML systems become integral for
+complex ML tasks, ensuring resilience against Byzantine failures, where workers
+may contribute incorrect updates due to malice or error, gains paramount
+importance. Our first contribution is the introduction of the Centered Trimmed
+Meta Aggregator (CTMA), an efficient meta-aggregator that upgrades baseline
+aggregators to optimal performance levels, while requiring low computational
+demands. Additionally, we propose harnessing a recently developed gradient
+estimation technique based on a double-momentum strategy within the Byzantine
+context. Our paper highlights its theoretical and practical advantages for
+Byzantine-robust training, especially in simplifying the tuning process and
+reducing the reliance on numerous hyperparameters. The effectiveness of this
+technique is supported by theoretical insights within the stochastic convex
+optimization (SCO) framework and corroborated by empirical evidence.
+
+
+
+
+
+ + ♻ ☆ An Idiosyncrasy of Time-discretization in Reinforcement Learning + + +
+ Many reinforcement learning algorithms are built on an assumption that an +agent interacts with an environment over fixed-duration, discrete time steps. +However, physical systems are continuous in time, requiring a choice of +time-discretization granularity when digitally controlling them. Furthermore, +such systems do not wait for decisions to be made before advancing the +environment state, necessitating the study of how the choice of discretization +may affect a reinforcement learning algorithm. In this work, we consider the +relationship between the definitions of the continuous-time and discrete-time +returns. Specifically, we acknowledge an idiosyncrasy with naively applying a +discrete-time algorithm to a discretized continuous-time environment, and note +how a simple modification can better align the return definitions. This +observation is of practical consideration when dealing with environments where +time-discretization granularity is a choice, or situations where such +granularity is inherently stochastic. + +
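One standard way to see the mismatch, and a common alignment (not necessarily the paper's exact modification), is to treat the discrete-time return as a Riemann sum of the continuous-time discounted return:

```latex
% Continuous-time discounted return vs. its time-discretized approximation
% with step size \Delta t (one common alignment, stated here as an assumption):
\[
  G_{\text{cont}} \;=\; \int_{0}^{\infty} \gamma^{t}\, r(t)\, dt
  \;\approx\;
  \sum_{k=0}^{\infty} \gamma^{k \Delta t}\, r_{k}\, \Delta t \;=\; G_{\Delta t}.
\]
% Naively reusing a discrete-time algorithm instead corresponds to
% $\sum_{k} \gamma^{k} r_{k}$, whose effective horizon silently changes with the
% chosen discretization granularity $\Delta t$.
```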
+
+ comment: RLC 2024 +
+
+
+
+
+ + ♻ ☆ On the Benefits of Public Representations for Private Transfer Learning + under Distribution Shift + + +
+ Public pretraining is a promising approach to improve differentially private +model training. However, recent work has noted that many positive research +results studying this paradigm only consider in-distribution tasks, and may not +apply to settings where there is distribution shift between the pretraining and +finetuning data -- a scenario that is likely when finetuning private tasks due +to the sensitive nature of the data. In this work, we show empirically across +three tasks that even in settings with large distribution shift, where both +zero-shot performance from public data and training from scratch with private +data give unusably weak results, public features can in fact improve private +training accuracy by up to 67\% over private training from scratch. We provide +a theoretical explanation for this phenomenon, showing that if the public and +private data share a low-dimensional representation, public representations can +improve the sample complexity of private training even if it is impossible to +learn the private task from the public data alone. Altogether, our results +provide evidence that public data can indeed make private training practical in +realistic settings of extreme distribution shift. + +
+
+
+
+
+ + ♻ ☆ From Wide to Deep: Dimension Lifting Network for Parameter-efficient + Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE) that maps entities and relations into vector
+representations is essential for downstream applications. Conventional KGE
+methods require high-dimensional representations to learn the complex structure
+of a knowledge graph, but this leads to oversized model parameters. Recent advances
+reduce parameters by low-dimensional entity representations, while developing
+techniques (e.g., knowledge distillation or reinvented representation forms) to
+compensate for the reduced dimension. However, such operations introduce
+complicated computations and model designs that may not benefit large knowledge
+graphs. To seek a simple strategy to improve the parameter efficiency of
+conventional KGE models, we take inspiration from the fact that deeper neural
+networks require exponentially fewer parameters to achieve expressiveness comparable to
+wider networks for compositional structures. We view all entity representations
+as a single-layer embedding network, and conventional KGE methods that adopt
+high-dimensional entity representations amount to widening the embedding network to
+gain expressiveness. To achieve parameter efficiency, we instead propose a
+deeper embedding network for entity representations, i.e., a narrow entity
+embedding layer plus a multi-layer dimension lifting network (LiftNet).
+Experiments on three public datasets show that by integrating LiftNet, four
+conventional KGE methods with 16-dimensional representations achieve link
+prediction accuracy comparable to original models that adopt 512-dimensional
+representations, saving 68.4% to 96.9% of parameters.
+
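A minimal sketch of the wide-to-deep idea follows, with layer sizes and activations chosen purely for illustration rather than taken from LiftNet's actual configuration:

```python
# A narrow (e.g., 16-d) entity embedding followed by a small lifting network
# that outputs the dimension expected by a conventional KGE scorer. Sizes and
# activations are illustrative assumptions.
import torch.nn as nn

class LiftedEntityEmbedding(nn.Module):
    def __init__(self, num_entities, narrow_dim=16, lifted_dim=512, hidden=64):
        super().__init__()
        self.embed = nn.Embedding(num_entities, narrow_dim)  # narrow layer
        self.lift = nn.Sequential(                            # dimension lifting
            nn.Linear(narrow_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, lifted_dim),
        )

    def forward(self, entity_ids):
        # The lifted vectors would feed an ordinary scorer (TransE, DistMult, ...).
        return self.lift(self.embed(entity_ids))
```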
+
+
+
+
+ + ♻ ☆ TrajDeleter: Enabling Trajectory Forgetting in Offline Reinforcement + Learning Agents NDSS 2025 + + +
+ Reinforcement learning (RL) trains an agent from experiences interacting with
+the environment. In scenarios where online interactions are impractical,
+offline RL, which trains the agent using pre-collected datasets, has become
+popular. While this new paradigm presents remarkable effectiveness across
+various real-world domains, like healthcare and energy management, there is a
+growing demand to enable agents to rapidly and completely eliminate the
+influence of specific trajectories from both the training dataset and the
+trained agents. To address this problem, this paper advocates Trajdeleter, the
+first practical approach to trajectory unlearning for offline RL agents. The
+key idea of Trajdeleter is to guide the agent to demonstrate deteriorating
+performance when it encounters states associated with unlearning trajectories.
+Simultaneously, it ensures the agent maintains its original performance level
+when facing other remaining trajectories. Additionally, we introduce
+Trajauditor, a simple yet efficient method to evaluate whether Trajdeleter
+successfully eliminates the influence of the specific trajectories from the offline
+RL agent. Extensive experiments conducted on six offline RL algorithms and
+three tasks demonstrate that Trajdeleter requires only about 1.5% of the time
+needed for retraining from scratch. It effectively unlearns an average of 94.8%
+of the targeted trajectories yet still performs well in actual environment
+interactions after unlearning. The replication package and agent parameters are
+available online.
+
+
+ comment: Accepted at NDSS 2025. The presented document here is the full + version of our paper +
+
+
+
+
+ + ♻ ☆ Blending Neural Operators and Relaxation Methods in PDE Numerical + Solvers + + +
+ Neural networks suffer from spectral bias, having difficulty in representing
+the high-frequency components of a function, while relaxation methods can
+resolve high frequencies efficiently but stall at moderate to low frequencies.
+We exploit the weaknesses of the two approaches by combining them
+synergistically to develop a fast numerical solver of partial differential
+equations (PDEs) at scale. Specifically, we propose HINTS, a hybrid, iterative,
+numerical, and transferable solver built by integrating a Deep Operator Network
+(DeepONet) with standard relaxation methods, leading to parallel efficiency and
+algorithmic scalability for a wide class of PDEs, not tractable with existing
+monolithic solvers. HINTS balances the convergence behavior across the spectrum
+of eigenmodes by utilizing the spectral bias of DeepONet, resulting in a
+uniform convergence rate and hence exceptional performance of the hybrid solver
+overall. Moreover, HINTS applies to large-scale, multidimensional systems, and it
+is flexible with regard to discretizations, computational domains, and boundary
+conditions.
+
+
+ comment: Main text: 17 pages, 6 figures. Supplementary Information: 30 pages, + 8 figures, 2 tables, 4 algorithms +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Spectron: Target Speaker Extraction using Conditional Transformer with + Adversarial Refinement + + +
+ Recently, attention-based transformers have become a de facto standard in
+many deep learning applications, including natural language processing, computer
+vision, signal processing, etc. In this paper, we propose a transformer-based
+end-to-end model to extract a target speaker's speech from a monaural
+multi-speaker mixed audio signal. Unlike existing speaker extraction methods,
+we introduce two additional objectives to impose speaker embedding consistency
+and waveform encoder invertibility, and jointly train both the speaker encoder and
+speech separator to better capture the speaker conditional embedding.
+Furthermore, we leverage a multi-scale discriminator to refine the perceptual
+quality of the extracted speech. Our experiments show that the use of a
+dual-path transformer in the separator backbone along with the proposed training
+paradigm improves the CNN baseline by $3.12$ dB points. Finally, we compare our
+approach with recent state-of-the-art methods and show that our model outperforms
+existing methods by $4.1$ dB points on average without creating additional
+data dependency.
+
+
+
+
+
+ + ☆ Multi-Reference Generative Face Video Compression with Contrastive + Learning + + +
+ Generative face video coding (GFVC) has been demonstrated as a potential
+approach to low-latency, low-bitrate video conferencing. GFVC frameworks
+achieve an extreme gain in coding efficiency with over 70% bitrate savings when
+compared to conventional codecs at bitrates below 10 kbps. In recent MPEG/JVET
+standardization efforts, all the information required to reconstruct video
+sequences using GFVC frameworks is adopted as part of the supplemental
+enhancement information (SEI) in existing compression pipelines. In light of
+this development, we aim to address a challenge that has been weakly addressed
+in prior GFVC frameworks, i.e., reconstruction drift as the distance between
+the reference and target frames increases. This challenge creates the need to
+update the reference buffer more frequently by transmitting more Intra-refresh
+frames, which are the most expensive element of the GFVC bitstream. To overcome
+this problem, we instead propose multiple-reference animation as a robust
+approach to minimizing reconstruction drift, especially when used in a
+bi-directional prediction mode. Further, we propose a contrastive learning
+formulation for multi-reference animation. We observe that using a contrastive
+learning framework enhances the representation capabilities of the animation
+generator. The resulting framework, MRDAC (Multi-Reference Deep Animation
+Codec), can therefore be used to compress longer sequences with fewer reference
+frames or achieve a significant gain in reconstruction accuracy at bitrates
+comparable to previous frameworks. Quantitative and qualitative results show
+significant coding and reconstruction quality gains compared to previous GFVC
+methods, and more accurate animation quality in the presence of large pose and
+facial expression changes.
+
+
+
+
+
+ + ☆ Interpretable Convolutional SyncNet + + +
+ Because videos in the wild can be out of sync for various reasons, a sync-net +is used to bring the video back into sync for tasks that require synchronized +videos. Previous state-of-the-art (SOTA) sync-nets use InfoNCE loss, rely on +the transformer architecture, or both. Unfortunately, the former makes the +model's output difficult to interpret, and the latter is unfriendly with large +images, thus limiting the usefulness of sync-nets. In this work, we train a +convolutional sync-net using the balanced BCE loss (BBCE), a loss inspired by +the binary cross entropy (BCE) and the InfoNCE losses. In contrast to the +InfoNCE loss, the BBCE loss does not require complicated sampling schemes. Our +model can better handle larger images, and its output can be given a +probabilistic interpretation. The probabilistic interpretation allows us to +define metrics such as probability at offset and offscreen ratio to evaluate +the sync quality of audio-visual (AV) speech datasets. Furthermore, our model +achieves SOTA accuracy of $96.5\%$ on the LRS2 dataset and $93.8\%$ on the LRS3 +dataset. + +
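The abstract does not spell the loss out; one plausible balanced-BCE formulation (an assumption, the paper's BBCE may differ in detail) averages the BCE over in-sync and off-sync pairs separately so both terms contribute equally:

```python
# One plausible reading of a "balanced BCE" for audio-visual sync (assumption,
# not the paper's verbatim loss): equal-weight the positive and negative terms.
import torch
import torch.nn.functional as F

def balanced_bce(sync_logits_pos, sync_logits_neg):
    """sync_logits_pos: logits for in-sync AV pairs; sync_logits_neg: off-sync pairs."""
    pos = F.binary_cross_entropy_with_logits(
        sync_logits_pos, torch.ones_like(sync_logits_pos))
    neg = F.binary_cross_entropy_with_logits(
        sync_logits_neg, torch.zeros_like(sync_logits_neg))
    return 0.5 * (pos + neg)

# A sigmoid over the logits then gives each audio-video offset a probability of
# being in sync, which is what enables metrics such as "probability at offset".
```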
+
+ comment: 8+5 pages +
+
+
+
+
+ + ♻ ☆ Inter-Frame Compression for Dynamic Point Cloud Geometry Coding + + +
+ Efficient point cloud compression is essential for applications like virtual +and mixed reality, autonomous driving, and cultural heritage. This paper +proposes a deep learning-based inter-frame encoding scheme for dynamic point +cloud geometry compression. We propose a lossy geometry compression scheme that +predicts the latent representation of the current frame using the previous +frame by employing a novel feature space inter-prediction network. The proposed +network utilizes sparse convolutions with hierarchical multiscale 3D feature +learning to encode the current frame using the previous frame. The proposed +method introduces a novel predictor network for motion compensation in the +feature domain to map the latent representation of the previous frame to the +coordinates of the current frame to predict the current frame's feature +embedding. The framework transmits the residual of the predicted features and +the actual features by compressing them using a learned probabilistic +factorized entropy model. At the receiver, the decoder hierarchically +reconstructs the current frame by progressively rescaling the feature +embedding. The proposed framework is compared to the state-of-the-art +Video-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud +Compression (G-PCC) schemes standardized by the Moving Picture Experts Group +(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta +Rate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against +G-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame +encoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based +inter-frame encoding mode using HEVC. These significant performance gains are +cross-checked and verified in the MPEG working group. + +
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+ + ♻ ☆ Show Me the World in My Language: Establishing the First Baseline for + Scene-Text to Scene-Text Translation ICPR 2024 + + +
+ In this work, we study the task of ``visually'' translating scene text from a +source language (e.g., Hindi) to a target language (e.g., English). Visual +translation involves not just the recognition and translation of scene text but +also the generation of the translated image that preserves visual features of +the source scene text, such as font, size, and background. There are several +challenges associated with this task, such as translation with limited context, +deciding between translation and transliteration, accommodating varying text +lengths within fixed spatial boundaries, and preserving the font and background +styles of the source scene text in the target language. To address this +problem, we make the following contributions: (i) We study visual translation +as a standalone problem for the first time in the literature. (ii) We present a +cascaded framework for visual translation that combines state-of-the-art +modules for scene text recognition, machine translation, and scene text +synthesis as a baseline for the task. (iii) We propose a set of task-specific +design enhancements to design a variant of the baseline to obtain performance +improvements. (iv) Currently, the existing related literature lacks any +comprehensive performance evaluation for this novel task. To fill this gap, we +introduce several automatic and user-assisted evaluation metrics designed +explicitly for evaluating visual translation. Further, we evaluate presented +baselines for translating scene text between Hindi and English. Our experiments +demonstrate that although we can effectively perform visual translation over a +large collection of scene text images, the presented baseline only partially +addresses challenges posed by visual translation tasks. We firmly believe that +this new task and the limitations of existing models, as reported in this +paper, should encourage further research in visual translation. + +
+
+ comment: Accepted at ICPR 2024, Project Website: + https://vl2g.github.io/projects/visTrans/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 19 + +
+
+
+ + ♻ ☆ CMAT: A Multi-Agent Collaboration Tuning Framework for Enhancing Small + Language Models + + +
+ Open large language models (LLMs) have significantly advanced the field of
+natural language processing, showcasing impressive performance across various
+tasks. Despite the significant advancements in LLMs, their effective operation
+still relies heavily on human input to accurately guide the dialogue flow, with
+agent tuning being a crucial optimization technique that involves human
+adjustments to the model for better response to such guidance. Addressing this
+dependency, our work introduces the TinyAgent model, trained on a meticulously
+curated high-quality dataset. We also present the Collaborative Multi-Agent
+Tuning (CMAT) framework, an innovative system designed to augment language
+agent capabilities through adaptive weight updates based on environmental
+feedback. This framework fosters collaborative learning and real-time
+adaptation among multiple intelligent agents, enhancing their context-awareness
+and long-term memory. In this research, we propose a new communication agent
+framework that integrates multi-agent systems with environmental feedback
+mechanisms, offering a scalable method to explore cooperative behaviors.
+Notably, our TinyAgent-7B model exhibits performance on par with GPT-3.5,
+despite having fewer parameters, signifying a substantial improvement in the
+efficiency and effectiveness of LLMs.
+
+
+
+
+
+ + ♻ ☆ Dynamic Boundary Time Warping for Sub-sequence Matching with Few + Examples + + +
+ The paper presents a novel method of finding a fragment in a long temporal
+sequence that is similar to a given set of shorter sequences. We are the first to propose
+an algorithm for such a search that does not rely on computing the average
+sequence from query examples. Instead, we use query examples as is, utilizing
+all of them simultaneously. The introduced method, based on the Dynamic Time
+Warping (DTW) technique, is explicitly suited for few-shot query-by-example
+retrieval tasks. We evaluate it on two different few-shot problems from the
+field of Natural Language Processing. The results show it either outperforms
+baselines and previous approaches or achieves comparable results when a low
+number of examples is available.
+
+
+
+
+
+ + ♻ ☆ REBEL: Reinforcement Learning via Regressing Relative Rewards + + +
+ While originally developed for continuous control problems, Proximal Policy +Optimization (PPO) has emerged as the work-horse of a variety of reinforcement +learning (RL) applications, including the fine-tuning of generative models. +Unfortunately, PPO requires multiple heuristics to enable stable convergence +(e.g. value networks, clipping), and is notorious for its sensitivity to the +precise implementation of these components. In response, we take a step back +and ask what a minimalist RL algorithm for the era of generative models would +look like. We propose REBEL, an algorithm that cleanly reduces the problem of +policy optimization to regressing the relative reward between two completions +to a prompt in terms of the policy, enabling strikingly lightweight +implementation. In theory, we prove that fundamental RL algorithms like Natural +Policy Gradient can be seen as variants of REBEL, which allows us to match the +strongest known theoretical guarantees in terms of convergence and sample +complexity in the RL literature. REBEL can also cleanly incorporate offline +data and be extended to handle the intransitive preferences we frequently see +in practice. Empirically, we find that REBEL provides a unified approach to +language modeling and image generation with stronger or similar performance as +PPO and DPO, all while being simpler to implement and more computationally +efficient than PPO. When fine-tuning Llama-3-8B-Instruct, REBEL achieves strong +performance in AlpacaEval 2.0, MT-Bench, and Open LLM Leaderboard. + +
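A hedged paraphrase of the reduction described above (consult the paper for the exact objective): regress the reward gap between two completions of the same prompt onto the policy's log-probability ratios, turning policy optimization into a least-squares problem:

```python
# Sketch of a relative-reward regression loss in the spirit of the abstract;
# this is a paraphrase for illustration, not the paper's verbatim objective.
def rebel_loss(logp_new_a, logp_old_a, logp_new_b, logp_old_b, r_a, r_b, eta=1.0):
    """Inputs are per-example tensors/arrays for two completions (a, b) of the
    same prompt: log-probs under the current and previous policies, and rewards."""
    pred_gap = (1.0 / eta) * ((logp_new_a - logp_old_a) - (logp_new_b - logp_old_b))
    return ((pred_gap - (r_a - r_b)) ** 2).mean()  # plain squared-error regression
```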
+
+ comment: New experimental results on general chat +
+
+
+
+
+ + ♻ ☆ MedFuzz: Exploring the Robustness of Large Language Models in Medical + Question Answering + + +
+ Large language models (LLM) have achieved impressive performance on medical +question-answering benchmarks. However, high benchmark accuracy does not imply +that the performance generalizes to real-world clinical settings. Medical +question-answering benchmarks rely on assumptions consistent with quantifying +LLM performance but that may not hold in the open world of the clinic. Yet LLMs +learn broad knowledge that can help the LLM generalize to practical conditions +regardless of unrealistic assumptions in celebrated benchmarks. We seek to +quantify how well LLM medical question-answering benchmark performance +generalizes when benchmark assumptions are violated. Specifically, we present +an adversarial method that we call MedFuzz (for medical fuzzing). MedFuzz +attempts to modify benchmark questions in ways aimed at confounding the LLM. We +demonstrate the approach by targeting strong assumptions about patient +characteristics presented in the MedQA benchmark. Successful "attacks" modify a +benchmark item in ways that would be unlikely to fool a medical expert but +nonetheless "trick" the LLM into changing from a correct to an incorrect +answer. Further, we present a permutation test technique that can ensure a +successful attack is statistically significant. We show how to use performance +on a "MedFuzzed" benchmark, as well as individual successful attacks. The +methods show promise at providing insights into the ability of an LLM to +operate robustly in more realistic settings. + +
+
+ comment: 9 pages, 3 figures, 2 algorithms, appendix +
+
+
+
+
+ + ♻ ☆ Forecasting Live Chat Intent from Browsing History CIKM 2024 + + +
+ Customers reach out to online live chat agents with various intents, such as +asking about product details or requesting a return. In this paper, we propose +the problem of predicting user intent from browsing history and address it +through a two-stage approach. The first stage classifies a user's browsing +history into high-level intent categories. Here, we represent each browsing +history as a text sequence of page attributes and use the ground-truth class +labels to fine-tune pretrained Transformers. The second stage provides a large +language model (LLM) with the browsing history and predicted intent class to +generate fine-grained intents. For automatic evaluation, we use a separate LLM +to judge the similarity between generated and ground-truth intents, which +closely aligns with human judgments. Our two-stage approach yields significant +performance gains compared to generating intents without the classification +stage. + +
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Greedy Grammar Induction with Indirect Negative Evidence + + +
+ This paper offers a fresh look at the pumping lemma constant as an upper +bound on the information required for learning Context Free Grammars. An +objective function based on indirect negative evidence considers the +occurrences, and non-occurrences, of a finite number of strings, encountered +after a sufficiently long presentation. This function has optimal substructure +in the hypotheses space, giving rise to a greedy search learner in a branch and +bound method. A hierarchy of learnable classes is defined in terms of the +number of production rules that must be added to interim solutions in order to +incrementally fit the input. Efficiency strongly depends on the position of the +target grammar in the hierarchy and on the richness of the input. + +
+
+ comment: 12 pages (including references), 1 png files. 5 anciliary files + (dataset) +
+
+
+
+
+ + ♻ ☆ LongRAG: Enhancing Retrieval-Augmented Generation with Long-context LLMs + + +
+ In traditional RAG framework, the basic retrieval units are normally short. +The common retrievers like DPR normally work with 100-word Wikipedia +paragraphs. Such a design forces the retriever to search over a large corpus to +find the `needle' unit. In contrast, the readers only need to generate answers +from the short retrieved units. The imbalanced `heavy' retriever and `light' +reader design can lead to sub-optimal performance. The loss of contextual +information in the short, chunked units may increase the likelihood of +introducing hard negatives during the retrieval stage. Additionally, the reader +might not fully leverage the capabilities of recent advancements in LLMs. In +order to alleviate the imbalance, we propose a new framework LongRAG, +consisting of a `long retriever' and a `long reader'. In the two +Wikipedia-based datasets, NQ and HotpotQA, LongRAG processes the entire +Wikipedia corpus into 4K-token units by grouping related documents. By +increasing the unit size, we significantly reduce the total number of units. +This greatly reduces the burden on the retriever, resulting in strong retrieval +performance with only a few (less than 8) top units. Without requiring any +training, LongRAG achieves an EM of 62.7% on NQ and 64.3% on HotpotQA, which +are on par with the (fully-trained) SoTA model. Furthermore, we test on two +non-Wikipedia-based datasets, Qasper and MultiFieldQA-en. LongRAG processes +each individual document as a single (long) unit rather than chunking them into +smaller units. By doing so, we achieve an F1 score of 25.9% on Qasper and 57.5% +on MultiFieldQA-en. Our study offers insights into the future roadmap for +combining RAG with long-context LLMs. + +
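A simple sketch of forming long retrieval units by packing related documents up to a token budget follows; the grouping heuristic and tokenizer are assumptions for illustration, not LongRAG's exact pipeline:

```python
# Pack related documents into ~4K-token retrieval units (illustrative only).
def pack_into_units(docs, count_tokens, budget=4096):
    """docs: iterable of (doc_id, text), ordered so related documents are adjacent.
    count_tokens: callable returning the token count of a text."""
    units, current, used = [], [], 0
    for doc_id, text in docs:
        n = count_tokens(text)
        if current and used + n > budget:
            units.append(current)       # close the current unit
            current, used = [], 0
        current.append(doc_id)
        used += n
    if current:
        units.append(current)
    return units  # each inner list becomes one long retrieval unit
```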
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Med-MoE: Mixture of Domain-Specific Experts for Lightweight Medical + Vision-Language Models + + +
+ Recent advancements in general-purpose or domain-specific multimodal large +language models (LLMs) have witnessed remarkable progress for medical +decision-making. However, they are designated for specific classification or +generative tasks, and require model training or finetuning on large-scale +datasets with sizeable parameters and tremendous computing, hindering their +clinical utility across diverse resource-constrained scenarios in practice. In +this paper, we propose a novel and lightweight framework Med-MoE +(Mixture-of-Experts) that tackles both discriminative and generative multimodal +medical tasks. The learning of Med-MoE consists of three steps: multimodal +medical alignment, instruction tuning and routing, and domain-specific MoE +tuning. After aligning multimodal medical images with LLM tokens, we then +enable the model for different multimodal medical tasks with instruction +tuning, together with a trainable router tailored for expert selection across +input modalities. Finally, the model is tuned by integrating the router with +multiple domain-specific experts, which are selectively activated and further +empowered by meta expert. Comprehensive experiments on both open- and close-end +medical question answering (Med-VQA) and image classification tasks across +datasets such as VQA-RAD, SLAKE and Path-VQA demonstrate that our model can +achieve performance superior to or on par with state-of-the-art baselines, +while only requiring approximately 30\%-50\% of activated model parameters. +Extensive analysis and ablations corroborate the effectiveness and practical +utility of our method. + +
+
+
+
+
+ + ♻ ☆ T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models + + +
+ The recent development of Sora leads to a new era in text-to-video (T2V) +generation. Along with this comes the rising concern about its security risks. +The generated videos may contain illegal or unethical content, and there is a +lack of comprehensive quantitative understanding of their safety, posing a +challenge to their reliability and practical deployment. Previous evaluations +primarily focus on the quality of video generation. While some evaluations of +text-to-image models have considered safety, they cover fewer aspects and do +not address the unique temporal risk inherent in video generation. To bridge +this research gap, we introduce T2VSafetyBench, a new benchmark designed for +conducting safety-critical assessments of text-to-video models. We define 12 +critical aspects of video generation safety and construct a malicious prompt +dataset including real-world prompts, LLM-generated prompts and jailbreak +attack-based prompts. Based on our evaluation results, we draw several +important findings, including: 1) no single model excels in all aspects, with +different models showing various strengths; 2) the correlation between GPT-4 +assessments and manual reviews is generally high; 3) there is a trade-off +between the usability and safety of text-to-video generative models. This +indicates that as the field of video generation rapidly advances, safety risks +are set to surge, highlighting the urgency of prioritizing video safety. We +hope that T2VSafetyBench can provide insights for better understanding the +safety of video generation in the era of generative AI. + +
+
+
+
+
+ + ♻ ☆ CancerLLM: A Large Language Model in Cancer Domain + + +
+ Medical Large Language Models (LLMs) such as ClinicalCamel 70B and
+Llama3-OpenBioLLM 70B have demonstrated impressive performance on a wide
+variety of medical NLP tasks. However, there is still no large language model
+(LLM) specifically designed for the cancer domain. Moreover, these LLMs typically
+have billions of parameters, making them computationally expensive for
+healthcare systems. Thus, in this study, we propose CancerLLM, a model with 7
+billion parameters and a Mistral-style architecture, pre-trained on 2,676,642
+clinical notes and 515,524 pathology reports covering 17 cancer types, followed
+by fine-tuning on three cancer-relevant tasks, including cancer phenotype
+extraction and cancer diagnosis generation. Our evaluation demonstrated that
+CancerLLM achieves state-of-the-art results compared to other existing LLMs,
+with an average F1 score improvement of 7.61%. Additionally, CancerLLM
+outperforms other models on two proposed robustness testbeds. This illustrates
+that CancerLLM can be effectively applied to clinical AI systems, enhancing
+clinical research and healthcare delivery in the field of cancer.
+
+
+ comment: add the diagnosis evaluation of ICD code +
+
+
+
+
+ + ♻ ☆ MLRegTest: A Benchmark for the Machine Learning of Regular Languages + + +
+ Synthetic datasets constructed from formal languages allow fine-grained +examination of the learning and generalization capabilities of machine learning +systems for sequence classification. This article presents a new benchmark for +machine learning systems on sequence classification called MLRegTest, which +contains training, development, and test sets from 1,800 regular languages. +Different kinds of formal languages represent different kinds of long-distance +dependencies, and correctly identifying long-distance dependencies in sequences +is a known challenge for ML systems to generalize successfully. MLRegTest +organizes its languages according to their logical complexity (monadic second +order, first order, propositional, or monomial expressions) and the kind of +logical literals (string, tier-string, subsequence, or combinations thereof). +The logical complexity and choice of literal provides a systematic way to +understand different kinds of long-distance dependencies in regular languages, +and therefore to understand the capacities of different ML systems to learn +such long-distance dependencies. Finally, the performance of different neural +networks (simple RNN, LSTM, GRU, transformer) on MLRegTest is examined. The +main conclusion is that performance depends significantly on the kind of test +set, the class of language, and the neural network architecture. + +
+
+ comment: Accepted for publication in the Journal of Machine Learning Research. + Dataset available at https://doi.org/10.5061/dryad.dncjsxm4h , code available + at https://github.com/heinz-jeffrey/subregular-learning +
+
+
+
+
+ + ♻ ☆ The Oscars of AI Theater: A Survey on Role-Playing with Language Models + + +
+ This survey explores the burgeoning field of role-playing with language +models, focusing on their development from early persona-based models to +advanced character-driven simulations facilitated by Large Language Models +(LLMs). Initially confined to simple persona consistency due to limited model +capabilities, role-playing tasks have now expanded to embrace complex character +portrayals involving character consistency, behavioral alignment, and overall +attractiveness. We provide a comprehensive taxonomy of the critical components +in designing these systems, including data, models and alignment, agent +architecture and evaluation. This survey not only outlines the current +methodologies and challenges, such as managing dynamic personal profiles and +achieving high-level persona consistency but also suggests avenues for future +research in improving the depth and realism of role-playing applications. The +goal is to guide future research by offering a structured overview of current +methodologies and identifying potential areas for improvement. Related +resources and papers are available at +https://github.com/nuochenpku/Awesome-Role-Play-Papers. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Ex3: Automatic Novel Writing by Extracting, Excelsior and Expanding + + +
+ Generating long-term texts such as novels using artificial intelligence has
+always been a challenge. A common approach is to use large language models
+(LLMs) to construct a hierarchical framework that first plans and then writes.
+Despite the fact that the generated novels reach a sufficient length, they
+exhibit poor logical coherence and appeal in their plots and deficiencies in
+character and event depiction, ultimately compromising the overall narrative
+quality. In this paper, we propose a method named Extracting, Excelsior and
+Expanding (Ex3). Ex3 initially extracts structure information from raw novel data. By
+combining this structure information with the novel data, an
+instruction-following dataset is meticulously crafted. This dataset is then
+utilized to fine-tune the LLM, aiming for excelsior generation performance. In
+the final stage, a tree-like expansion method is deployed to facilitate the
+generation of arbitrarily long novels. Evaluation against previous methods
+showcases Ex3's ability to produce higher-quality long-form novels.
+
+
+
+
+
+ + ♻ ☆ Tur[k]ingBench: A Challenge Benchmark for Web Agents + + +
+ Can advanced multi-modal models effectively tackle complex web-based tasks? +Such tasks are often found on crowdsourcing platforms, where crowdworkers +engage in challenging micro-tasks within web-based environments. + Building on this idea, we present TurkingBench, a benchmark consisting of +tasks presented as web pages with textual instructions and multi-modal +contexts. Unlike previous approaches that rely on artificially synthesized web +pages, our benchmark uses natural HTML pages originally designed for +crowdsourcing workers to perform various annotation tasks. Each task's HTML +instructions are instantiated with different values derived from crowdsourcing +tasks, creating diverse instances. This benchmark includes 32.2K instances +spread across 158 tasks. + To support the evaluation of TurkingBench, we have developed a framework that +links chatbot responses to actions on web pages (e.g., modifying a text box, +selecting a radio button). We assess the performance of cutting-edge private +and open-source models, including language-only and vision-language models +(such as GPT4 and InternVL), on this benchmark. Our results show that while +these models outperform random chance, there is still significant room for +improvement. We hope that this benchmark will drive progress in the evaluation +and development of web-based agents. + +
+
+
+
+
+ + ♻ ☆ ReMamba: Equip Mamba with Effective Long-Sequence Modeling + + +
+ While the Mamba architecture demonstrates superior inference efficiency and
+competitive performance on short-context natural language processing (NLP)
+tasks, empirical evidence suggests its capacity to comprehend long contexts is
+limited compared to transformer-based models. In this study, we investigate the
+long-context efficiency issues of the Mamba models and propose ReMamba, which
+enhances Mamba's ability to comprehend long contexts. ReMamba incorporates
+selective compression and adaptation techniques within a two-stage re-forward
+process, incurring minimal additional inference cost overhead. Experimental
+results on the LongBench and L-Eval benchmarks demonstrate ReMamba's efficacy,
+improving over the baselines by 3.2 and 1.6 points, respectively, and attaining
+performance almost on par with same-size transformer models.
+
+
+
+
+
+ + ♻ ☆ Editing Personality for Large Language Models NLPCC 2024 + + +
+ This paper introduces an innovative task focused on editing the personality +traits of Large Language Models (LLMs). This task seeks to adjust the models' +responses to opinion-related questions on specified topics since an +individual's personality often manifests in the form of their expressed +opinions, thereby showcasing different personality traits. Specifically, we +construct PersonalityEdit, a new benchmark dataset to address this task. +Drawing on the theory in Social Psychology, we isolate three representative +traits, namely Neuroticism, Extraversion, and Agreeableness, as the foundation +for our benchmark. We then gather data using GPT-4, generating responses that +align with a specified topic and embody the targeted personality trait. We +conduct comprehensive experiments involving various baselines and discuss the +representation of personality behavior in LLMs. Our findings uncover potential +challenges of the proposed task, illustrating several remaining issues. We +anticipate that our work can stimulate further annotation in model editing and +personality-related research. Code is available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: NLPCC 2024 +
+
+
+
+
+ + ♻ ☆ Amplifying Training Data Exposure through Fine-Tuning with + Pseudo-Labeled Memberships + + +
+ Neural language models (LMs) are vulnerable to training data extraction +attacks due to data memorization. This paper introduces a novel attack scenario +wherein an attacker adversarially fine-tunes pre-trained LMs to amplify the +exposure of the original training data. This strategy differs from prior +studies by aiming to intensify the LM's retention of its pre-training dataset. +To achieve this, the attacker needs to collect generated texts that are closely +aligned with the pre-training data. However, without knowledge of the actual +dataset, quantifying the amount of pre-training data within generated texts is +challenging. To address this, we propose the use of pseudo-labels for these +generated texts, leveraging membership approximations indicated by +machine-generated probabilities from the target LM. We subsequently fine-tune +the LM to favor generations with higher likelihoods of originating from the +pre-training data, based on their membership probabilities. Our empirical +findings indicate a remarkable outcome: LMs with over 1B parameters exhibit a +four to eight-fold increase in training data exposure. We discuss potential +mitigations and suggest future research directions. + +
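As a rough illustration of the membership-approximation step described above (not the authors' implementation), the sketch below scores generated texts by their likelihood under a target LM and pseudo-labels the most confidently modeled ones as probable pre-training members; the model choice (GPT-2) and the median-based threshold are arbitrary stand-ins.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def perplexity(text, model, tokenizer, device="cpu"):
    # Perplexity of the text under the target LM (lower = more "member-like").
    enc = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])
    return torch.exp(out.loss).item()

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

generated_texts = ["an example generated sentence.", "another sampled continuation."]
scores = [perplexity(t, model, tokenizer) for t in generated_texts]

# Pseudo-label: texts the LM assigns unusually high likelihood (low perplexity)
# are treated as more likely to echo pre-training data; the threshold is arbitrary here.
threshold = sorted(scores)[len(scores) // 2]
pseudo_labels = [int(s <= threshold) for s in scores]
print(list(zip(generated_texts, scores, pseudo_labels)))
```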
+
+ comment: 20 pages, 6 figures, 15 tables +
+
+
+
+
+ + ♻ ☆ BEADs: Bias Evaluation Across Domains + + +
+ Recent advancements in large language models (LLMs) have greatly enhanced
+natural language processing (NLP) applications. Nevertheless, these models
+often inherit biases from their training data. Despite the availability of
+various datasets, most are limited to one or two NLP tasks (typically
+classification or evaluation) and lack comprehensive evaluations across a
+broader range of NLP tasks. To address this gap, we introduce the Bias
+Evaluations Across Domains (BEADs) dataset, designed to support a wide array of
+NLP tasks, including text classification, token classification, bias
+quantification, and benign language generation. A key focus of this paper is
+the gold label subset of BEADs, an important portion of the data verified by
+experts to ensure high reliability. BEADs provides data for both fine-tuning,
+including classification and language generation tasks, and for evaluating
+LLMs. Our findings indicate that BEADs effectively identifies numerous biases
+when models are fine-tuned on this dataset. It also reduces biases when used
+for fine-tuning on the language generation task, while preserving language
+quality. The results also reveal some prevalent demographic biases in LLMs when
+BEADs is used for evaluation in demographic tasks. The benchmarking results
+highlight the efficacy of fine-tuning LLMs for bias identification and the
+necessity of comprehensive bias evaluation. We make BEADs publicly available to
+promote more responsible AI development. The dataset can be accessed at
+https://huggingface.co/datasets/shainar/BEAD .
 + 
&#x2F;p>
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ The AI Scientist: Towards Fully Automated Open-Ended Scientific + Discovery + + +
+ One of the grand challenges of artificial general intelligence is developing +agents capable of conducting scientific research and discovering new knowledge. +While frontier models have already been used as aides to human scientists, e.g. +for brainstorming ideas, writing code, or prediction tasks, they still conduct +only a small part of the scientific process. This paper presents the first +comprehensive framework for fully automatic scientific discovery, enabling +frontier large language models to perform research independently and +communicate their findings. We introduce The AI Scientist, which generates +novel research ideas, writes code, executes experiments, visualizes results, +describes its findings by writing a full scientific paper, and then runs a +simulated review process for evaluation. In principle, this process can be +repeated to iteratively develop ideas in an open-ended fashion, acting like the +human scientific community. We demonstrate its versatility by applying it to +three distinct subfields of machine learning: diffusion modeling, +transformer-based language modeling, and learning dynamics. Each idea is +implemented and developed into a full paper at a cost of less than $15 per +paper. To evaluate the generated papers, we design and validate an automated +reviewer, which we show achieves near-human performance in evaluating paper +scores. The AI Scientist can produce papers that exceed the acceptance +threshold at a top machine learning conference as judged by our automated +reviewer. This approach signifies the beginning of a new era in scientific +discovery in machine learning: bringing the transformative benefits of AI +agents to the entire research process of AI itself, and taking us closer to a +world where endless affordable creativity and innovation can be unleashed on +the world's most challenging problems. Our code is open-sourced at +https://github.com/SakanaAI/AI-Scientist + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 22 + +
+
+
+ + ♻ ☆ U-BEV: Height-aware Bird's-Eye-View Segmentation and Neural Map-based + Relocalization + + +
+ Efficient relocalization is essential for intelligent vehicles when GPS +reception is insufficient or sensor-based localization fails. Recent advances +in Bird's-Eye-View (BEV) segmentation allow for accurate estimation of local +scene appearance and in turn, can benefit the relocalization of the vehicle. +However, one downside of BEV methods is the heavy computation required to +leverage the geometric constraints. This paper presents U-BEV, a U-Net inspired +architecture that extends the current state-of-the-art by allowing the BEV to +reason about the scene on multiple height layers before flattening the BEV +features. We show that this extension boosts the performance of the U-BEV by up +to 4.11 IoU. Additionally, we combine the encoded neural BEV with a +differentiable template matcher to perform relocalization on neural SD-map +data. The model is fully end-to-end trainable and outperforms transformer-based +BEV methods of similar computational complexity by 1.7 to 2.8 mIoU and +BEV-based relocalization by over 26% Recall Accuracy on the nuScenes dataset. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ REBEL: Reinforcement Learning via Regressing Relative Rewards + + +
+ While originally developed for continuous control problems, Proximal Policy +Optimization (PPO) has emerged as the work-horse of a variety of reinforcement +learning (RL) applications, including the fine-tuning of generative models. +Unfortunately, PPO requires multiple heuristics to enable stable convergence +(e.g. value networks, clipping), and is notorious for its sensitivity to the +precise implementation of these components. In response, we take a step back +and ask what a minimalist RL algorithm for the era of generative models would +look like. We propose REBEL, an algorithm that cleanly reduces the problem of +policy optimization to regressing the relative reward between two completions +to a prompt in terms of the policy, enabling strikingly lightweight +implementation. In theory, we prove that fundamental RL algorithms like Natural +Policy Gradient can be seen as variants of REBEL, which allows us to match the +strongest known theoretical guarantees in terms of convergence and sample +complexity in the RL literature. REBEL can also cleanly incorporate offline +data and be extended to handle the intransitive preferences we frequently see +in practice. Empirically, we find that REBEL provides a unified approach to +language modeling and image generation with stronger or similar performance as +PPO and DPO, all while being simpler to implement and more computationally +efficient than PPO. When fine-tuning Llama-3-8B-Instruct, REBEL achieves strong +performance in AlpacaEval 2.0, MT-Bench, and Open LLM Leaderboard. + +
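A minimal sketch of the regression step the abstract describes (reducing policy optimization to least squares on relative rewards between two completions of the same prompt); the tensor names and the eta hyperparameter are illustrative assumptions, not the released implementation.

```python
import torch

def rebel_loss(logp_new_1, logp_old_1, logp_new_2, logp_old_2,
               reward_1, reward_2, eta=1.0):
    """Regress the policy log-ratio difference between two completions of the
    same prompt onto their (scaled) reward difference."""
    pred = (logp_new_1 - logp_old_1) - (logp_new_2 - logp_old_2)
    target = (reward_1 - reward_2) / eta
    return torch.mean((pred - target) ** 2)

# Toy usage with per-completion sequence log-probabilities (hypothetical values).
logp_new_1 = torch.tensor([-12.3, -8.1], requires_grad=True)
logp_old_1 = torch.tensor([-12.0, -8.4])
logp_new_2 = torch.tensor([-11.0, -9.5], requires_grad=True)
logp_old_2 = torch.tensor([-11.2, -9.1])
r1, r2 = torch.tensor([1.0, 0.2]), torch.tensor([0.3, 0.7])

loss = rebel_loss(logp_new_1, logp_old_1, logp_new_2, logp_old_2, r1, r2)
loss.backward()
```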
+
+ comment: New experimental results on general chat +
+
+
+
+
+ + ♻ ☆ GRACE: Graph-Regularized Attentive Convolutional Entanglement with + Laplacian Smoothing for Robust DeepFake Video Detection + + +
+ As DeepFake video manipulation techniques escalate, posing profound threats,
+the urgent need to develop efficient detection strategies is underscored.
+However, one particular issue lies with facial images being mis-detected, often
+originating from degraded videos or adversarial attacks, leading to unexpected
+temporal artifacts that can undermine the efficacy of DeepFake video detection
+techniques. This paper introduces a novel method for robust DeepFake video
+detection, harnessing the power of the proposed Graph-Regularized Attentive
+Convolutional Entanglement (GRACE) based on the graph convolutional network
+with graph Laplacian to address the aforementioned challenges. First,
+conventional Convolutional Neural Networks are deployed to extract
+spatiotemporal features for the entire video. Then, the spatial and temporal
+features are mutually entangled by constructing a graph with a sparsity
+constraint, enforcing that the essential features of valid face images are
+retained in the noisy face sequences, thus augmenting stability and performance
+for DeepFake video detection. Furthermore, a Graph Laplacian prior is introduced
+into the graph convolutional network to remove the noise pattern in the feature
+space and further improve the performance. Comprehensive experiments are
+conducted to illustrate that our proposed method delivers state-of-the-art
+performance in DeepFake video detection under noisy face sequences. The source
+code is available at https://github.com/ming053l/GRACE.
 + 
&#x2F;p>
+
+ comment: Submitted to TPAMI 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Driving Behavior for Autonomous Vehicles Based on Gramian + Angular Field Vision Transformer + + +
+ Effective classification of autonomous vehicle (AV) driving behavior emerges +as a critical area for diagnosing AV operation faults, enhancing autonomous +driving algorithms, and reducing accident rates. This paper presents the +Gramian Angular Field Vision Transformer (GAF-ViT) model, designed to analyze +AV driving behavior. The proposed GAF-ViT model consists of three key +components: GAF Transformer Module, Channel Attention Module, and Multi-Channel +ViT Module. These modules collectively convert representative sequences of +multivariate behavior into multi-channel images and employ image recognition +techniques for behavior classification. A channel attention mechanism is +applied to multi-channel images to discern the impact of various driving +behavior features. Experimental evaluation on the Waymo Open Dataset of +trajectories demonstrates that the proposed model achieves state-of-the-art +performance. Furthermore, an ablation study effectively substantiates the +efficacy of individual modules within the model. + +
+
+
+
+
+ + ♻ ☆ Med-MoE: Mixture of Domain-Specific Experts for Lightweight Medical + Vision-Language Models + + +
+ Recent advancements in general-purpose or domain-specific multimodal large
+language models (LLMs) have witnessed remarkable progress for medical
+decision-making. However, they are designed for specific classification or
+generative tasks, and require model training or finetuning on large-scale
+datasets with sizeable parameters and tremendous computing, hindering their
+clinical utility across diverse resource-constrained scenarios in practice. In
+this paper, we propose a novel and lightweight framework Med-MoE
+(Mixture-of-Experts) that tackles both discriminative and generative multimodal
+medical tasks. The learning of Med-MoE consists of three steps: multimodal
+medical alignment, instruction tuning and routing, and domain-specific MoE
+tuning. After aligning multimodal medical images with LLM tokens, we then
+enable the model for different multimodal medical tasks with instruction
+tuning, together with a trainable router tailored for expert selection across
+input modalities. Finally, the model is tuned by integrating the router with
+multiple domain-specific experts, which are selectively activated and further
+empowered by a meta expert. Comprehensive experiments on both open- and
+closed-ended medical question answering (Med-VQA) and image classification
+tasks across datasets such as VQA-RAD, SLAKE and Path-VQA demonstrate that our
+model can achieve performance superior to or on par with state-of-the-art
+baselines, while only requiring approximately 30\%-50\% of activated model
+parameters. Extensive analysis and ablations corroborate the effectiveness and
+practical utility of our method.
 + 
&#x2F;p>
+
+
+
+
+ + ♻ ☆ T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models + + +
+ The recent development of Sora leads to a new era in text-to-video (T2V) +generation. Along with this comes the rising concern about its security risks. +The generated videos may contain illegal or unethical content, and there is a +lack of comprehensive quantitative understanding of their safety, posing a +challenge to their reliability and practical deployment. Previous evaluations +primarily focus on the quality of video generation. While some evaluations of +text-to-image models have considered safety, they cover fewer aspects and do +not address the unique temporal risk inherent in video generation. To bridge +this research gap, we introduce T2VSafetyBench, a new benchmark designed for +conducting safety-critical assessments of text-to-video models. We define 12 +critical aspects of video generation safety and construct a malicious prompt +dataset including real-world prompts, LLM-generated prompts and jailbreak +attack-based prompts. Based on our evaluation results, we draw several +important findings, including: 1) no single model excels in all aspects, with +different models showing various strengths; 2) the correlation between GPT-4 +assessments and manual reviews is generally high; 3) there is a trade-off +between the usability and safety of text-to-video generative models. This +indicates that as the field of video generation rapidly advances, safety risks +are set to surge, highlighting the urgency of prioritizing video safety. We +hope that T2VSafetyBench can provide insights for better understanding the +safety of video generation in the era of generative AI. + +
+
+
+
+
+ + ♻ ☆ Surgical-VQLA++: Adversarial Contrastive Learning for Calibrated Robust + Visual Question-Localized Answering in Robotic Surgery + + +
+ Medical visual question answering (VQA) bridges the gap between visual +information and clinical decision-making, enabling doctors to extract +understanding from clinical images and videos. In particular, surgical VQA can +enhance the interpretation of surgical data, aiding in accurate diagnoses, +effective education, and clinical interventions. However, the inability of VQA +models to visually indicate the regions of interest corresponding to the given +questions results in incomplete comprehension of the surgical scene. To tackle +this, we propose the surgical visual question localized-answering (VQLA) for +precise and context-aware responses to specific queries regarding surgical +images. Furthermore, to address the strong demand for safety in surgical +scenarios and potential corruptions in image acquisition and transmission, we +propose a novel approach called Calibrated Co-Attention Gated Vision-Language +(C$^2$G-ViL) embedding to integrate and align multimodal information +effectively. Additionally, we leverage the adversarial sample-based contrastive +learning strategy to boost our performance and robustness. We also extend our +EndoVis-18-VQLA and EndoVis-17-VQLA datasets to broaden the scope and +application of our data. Extensive experiments on the aforementioned datasets +demonstrate the remarkable performance and robustness of our solution. Our +solution can effectively combat real-world image corruption. Thus, our proposed +approach can serve as an effective tool for assisting surgical education, +patient care, and enhancing surgical outcomes. + +
+
+ comment: Accepted by Information Fusion. Code and data availability: + https://github.com/longbai1006/Surgical-VQLAPlus +
+
+
+
+
+ + ♻ ☆ CSAD: Unsupervised Component Segmentation for Logical Anomaly Detection + + +
+ To improve logical anomaly detection, some previous works have integrated +segmentation techniques with conventional anomaly detection methods. Although +these methods are effective, they frequently lead to unsatisfactory +segmentation results and require manual annotations. To address these +drawbacks, we develop an unsupervised component segmentation technique that +leverages foundation models to autonomously generate training labels for a +lightweight segmentation network without human labeling. Integrating this new +segmentation technique with our proposed Patch Histogram module and the +Local-Global Student-Teacher (LGST) module, we achieve a detection AUROC of +95.3% in the MVTec LOCO AD dataset, which surpasses previous SOTA methods. +Furthermore, our proposed method provides lower latency and higher throughput +than most existing approaches. + +
+
+
+
+
+ + ♻ ☆ Quantum Implicit Neural Representations + + +
+ Implicit neural representations have emerged as a powerful paradigm to +represent signals such as images and sounds. This approach aims to utilize +neural networks to parameterize the implicit function of the signal. However, +when representing implicit functions, traditional neural networks such as +ReLU-based multilayer perceptrons face challenges in accurately modeling +high-frequency components of signals. Recent research has begun to explore the +use of Fourier Neural Networks (FNNs) to overcome this limitation. In this +paper, we propose Quantum Implicit Representation Network (QIREN), a novel +quantum generalization of FNNs. Furthermore, through theoretical analysis, we +demonstrate that QIREN possesses a quantum advantage over classical FNNs. +Lastly, we conducted experiments in signal representation, image +superresolution, and image generation tasks to show the superior performance of +QIREN compared to state-of-the-art (SOTA) models. Our work not only +incorporates quantum advantages into implicit neural representations but also +uncovers a promising application direction for Quantum Neural Networks. + +
+
+ comment: This paper was accepted by icml 2024 +
+
+
+
+
+ + ♻ ☆ Generic Objects as Pose Probes for Few-Shot View Synthesis + + +
+ Radiance fields including NeRFs and 3D Gaussians demonstrate great potential +in high-fidelity rendering and scene reconstruction, while they require a +substantial number of posed images as inputs. COLMAP is frequently employed for +preprocessing to estimate poses, while it necessitates a large number of +feature matches to operate effectively, and it struggles with scenes +characterized by sparse features, large baselines between images, or a limited +number of input images. We aim to tackle few-view NeRF reconstruction using +only 3 to 6 unposed scene images. Traditional methods often use calibration +boards but they are not common in images. We propose a novel idea of utilizing +everyday objects, commonly found in both images and real life, as "pose +probes". The probe object is automatically segmented by SAM, whose shape is +initialized from a cube. We apply a dual-branch volume rendering optimization +(object NeRF and scene NeRF) to constrain the pose optimization and jointly +refine the geometry. Specifically, object poses of two views are first +estimated by PnP matching in an SDF representation, which serves as initial +poses. PnP matching, requiring only a few features, is suitable for +feature-sparse scenes. Additional views are incrementally incorporated to +refine poses from preceding views. In experiments, PoseProbe achieves +state-of-the-art performance in both pose estimation and novel view synthesis +across multiple datasets. We demonstrate its effectiveness, particularly in +few-view and large-baseline scenes where COLMAP struggles. In ablations, using +different objects in a scene yields comparable performance. Our project page is +available at: \href{https://zhirui-gao.github.io/PoseProbe.github.io/}{this +https URL} + +
+
+
+
+
+ + ♻ ☆ Pensieve: Retrospect-then-Compare Mitigates Visual Hallucination + + +
+ Multi-modal Large Language Models (MLLMs) demonstrate remarkable success
+across various vision-language tasks. However, they suffer from visual
+hallucination, where the generated responses diverge from the provided image.
+Are MLLMs oblivious to the accurate visual cues when they hallucinate? Our
+investigation reveals that the visual branch may equally advocate both accurate
+and erroneous content. To address this issue, we propose Pensieve, a
+training-free method that leverages the analogous visual hallucinations, which
+are induced by images sharing common semantic and appearance characteristics,
+to mitigate hallucination. Specifically, Pensieve enables MLLMs to retrospect
+relevant images as references and compare their visual content with the test
+image via confidence score subtraction. Moreover, our paradigm balances the
+effects of addressing errors from both the visual and textual branches by
+adaptively scaling the subtracted scores. Experiments on Whoops, LLaVA Bench,
+POPE, and MME demonstrate the efficacy of Pensieve in mitigating visual
+hallucination, surpassing other advanced decoding strategies. Pensieve also
+aids MLLMs in identifying visual details and enhancing the specificity of
+generated image descriptions.
 + 
&#x2F;p>
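The confidence-score subtraction can be pictured as a contrastive-decoding-style adjustment of next-token scores. The sketch below is a simplified version with a fixed scaling factor alpha; the paper's retrieval of reference images and its adaptive scaling are not reproduced here, and all tensors are placeholders.

```python
import torch

def subtract_reference_scores(logits_test, logits_refs, alpha=0.5):
    """Penalize tokens that are also confidently predicted for visually similar
    reference images (hallucination candidates) by subtracting the averaged
    reference logits from the test-image logits."""
    ref_mean = torch.stack(logits_refs, dim=0).mean(dim=0)
    return logits_test - alpha * ref_mean

vocab = 32000
logits_test = torch.randn(vocab)                      # logits given the test image
logits_refs = [torch.randn(vocab) for _ in range(4)]  # logits given reference images
adjusted = subtract_reference_scores(logits_test, logits_refs)
next_token = int(torch.argmax(adjusted))
```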
+
+
+
+
+ + ♻ ☆ Tur[k]ingBench: A Challenge Benchmark for Web Agents + + +
+ Can advanced multi-modal models effectively tackle complex web-based tasks? +Such tasks are often found on crowdsourcing platforms, where crowdworkers +engage in challenging micro-tasks within web-based environments. + Building on this idea, we present TurkingBench, a benchmark consisting of +tasks presented as web pages with textual instructions and multi-modal +contexts. Unlike previous approaches that rely on artificially synthesized web +pages, our benchmark uses natural HTML pages originally designed for +crowdsourcing workers to perform various annotation tasks. Each task's HTML +instructions are instantiated with different values derived from crowdsourcing +tasks, creating diverse instances. This benchmark includes 32.2K instances +spread across 158 tasks. + To support the evaluation of TurkingBench, we have developed a framework that +links chatbot responses to actions on web pages (e.g., modifying a text box, +selecting a radio button). We assess the performance of cutting-edge private +and open-source models, including language-only and vision-language models +(such as GPT4 and InternVL), on this benchmark. Our results show that while +these models outperform random chance, there is still significant room for +improvement. We hope that this benchmark will drive progress in the evaluation +and development of web-based agents. + +
+
+
+
+
+ + ♻ ☆ MBSS-T1: Model-Based Self-Supervised Motion Correction for Robust + Cardiac T1 Mapping + + +
+ T1 mapping is a valuable quantitative MRI technique for diagnosing diffuse +myocardial diseases. Traditional methods, relying on breath-hold sequences and +echo triggering, face challenges with patient compliance and arrhythmias, +limiting their effectiveness. Image registration can enable motion-robust T1 +mapping, but inherent intensity differences between time points pose a +challenge. We introduce MBSS-T1, a self-supervised model for motion correction +in cardiac T1 mapping, constrained by physical and anatomical principles. The +physical constraints ensure expected signal decay behavior, while the +anatomical constraints maintain realistic deformations. The unique combination +of these constraints ensures accurate T1 mapping along the longitudinal +relaxation axis. MBSS-T1 outperformed baseline deep-learning-based image +registration approaches in a 5-fold experiment on a public dataset of 210 +patients (STONE sequence) and an internal dataset of 19 patients (MOLLI +sequence). MBSS-T1 excelled in model fitting quality ($R^2$: 0.975 vs. 0.941, +0.946), anatomical alignment (Dice score: 0.89 vs. 0.84, 0.88), and expert +visual quality assessment for the presence of visible motion artifacts (4.33 +vs. 3.38, 3.66). MBSS-T1 has the potential to enable motion-robust T1 mapping +for a broader range of patients, overcoming challenges such as arrhythmias and +suboptimal compliance, and allowing for free-breathing T1 mapping without +requiring large training datasets. Our code will be publicly available upon +acceptance. + +
+
+
+
+
+ + ♻ ☆ Achieving Resolution-Agnostic DNN-based Image Watermarking: A Novel + Perspective of Implicit Neural Representation ACM MM'24 + + +
+ DNN-based watermarking methods are rapidly developing and delivering
+impressive performances. Recent advances achieve resolution-agnostic image
+watermarking by reducing the variant resolution watermarking problem to a fixed
+resolution watermarking problem. However, such a reduction process can
+potentially introduce artifacts and low robustness. To address this issue, we
+propose the first, to the best of our knowledge, Resolution-Agnostic Image
+WaterMarking (RAIMark) framework by watermarking the implicit neural
+representation (INR) of the image. Unlike previous methods, our method does not
+rely on the previous reduction process by directly watermarking the continuous
+signal instead of image pixels, thus achieving resolution-agnostic
+watermarking. Precisely, given an arbitrary-resolution image, we fit an INR for
+the target image. As a continuous signal, such an INR can be sampled to obtain
+images with variant resolutions. Then, we quickly fine-tune the fitted INR to
+get a watermarked INR conditioned on a binary secret message. A pre-trained
+watermark decoder extracts the hidden message from any sampled images with
+arbitrary resolutions. By directly watermarking the INR, we achieve
+resolution-agnostic watermarking with increased robustness. Extensive
+experiments show that our method outperforms previous methods with significant
+improvements: improving bit accuracy by 7%$\sim$29% on average. Notably, we
+observe that previous methods are vulnerable to at least one watermarking
+attack (e.g. JPEG, crop, resize), while ours is robust against all
+watermarking attacks.
 + 
&#x2F;p>
+
+ comment: Accepted by ACM MM'24 +
+
+
+
+
+ + ♻ ☆ NEDS-SLAM: A Neural Explicit Dense Semantic SLAM Framework using 3D + Gaussian Splatting + + +
+ We propose NEDS-SLAM, a dense semantic SLAM system based on 3D Gaussian +representation, that enables robust 3D semantic mapping, accurate camera +tracking, and high-quality rendering in real-time. In the system, we propose a +Spatially Consistent Feature Fusion model to reduce the effect of erroneous +estimates from pre-trained segmentation head on semantic reconstruction, +achieving robust 3D semantic Gaussian mapping. Additionally, we employ a +lightweight encoder-decoder to compress the high-dimensional semantic features +into a compact 3D Gaussian representation, mitigating the burden of excessive +memory consumption. Furthermore, we leverage the advantage of 3D Gaussian +splatting, which enables efficient and differentiable novel view rendering, and +propose a Virtual Camera View Pruning method to eliminate outlier gaussians, +thereby effectively enhancing the quality of scene representations. Our +NEDS-SLAM method demonstrates competitive performance over existing dense +semantic SLAM methods in terms of mapping and tracking accuracy on Replica and +ScanNet datasets, while also showing excellent capabilities in 3D dense +semantic mapping. + +
+
+ comment: accepted by RA-L, IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ CRS-Diff: Controllable Remote Sensing Image Generation with Diffusion + Model + + +
+ The emergence of generative models has revolutionized the field of remote +sensing (RS) image generation. Despite generating high-quality images, existing +methods are limited in relying mainly on text control conditions, and thus do +not always generate images accurately and stably. In this paper, we propose +CRS-Diff, a new RS generative framework specifically tailored for RS image +generation, leveraging the inherent advantages of diffusion models while +integrating more advanced control mechanisms. Specifically, CRS-Diff can +simultaneously support text-condition, metadata-condition, and image-condition +control inputs, thus enabling more precise control to refine the generation +process. To effectively integrate multiple condition control information, we +introduce a new conditional control mechanism to achieve multi-scale feature +fusion, thus enhancing the guiding effect of control conditions. To our +knowledge, CRS-Diff is the first multiple-condition controllable RS generative +model. Experimental results in single-condition and multiple-condition cases +have demonstrated the superior ability of our CRS-Diff to generate RS images +both quantitatively and qualitatively compared with previous methods. +Additionally, our CRS-Diff can serve as a data engine that generates +high-quality training data for downstream tasks, e.g., road extraction. The +code is available at https://github.com/Sonettoo/CRS-Diff. + +
+
+
+
+
+ + ♻ ☆ Training and Tuning Generative Neural Radiance Fields for + Attribute-Conditional 3D-Aware Face Generation + + +
+ Generative Neural Radiance Fields (GNeRF)-based 3D-aware GANs have showcased +remarkable prowess in crafting high-fidelity images while upholding robust 3D +consistency, particularly face generation. However, specific existing models +prioritize view consistency over disentanglement, leading to constrained +semantic or attribute control during the generation process. While many methods +have explored incorporating semantic masks or leveraging 3D Morphable Models +(3DMM) priors to imbue models with semantic control, these methods often demand +training from scratch, entailing significant computational overhead. In this +paper, we propose a novel approach: a conditional GNeRF model that integrates +specific attribute labels as input, thus amplifying the controllability and +disentanglement capabilities of 3D-aware generative models. Our approach builds +upon a pre-trained 3D-aware face model, and we introduce a Training as Init and +Optimizing for Tuning (TRIOT) method to train a conditional normalized flow +module to enable the facial attribute editing, then optimize the latent vector +to improve attribute-editing precision further. Our extensive experiments +substantiate the efficacy of our model, showcasing its ability to generate +high-quality edits with enhanced view consistency while safeguarding non-target +regions. The code for our model is publicly available at +https://github.com/zhangqianhui/TT-GNeRF. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ ICAL: Implicit Character-Aided Learning for Enhanced Handwritten + Mathematical Expression Recognition ICDAR 2024 + + +
+ Significant progress has been made in the field of handwritten mathematical +expression recognition, while existing encoder-decoder methods are usually +difficult to model global information in $LaTeX$. Therefore, this paper +introduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine +the global expression information and enhance handwritten mathematical +expression recognition. Specifically, we propose the Implicit Character +Construction Module (ICCM) to predict implicit character sequences and use a +Fusion Module to merge the outputs of the ICCM and the decoder, thereby +producing corrected predictions. By modeling and utilizing implicit character +information, ICAL achieves a more accurate and context-aware interpretation of +handwritten mathematical expressions. Experimental results demonstrate that +ICAL notably surpasses the state-of-the-art(SOTA) models, improving the +expression recognition rate (ExpRate) by 2.25\%/1.81\%/1.39\% on the CROHME +2014/2016/2019 datasets respectively, and achieves a remarkable 69.06\% on the +challenging HME100k test set. We make our code available on the GitHub: +https://github.com/qingzhenduyu/ICAL + +
+
+ comment: ICDAR 2024 Oral Paper +
+
+
+
+
+ + ♻ ☆ DiffMap: Enhancing Map Segmentation with Map Prior Using Diffusion Model + + +
+ Constructing high-definition (HD) maps is a crucial requirement for enabling +autonomous driving. In recent years, several map segmentation algorithms have +been developed to address this need, leveraging advancements in Bird's-Eye View +(BEV) perception. However, existing models still encounter challenges in +producing realistic and consistent semantic map layouts. One prominent issue is +the limited utilization of structured priors inherent in map segmentation +masks. In light of this, we propose DiffMap, a novel approach specifically +designed to model the structured priors of map segmentation masks using latent +diffusion model. By incorporating this technique, the performance of existing +semantic segmentation methods can be significantly enhanced and certain +structural errors present in the segmentation outputs can be effectively +rectified. Notably, the proposed module can be seamlessly integrated into any +map segmentation model, thereby augmenting its capability to accurately +delineate semantic information. Furthermore, through extensive visualization +analysis, our model demonstrates superior proficiency in generating results +that more accurately reflect real-world map layouts, further validating its +efficacy in improving the quality of the generated maps. + +
+
+
+
+
+ + ♻ ☆ Advancing Medical Image Segmentation: Morphology-Driven Learning with + Diffusion Transformer BMVC 2024 + + +
+ Understanding the morphological structure of medical images and precisely
+segmenting the region of interest or abnormality is an important task that can
+assist in diagnosis. However, the unique properties of medical imaging make
+clear segmentation difficult, and the high cost and time-consuming task of
+labeling leads to a coarse-grained representation of ground truth. Facing
+these problems, we propose a novel Diffusion Transformer Segmentation (DTS)
+model for robust segmentation in the presence of noise. We propose an
+alternative to the dominant Denoising U-Net encoder through experiments
+applying a transformer architecture, which captures global dependency through
+self-attention. Additionally, we propose k-neighbor label smoothing, reverse
+boundary attention, and self-supervised learning with morphology-driven
+learning to improve the ability to identify complex structures. Our model,
+which analyzes the morphological representation of images, shows better results
+than the previous models in various medical imaging modalities, including CT,
+MRI, and lesion images.
 + 
&#x2F;p>
+
+ comment: Accepted in BMVC 2024 +
+
+
+
+
+ + ♻ ☆ BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View + Representation ICRA 2023 + + +
+ Multi-sensor fusion is essential for an accurate and reliable autonomous +driving system. Recent approaches are based on point-level fusion: augmenting +the LiDAR point cloud with camera features. However, the camera-to-LiDAR +projection throws away the semantic density of camera features, hindering the +effectiveness of such methods, especially for semantic-oriented tasks (such as +3D scene segmentation). In this paper, we break this deeply-rooted convention +with BEVFusion, an efficient and generic multi-task multi-sensor fusion +framework. It unifies multi-modal features in the shared bird's-eye view (BEV) +representation space, which nicely preserves both geometric and semantic +information. To achieve this, we diagnose and lift key efficiency bottlenecks +in the view transformation with optimized BEV pooling, reducing latency by more +than 40x. BEVFusion is fundamentally task-agnostic and seamlessly supports +different 3D perception tasks with almost no architectural changes. It +establishes the new state of the art on nuScenes, achieving 1.3% higher mAP and +NDS on 3D object detection and 13.6% higher mIoU on BEV map segmentation, with +1.9x lower computation cost. Code to reproduce our results is available at +https://github.com/mit-han-lab/bevfusion. + +
+
+ comment: ICRA 2023. The first two authors contributed equally to this work. + Project page: https://bevfusion.mit.edu +
+
+
+
+
+ + ♻ ☆ PanopticPartFormer++: A Unified and Decoupled View for Panoptic Part + Segmentation ECCV 2022 + + +
+ Panoptic Part Segmentation (PPS) unifies panoptic and part segmentation into
+one task. Previous works utilize separate approaches to handle things, stuff,
+and part predictions without shared computation and task association. We aim to
+unify these tasks at the architectural level, designing the first end-to-end
+unified framework, Panoptic-PartFormer. Moreover, we find that the previous
+metric, PartPQ, is biased toward PQ. To handle both issues, we first design a
+meta-architecture that decouples part features and things/stuff features,
+respectively. We model things, stuff, and parts as object queries and directly
+learn to optimize all three forms of prediction as a unified mask prediction
+and classification problem. We term our model Panoptic-PartFormer. Second, we
+propose a new metric, Part-Whole Quality (PWQ), to better measure this task
+from pixel-region and part-whole perspectives. It also decouples the errors for
+part segmentation and panoptic segmentation. Third, inspired by Mask2Former,
+based on our meta-architecture, we propose Panoptic-PartFormer++ and design a
+new part-whole cross-attention scheme to boost part segmentation qualities
+further. We design a new part-whole interaction method using masked cross
+attention. Finally, extensive ablation studies and analysis demonstrate the
+effectiveness of both Panoptic-PartFormer and Panoptic-PartFormer++. Compared
+with the previous Panoptic-PartFormer, our Panoptic-PartFormer++ achieves 2%
+PartPQ and 3% PWQ improvements on the Cityscapes PPS dataset and 5% PartPQ on
+the Pascal Context PPS dataset. On both datasets, Panoptic-PartFormer++
+achieves new state-of-the-art results. Our models can serve as a strong
+baseline and aid future research in PPS. The source code and trained models
+will be available at~\url{https://github.com/lxtGH/Panoptic-PartFormer}.
 + 
&#x2F;p>
+
+ comment: T-PAMI-2024, Extension of PanopticPartFormer (ECCV 2022) +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ A Counterfactual Explanation Framework for Retrieval Models + + +
+ Explainability has become a crucial concern in today's world, aiming to
+enhance transparency in machine learning and deep learning models. Information
+retrieval is no exception to this trend. In existing literature on
+explainability of information retrieval, the emphasis has predominantly been on
+illustrating the concept of relevance concerning a retrieval model. The
+questions addressed include why a document is relevant to a query, why one
+document exhibits higher relevance than another, or why a specific set of
+documents is deemed relevant for a query.
+ However, limited attention has been given to understanding why a particular
+document is considered non-relevant to a query with respect to a retrieval
+model. In an effort to address this gap, our work focuses on the question of
+which terms need to be added to a document to improve its ranking. This in turn
+answers the question of which words played a role in the document not being
+favored by a retrieval model for a particular query. We use an optimization
+framework to solve the above-mentioned research problem. To the best of our
+knowledge, this marks the first attempt to tackle this specific counterfactual
+problem. Our experiments show the effectiveness of our proposed approach in
+predicting counterfactuals for both statistical (e.g. BM25) and
+deep-learning-based models (e.g. DRMM, DSSM, ColBERT).
 + 
&#x2F;p>
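To make the research question concrete, here is a brute-force illustration (not the paper's optimization framework) of how adding missing query terms to a non-relevant document changes its BM25 rank, using the rank_bm25 package and a made-up toy corpus.

```python
from rank_bm25 import BM25Okapi

corpus = [
    "neural ranking models for information retrieval",
    "a survey of evaluation metrics for retrieval systems",
    "deep learning architectures for text",  # the non-relevant target document
]
query_tokens = "neural information retrieval".split()

def rank_of(doc_idx, docs):
    # Rank (0 = best) of a document under BM25 for the fixed query.
    bm25 = BM25Okapi([d.split() for d in docs])
    scores = bm25.get_scores(query_tokens)
    order = sorted(range(len(docs)), key=lambda i: -scores[i])
    return order.index(doc_idx)

target = 2
# Counterfactual: append the query terms missing from the target document.
missing = [t for t in query_tokens if t not in corpus[target].split()]
counterfactual = corpus[target] + " " + " ".join(missing)
augmented = corpus[:target] + [counterfactual] + corpus[target + 1:]

print("rank before:", rank_of(target, corpus))
print("rank after adding", missing, ":", rank_of(target, augmented))
```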
+
+
+
+
+ + ☆ Dissecting Temporal Understanding in Text-to-Audio Retrieval + + +
+ Recent advancements in machine learning have fueled research on multimodal +tasks, such as for instance text-to-video and text-to-audio retrieval. These +tasks require models to understand the semantic content of video and audio +data, including objects, and characters. The models also need to learn spatial +arrangements and temporal relationships. In this work, we analyse the temporal +ordering of sounds, which is an understudied problem in the context of +text-to-audio retrieval. In particular, we dissect the temporal understanding +capabilities of a state-of-the-art model for text-to-audio retrieval on the +AudioCaps and Clotho datasets. Additionally, we introduce a synthetic +text-audio dataset that provides a controlled setting for evaluating temporal +capabilities of recent models. Lastly, we present a loss function that +encourages text-audio models to focus on the temporal ordering of events. Code +and data are available at +https://www.robots.ox.ac.uk/~vgg/research/audio-retrieval/dtu/. + +
+
+ comment: 9 pages, 5 figures, ACM Multimedia 2024, + https://www.robots.ox.ac.uk/~vgg/research/audio-retrieval/dtu/ +
+
+
+
+
+ + ☆ Building FKG.in: a Knowledge Graph for Indian Food + + +
+ This paper presents an ontology design along with knowledge engineering, and +multilingual semantic reasoning techniques to build an automated system for +assimilating culinary information for Indian food in the form of a knowledge +graph. The main focus is on designing intelligent methods to derive ontology +designs and capture all-encompassing knowledge about food, recipes, +ingredients, cooking characteristics, and most importantly, nutrition, at +scale. We present our ongoing work in this workshop paper, describe in some +detail the relevant challenges in curating knowledge of Indian food, and +propose our high-level ontology design. We also present a novel workflow that +uses AI, LLM, and language technology to curate information from recipe blog +sites in the public domain to build knowledge graphs for Indian food. The +methods for knowledge curation proposed in this paper are generic and can be +replicated for any domain. The design is application-agnostic and can be used +for AI-driven smart analysis, building recommendation systems for Personalized +Digital Health, and complementing the knowledge graph for Indian food with +contextual information such as user information, food biochemistry, geographic +information, agricultural information, etc. + +
+
+ comment: 14 pages, 3 figures, 25 references, Formal Ontology in Information + Systems Conference 2024 - Integrated Food Ontology Workshop +
+
+
+
+
+ + ☆ Hound: Hunting Supervision Signals for Few and Zero Shot Node + Classification on Text-attributed Graph + + +
+ Text-attributed graph (TAG) is an important type of graph structured data +with text descriptions for each node. Few- and zero-shot node classification on +TAGs have many applications in fields such as academia and social networks. +However, the two tasks are challenging due to the lack of supervision signals, +and existing methods only use the contrastive loss to align graph-based node +embedding and language-based text embedding. In this paper, we propose Hound to +improve accuracy by introducing more supervision signals, and the core idea is +to go beyond the node-text pairs that come with data. Specifically, we design +three augmentation techniques, i.e., node perturbation, text matching, and +semantics negation to provide more reference nodes for each text and vice +versa. Node perturbation adds/drops edges to produce diversified node +embeddings that can be matched with a text. Text matching retrieves texts with +similar embeddings to match with a node. Semantics negation uses a negative +prompt to construct a negative text with the opposite semantics, which is +contrasted with the original node and text. We evaluate Hound on 5 datasets and +compare with 13 state-of-the-art baselines. The results show that Hound +consistently outperforms all baselines, and its accuracy improvements over the +best-performing baseline are usually over 5%. + +
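For context, the node-text alignment that existing methods (and Hound's starting point) rely on is typically a symmetric contrastive loss; a minimal sketch is below, with random tensors standing in for GNN node embeddings and language-model text embeddings. The paper's three augmentation techniques (node perturbation, text matching, semantics negation) are not shown.

```python
import torch
import torch.nn.functional as F

def info_nce(node_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE: the i-th node and i-th text form a positive pair,
    all other pairings in the batch act as negatives."""
    node_emb = F.normalize(node_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = node_emb @ text_emb.t() / temperature
    labels = torch.arange(node_emb.size(0))
    return 0.5 * (F.cross_entropy(logits, labels) +
                  F.cross_entropy(logits.t(), labels))

nodes = torch.randn(8, 128)   # graph-based node embeddings (e.g., from a GNN)
texts = torch.randn(8, 128)   # language-model text embeddings
loss = info_nce(nodes, texts)
```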
+
+
+
+
+ + ☆ Fair Reciprocal Recommendation in Matching Markets RecSys2024 + + +
+ Recommender systems play an increasingly crucial role in shaping people's +opportunities, particularly in online dating platforms. It is essential from +the user's perspective to increase the probability of matching with a suitable +partner while ensuring an appropriate level of fairness in the matching +opportunities. We investigate reciprocal recommendation in two-sided matching +markets between agents divided into two sides. In our model, a match is +considered successful only when both individuals express interest in each +other. Additionally, we assume that agents prefer to appear prominently in the +recommendation lists presented to those on the other side. We define each +agent's opportunity to be recommended and introduce its fairness criterion, +envy-freeness, from the perspective of fair division theory. The +recommendations that approximately maximize the expected number of matches, +empirically obtained by heuristic algorithms, are likely to result in +significant unfairness of opportunity. Therefore, there can be a trade-off +between maximizing the expected matches and ensuring fairness of opportunity. +To address this challenge, we propose a method to find a policy that is close +to being envy-free by leveraging the Nash social welfare function. Experiments +on synthetic and real-world datasets demonstrate the effectiveness of our +approach in achieving both relatively high expected matches and fairness for +opportunities of both sides in reciprocal recommender systems. + +
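A toy sketch of the trade-off the abstract describes: under a stochastic recommendation policy, compare the total expected matches with the Nash social welfare (sum of log expected matches per agent). The match-probability matrix and the two example policies are made up, and the paper's actual policy-search procedure is not reproduced.

```python
import numpy as np

rng = np.random.default_rng(0)
# p[i, j]: probability that recommending j to i (and i to j) yields mutual
# interest; a toy stand-in for the two-sided match probabilities.
p = rng.uniform(0.0, 0.3, size=(4, 4))

def expected_matches(policy):
    """policy[i, j]: probability of recommending j to i (rows sum to 1)."""
    per_pair = policy * p
    per_left = per_pair.sum(axis=1)    # expected matches for left-side agents
    per_right = per_pair.sum(axis=0)   # expected matches for right-side agents
    return per_left, per_right

def nash_social_welfare(policy, eps=1e-9):
    left, right = expected_matches(policy)
    return np.sum(np.log(left + eps)) + np.sum(np.log(right + eps))

uniform = np.full((4, 4), 0.25)         # spread exposure evenly
greedy = np.eye(4)[p.argmax(axis=1)]    # always recommend the single best match
for name, pol in [("uniform", uniform), ("greedy", greedy)]:
    left, right = expected_matches(pol)
    print(name, "total:", round(float(left.sum()), 3),
          "NSW:", round(float(nash_social_welfare(pol)), 3))
```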
+
+ comment: Accepted at RecSys2024 +
+
+
+
+
+ + ☆ A Learnable Agent Collaboration Network Framework for Personalized + Multimodal AI Search Engine + + +
+ Large language models (LLMs) and retrieval-augmented generation (RAG)
+techniques have revolutionized traditional information access, enabling AI
+agents to search and summarize information on behalf of users during dynamic
+dialogues. Despite their potential, current AI search engines exhibit
+considerable room for improvement in several critical areas. These areas
+include the support for multimodal information, the delivery of personalized
+responses, the capability to logically answer complex questions, and the
+facilitation of more flexible interactions. This paper proposes a novel AI
+Search Engine framework called the Agent Collaboration Network (ACN). The ACN
+framework consists of multiple specialized agents working collaboratively, each
+with distinct roles such as Account Manager, Solution Strategist, Information
+Manager, and Content Creator. This framework integrates mechanisms for picture
+content understanding, user profile tracking, and online evolution, enhancing
+the AI search engine's response quality, personalization, and interactivity. A
+highlight of the ACN is the introduction of a Reflective Forward Optimization
+method (RFO), which supports the online synergistic adjustment among agents.
+This feature endows the ACN with online learning capabilities, ensuring that
+the system has strong interactive flexibility and can promptly adapt to user
+feedback. This learning method may also serve as an optimization approach for
+agent-based systems, potentially influencing other domains of agent
+applications.
 + 
&#x2F;p>
+
+ comment: ACMMM 2024 MMGR WORKSHOP +
+
+
+
+
+ + ♻ ☆ Dynamic Boundary Time Warping for Sub-sequence Matching with Few + Examples + + +
+ The paper presents a novel method of finding a fragment in a long temporal +sequence similar to the set of shorter sequences. We are the first to propose +an algorithm for such a search that does not rely on computing the average +sequence from query examples. Instead, we use query examples as is, utilizing +all of them simultaneously. The introduced method based on the Dynamic Time +Warping (DTW) technique is suited explicitly for few-shot query-by-example +retrieval tasks. We evaluate it on two different few-shot problems from the +field of Natural Language Processing. The results show it either outperforms +baselines and previous approaches or achieves comparable results when a low +number of examples is available. + +
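The abstract does not spell out the algorithm, but the standard building block it extends, subsequence DTW (where a short query may match any fragment of a long sequence), looks roughly like the sketch below. This handles a single query with an absolute-difference cost; the paper's handling of multiple query examples and boundaries is not shown.

```python
import numpy as np

def subsequence_dtw(query, sequence):
    """Align `query` to the best-matching fragment of `sequence`
    (free start and end on the long-sequence axis).
    Returns (cost, start, end) indices of the matched fragment."""
    n, m = len(query), len(sequence)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, :] = 0.0                        # the match may start anywhere
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = abs(query[i - 1] - sequence[j - 1])
            D[i, j] = d + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    end = int(np.argmin(D[n, 1:])) + 1   # and end anywhere
    i, j = n, end
    while i > 1:                         # backtrack to find the fragment start
        step = int(np.argmin([D[i - 1, j - 1], D[i - 1, j], D[i, j - 1]]))
        if step == 0:
            i, j = i - 1, j - 1
        elif step == 1:
            i -= 1
        else:
            j -= 1
    return D[n, end], j - 1, end - 1

query = np.array([1.0, 2.0, 3.0])
sequence = np.array([0.0, 0.0, 1.1, 2.2, 2.9, 0.0, 0.0])
print(subsequence_dtw(query, sequence))  # matched fragment spans indices 2..4
```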
+
+
+
+
+ + ♻ ☆ Forecasting Live Chat Intent from Browsing History CIKM 2024 + + +
+ Customers reach out to online live chat agents with various intents, such as +asking about product details or requesting a return. In this paper, we propose +the problem of predicting user intent from browsing history and address it +through a two-stage approach. The first stage classifies a user's browsing +history into high-level intent categories. Here, we represent each browsing +history as a text sequence of page attributes and use the ground-truth class +labels to fine-tune pretrained Transformers. The second stage provides a large +language model (LLM) with the browsing history and predicted intent class to +generate fine-grained intents. For automatic evaluation, we use a separate LLM +to judge the similarity between generated and ground-truth intents, which +closely aligns with human judgments. Our two-stage approach yields significant +performance gains compared to generating intents without the classification +stage. + +
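A minimal sketch of the two-stage idea with stand-ins: a TF-IDF classifier replaces the fine-tuned Transformer of stage one, and stage two only assembles the prompt that would be sent to an LLM. All page attributes, intent classes, and the prompt wording are hypothetical.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Stage 1: classify a browsing history (serialized page attributes) into a
# high-level intent class. Toy data; the paper fine-tunes a Transformer instead.
histories = [
    "category:shoes page:product title:running shoe page:size_guide",
    "page:order_history page:return_policy page:contact",
]
labels = ["product_question", "return_request"]

clf = make_pipeline(TfidfVectorizer(), LogisticRegression())
clf.fit(histories, labels)

new_history = "page:order_history page:return_policy"
intent_class = clf.predict([new_history])[0]

# Stage 2: hand the history plus the predicted class to an LLM for a
# fine-grained intent description.
prompt = (
    "Browsing history: " + new_history + "\n"
    f"High-level intent: {intent_class}\n"
    "Describe the user's likely fine-grained intent in one sentence."
)
print(prompt)  # would be sent to an LLM of choice
```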
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Sparsity-regularized coded ptychography for robust and efficient + lensless microscopy on a chip + + +
+ Coded ptychography has emerged as a powerful technique for high-throughput, +high-resolution lensless imaging. However, the trade-off between acquisition +speed and image quality remains a significant challenge. To address this, we +introduce a novel sparsity-regularized approach to coded ptychography that +dramatically reduces the number of required measurements while maintaining high +reconstruction quality. The reported approach, termed the ptychographic +proximal total-variation (PPTV) solver, formulates the reconstruction task as a +total variation regularized optimization problem. Unlike previous +implementations that rely on specialized hardware or illumination schemes, PPTV +integrates seamlessly into existing coded ptychography setups. Through +comprehensive numerical simulations, we demonstrate that PPTV-driven coded +ptychography can produce accurate reconstructions with as few as eight +intensity measurements, a significant reduction compared to conventional +methods. Convergence analysis confirms the robustness and stability of the PPTV +algorithm. Experimental results from our optical prototype, featuring a +disorder-engineered surface for wavefront modulation, validate PPTV's ability +to achieve high-throughput, high-resolution imaging with a substantially +reduced measurement burden. By enabling high-quality reconstructions from fewer +measurements, PPTV paves the way for more compact, efficient, and +cost-effective lensless microscopy systems on a chip, with potential +applications in digital pathology, endoscopy, point-of-care diagnostics, and +high-content screening. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Vague Preference Policy Learning for Conversational Recommendation + + +
+ Conversational recommendation systems (CRS) commonly assume users have clear +preferences, leading to potential over-filtering of relevant alternatives. +However, users often exhibit vague, non-binary preferences. We introduce the +Vague Preference Multi-round Conversational Recommendation (VPMCR) scenario, +employing a soft estimation mechanism to accommodate users' vague and dynamic +preferences while mitigating over-filtering. In VPMCR, we propose Vague +Preference Policy Learning (VPPL), consisting of Ambiguity-aware Soft +Estimation (ASE) and Dynamism-aware Policy Learning (DPL). ASE captures +preference vagueness by estimating scores for clicked and non-clicked options, +using a choice-based approach and time-aware preference decay. DPL leverages +ASE's preference distribution to guide the conversation and adapt to preference +changes for recommendations or attribute queries. Extensive experiments +demonstrate VPPL's effectiveness within VPMCR, outperforming existing methods +and setting a new benchmark. Our work advances CRS by accommodating users' +inherent ambiguity and relative decision-making processes, improving real-world +applicability. + +
+
+
+
+
+
+
+
+ + Machine Learning 28 + +
+
+
+ + ♻ ☆ Horseshoe-type Priors for Independent Component Estimation + + +
+ Independent Component Estimation (ICE) has many applications in modern-day
+machine learning as a feature engineering extraction method. Horseshoe-type
+priors are used to provide scalable algorithms that enable both point
+estimates via expectation-maximization (EM) and full posterior sampling via
+Markov Chain Monte Carlo (MCMC) algorithms. Our methodology also applies to
+flow-based methods for nonlinear feature extraction and deep learning. We also
+discuss how to implement conditional posteriors and envelope-based methods for
+optimization. Through this hierarchy representation, we unify a number of
+hitherto disparate estimation procedures. We illustrate our methodology and
+algorithms on a numerical example. Finally, we conclude with directions for
+future research.
 + 
&#x2F;p>
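For readers unfamiliar with the prior, a minimal sketch of drawing coefficients from a horseshoe prior (a global half-Cauchy scale and local half-Cauchy scales) is below; the EM and MCMC machinery of the paper is not shown, and the tau_scale parameter is an assumption.

```python
import numpy as np

def sample_horseshoe(n_coeffs, tau_scale=1.0, rng=None):
    """Draw beta_i ~ N(0, (tau * lambda_i)^2) with tau ~ C+(0, tau_scale)
    and lambda_i ~ C+(0, 1), i.e. half-Cauchy global and local scales."""
    rng = rng or np.random.default_rng()
    tau = np.abs(tau_scale * rng.standard_cauchy())
    lam = np.abs(rng.standard_cauchy(n_coeffs))
    beta = rng.normal(0.0, tau * lam)
    return beta, tau, lam

beta, tau, lam = sample_horseshoe(10, rng=np.random.default_rng(0))
print(beta)  # heavy-tailed: most entries near zero, a few large
```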
+
+ comment: 23 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Multi-Microphone Speech Emotion Recognition using the Hierarchical + Token-semantic Audio Transformer Architecture + + +
+ The performance of most emotion recognition systems degrades in real-life +situations ('in the wild' scenarios) where the audio is contaminated by +reverberation. Our study explores new methods to alleviate the performance +degradation of SER algorithms and develop a more robust system for adverse +conditions. We propose processing multi-microphone signals to address these +challenges and improve emotion classification accuracy. We adopt a +state-of-the-art transformer model, the HTS-AT, to handle multi-channel audio +inputs. We evaluate two strategies: averaging mel-spectrograms across channels +and summing patch-embedded representations. Our multi-microphone model achieves +superior performance compared to single-channel baselines when tested on +real-world reverberant environments. + +
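The first strategy (averaging mel-spectrograms across channels) can be sketched as follows with librosa; the file name, sampling rate, and mel resolution are placeholders, and the HTS-AT model itself is not included.

```python
import librosa
import numpy as np

# Load a multi-channel recording without downmixing (shape: channels x samples).
audio, sr = librosa.load("multichannel_recording.wav", sr=16000, mono=False)
if audio.ndim == 1:
    audio = audio[np.newaxis, :]  # fall back to a single channel

# Compute a mel-spectrogram per microphone, then average across channels
# before feeding the downstream transformer.
mels = np.stack([
    librosa.feature.melspectrogram(y=ch, sr=sr, n_mels=64) for ch in audio
])
avg_mel = mels.mean(axis=0)
log_mel = librosa.power_to_db(avg_mel)
print(log_mel.shape)  # (n_mels, frames)
```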
+
+
+
+
+ + ♻ ☆ Discovering Preference Optimization Algorithms with and for Large + Language Models + + +
+ Offline preference optimization is a key method for enhancing and controlling +the quality of Large Language Model (LLM) outputs. Typically, preference +optimization is approached as an offline supervised learning task using +manually-crafted convex loss functions. While these methods are based on +theoretical insights, they are inherently constrained by human creativity, so +the large search space of possible loss functions remains under explored. We +address this by performing LLM-driven objective discovery to automatically +discover new state-of-the-art preference optimization algorithms without +(expert) human intervention. Specifically, we iteratively prompt an LLM to +propose and implement new preference optimization loss functions based on +previously-evaluated performance metrics. This process leads to the discovery +of previously-unknown and performant preference optimization algorithms. The +best performing of these we call Discovered Preference Optimization (DiscoPOP), +a novel algorithm that adaptively blends logistic and exponential losses. +Experiments demonstrate the state-of-the-art performance of DiscoPOP and its +successful transfer to held-out tasks. + +
+
+
+
+
+ + ♻ ☆ REBEL: Reinforcement Learning via Regressing Relative Rewards + + +
+ While originally developed for continuous control problems, Proximal Policy +Optimization (PPO) has emerged as the work-horse of a variety of reinforcement +learning (RL) applications, including the fine-tuning of generative models. +Unfortunately, PPO requires multiple heuristics to enable stable convergence +(e.g. value networks, clipping), and is notorious for its sensitivity to the +precise implementation of these components. In response, we take a step back +and ask what a minimalist RL algorithm for the era of generative models would +look like. We propose REBEL, an algorithm that cleanly reduces the problem of +policy optimization to regressing the relative reward between two completions +to a prompt in terms of the policy, enabling strikingly lightweight +implementation. In theory, we prove that fundamental RL algorithms like Natural +Policy Gradient can be seen as variants of REBEL, which allows us to match the +strongest known theoretical guarantees in terms of convergence and sample +complexity in the RL literature. REBEL can also cleanly incorporate offline +data and be extended to handle the intransitive preferences we frequently see +in practice. Empirically, we find that REBEL provides a unified approach to +language modeling and image generation with stronger or similar performance as +PPO and DPO, all while being simpler to implement and more computationally +efficient than PPO. When fine-tuning Llama-3-8B-Instruct, REBEL achieves strong +performance in AlpacaEval 2.0, MT-Bench, and Open LLM Leaderboard. + +
+
+ comment: New experimental results on general chat +
+
+
+
+
+ + ♻ ☆ MedFuzz: Exploring the Robustness of Large Language Models in Medical + Question Answering + + +
+ Large language models (LLMs) have achieved impressive performance on medical
+question-answering benchmarks. However, high benchmark accuracy does not imply
+that the performance generalizes to real-world clinical settings. Medical
+question-answering benchmarks rely on assumptions that are convenient for
+quantifying LLM performance but that may not hold in the open world of the
+clinic. Yet LLMs learn broad knowledge that can help the LLM generalize to
+practical conditions regardless of unrealistic assumptions in celebrated
+benchmarks. We seek to quantify how well LLM medical question-answering
+benchmark performance generalizes when benchmark assumptions are violated.
+Specifically, we present an adversarial method that we call MedFuzz (for
+medical fuzzing). MedFuzz attempts to modify benchmark questions in ways aimed
+at confounding the LLM. We demonstrate the approach by targeting strong
+assumptions about patient characteristics presented in the MedQA benchmark.
+Successful "attacks" modify a benchmark item in ways that would be unlikely to
+fool a medical expert but nonetheless "trick" the LLM into changing from a
+correct to an incorrect answer. Further, we present a permutation test technique
+that can ensure a successful attack is statistically significant. We show how to
+use performance on a "MedFuzzed" benchmark, as well as individual successful
+attacks; together, these show promise at providing insights into the ability of
+an LLM to operate robustly in more realistic settings.
+
+
+ comment: 9 pages, 3 figures, 2 algorithms, appendix +
+
+
+
+
+ + ♻ ☆ CardioLab: Laboratory Values Estimation from Electrocardiogram Features + -- An Exploratory Study + + +
+ Introduction: Laboratory values represent a cornerstone of medical
+diagnostics, but they suffer from slow turnaround times and high costs, and they
+only provide information about a single point in time. The continuous estimation
+of laboratory values from non-invasive data such as the electrocardiogram (ECG)
+would therefore mark a significant frontier in healthcare monitoring. Despite its
+transformative potential, this domain remains relatively underexplored within
+the medical community.
+ Methods: In this preliminary study, we used a publicly available dataset
+(MIMIC-IV-ECG) to investigate the feasibility of inferring laboratory values
+from ECG features and patient demographics using tree-based models (XGBoost).
+We define the task as a binary prediction problem: predicting whether the lab
+value falls into the low or high abnormal range. Model performance can then be
+assessed using AUROC.
+ Results: Our findings demonstrate promising results in the estimation of
+laboratory values related to different organ systems based on a small yet
+comprehensive set of features. While further research and validation are
+warranted to fully assess the clinical utility and generalizability of
+ECG-based estimation in healthcare monitoring, our findings lay the groundwork
+for future investigations into approaches to laboratory value estimation using
+ECG data. Such advancements hold promise for revolutionizing predictive
+healthcare applications, offering faster, non-invasive, and more affordable
+means of patient monitoring.
+
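A minimal sketch of the modeling setup described in the Methods paragraph (a tree-based binary classifier scored with AUROC); the arrays below are random stand-ins for the MIMIC-IV-ECG-derived ECG features, demographics, and abnormality labels.

```python
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 12))     # stand-in for ECG features + demographics
y = rng.integers(0, 2, size=1000)   # 1 = lab value abnormal (low/high), 0 = normal

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
model = XGBClassifier(n_estimators=200, max_depth=4, eval_metric="logloss")
model.fit(X_train, y_train)
print("AUROC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
```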
+
+ comment: 4 pages, (updated dataset features set description version) code + under https://github.com/AI4HealthUOL/CardioLab +
+
+
+
+
+ + ♻ ☆ Equivariant Scalar Fields for Molecular Docking with Fast Fourier + Transforms ICLR 2024 + + +
+ Molecular docking is critical to structure-based virtual screening, yet the +throughput of such workflows is limited by the expensive optimization of +scoring functions involved in most docking algorithms. We explore how machine +learning can accelerate this process by learning a scoring function with a +functional form that allows for more rapid optimization. Specifically, we +define the scoring function to be the cross-correlation of multi-channel ligand +and protein scalar fields parameterized by equivariant graph neural networks, +enabling rapid optimization over rigid-body degrees of freedom with fast +Fourier transforms. The runtime of our approach can be amortized at several +levels of abstraction, and is particularly favorable for virtual screening +settings with a common binding pocket. We benchmark our scoring functions on +two simplified docking-related tasks: decoy pose scoring and rigid conformer +docking. Our method attains similar but faster performance on crystal +structures compared to the widely-used Vina and Gnina scoring functions, and is +more robust on computationally predicted structures. Code is available at +https://github.com/bjing2016/scalar-fields. + +
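The speedup the abstract relies on is the standard Fourier correlation trick: once the ligand and protein are represented as scalar fields on a grid, the score of every rigid translation comes from a single cross-correlation computed in Fourier space. The toy below shows only that trick on random grids; the equivariant field parameterization and rotation handling are not reproduced.

```python
import numpy as np

def fft_cross_correlation(protein_field: np.ndarray,
                          ligand_field: np.ndarray) -> np.ndarray:
    """Circular cross-correlation via the correlation theorem:
    corr = IFFT( FFT(protein) * conj(FFT(ligand)) )."""
    spectrum = np.fft.fftn(protein_field) * np.conj(np.fft.fftn(ligand_field))
    return np.real(np.fft.ifftn(spectrum))

protein = np.random.rand(32, 32, 32)
ligand = np.random.rand(32, 32, 32)
scores = fft_cross_correlation(protein, ligand)   # one score per translation
best_shift = np.unravel_index(np.argmax(scores), scores.shape)
```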
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ ApisTox: a new benchmark dataset for the classification of small + molecules toxicity on honey bees + + +
+ The global decline in bee populations poses significant risks to agriculture, +biodiversity, and environmental stability. To bridge the gap in existing data, +we introduce ApisTox, a comprehensive dataset focusing on the toxicity of +pesticides to honey bees (Apis mellifera). This dataset combines and leverages +data from existing sources such as ECOTOX and PPDB, providing an extensive, +consistent, and curated collection that surpasses the previous datasets. +ApisTox incorporates a wide array of data, including toxicity levels for +chemicals, details such as time of their publication in literature, and +identifiers linking them to external chemical databases. This dataset may serve +as an important tool for environmental and agricultural research, but also can +support the development of policies and practices aimed at minimizing harm to +bee populations. Finally, ApisTox offers a unique resource for benchmarking +molecular property prediction methods on agrochemical compounds, facilitating +advancements in both environmental science and cheminformatics. This makes it a +valuable tool for both academic research and practical applications in bee +conservation. + +
+
+
+
+
+ + ♻ ☆ T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models + + +
+ The recent development of Sora leads to a new era in text-to-video (T2V) +generation. Along with this comes the rising concern about its security risks. +The generated videos may contain illegal or unethical content, and there is a +lack of comprehensive quantitative understanding of their safety, posing a +challenge to their reliability and practical deployment. Previous evaluations +primarily focus on the quality of video generation. While some evaluations of +text-to-image models have considered safety, they cover fewer aspects and do +not address the unique temporal risk inherent in video generation. To bridge +this research gap, we introduce T2VSafetyBench, a new benchmark designed for +conducting safety-critical assessments of text-to-video models. We define 12 +critical aspects of video generation safety and construct a malicious prompt +dataset including real-world prompts, LLM-generated prompts and jailbreak +attack-based prompts. Based on our evaluation results, we draw several +important findings, including: 1) no single model excels in all aspects, with +different models showing various strengths; 2) the correlation between GPT-4 +assessments and manual reviews is generally high; 3) there is a trade-off +between the usability and safety of text-to-video generative models. This +indicates that as the field of video generation rapidly advances, safety risks +are set to surge, highlighting the urgency of prioritizing video safety. We +hope that T2VSafetyBench can provide insights for better understanding the +safety of video generation in the era of generative AI. + +
+
+
+
+
+ + ♻ ☆ Data-Aware Gradient Compression for DML in Communication-Constrained + Mobile Computing + + +
+ Distributed machine learning (DML) in mobile environments faces significant
+communication bottlenecks. Gradient compression has proven to be an effective
+solution to this issue, offering substantial benefits in environments with
+limited bandwidth and metered data. Yet, it encounters severe performance drops
+in non-IID environments due to a one-size-fits-all compression approach, which
+does not account for the varying data volumes across workers. Assigning varying
+compression ratios to workers with distinct data distributions and volumes is
+therefore a promising solution. This work derives the convergence rate of
+distributed SGD with non-uniform compression, which reveals the intricate
+relationship between model convergence and the compression ratios applied to
+individual workers. Accordingly, we frame the relative compression ratio
+assignment as an $n$-variable chi-squared nonlinear optimization problem,
+constrained by a limited communication budget. We propose DAGC-R, which assigns
+conservative compression to workers handling larger data volumes. Recognizing
+the computational limitations of mobile devices, we propose DAGC-A, which is
+computationally less demanding and enhances the robustness of compression in
+non-IID scenarios. Our experiments confirm that DAGC-A and DAGC-R can speed up
+training by up to $16.65\%$ and $25.43\%$, respectively, compared to uniform
+compression when dealing with highly imbalanced data volume distributions and
+restricted communication.
+
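For concreteness, the sketch below shows per-worker top-k gradient sparsification with non-uniform keep-ratios, the kind of compression whose ratios DAGC assigns; the ratio-assignment heuristic is a placeholder and does not reproduce the paper's chi-squared optimization.

```python
import torch

def topk_compress(grad: torch.Tensor, keep_ratio: float) -> torch.Tensor:
    """Keep only the largest-magnitude fraction of gradient entries."""
    flat = grad.flatten()
    k = max(1, int(flat.numel() * keep_ratio))
    _, idx = torch.topk(flat.abs(), k)
    out = torch.zeros_like(flat)
    out[idx] = flat[idx]
    return out.view_as(grad)

def assign_keep_ratios(data_volumes, avg_budget=0.1):
    """Placeholder heuristic: workers with more data keep more gradient entries
    (milder compression), subject to an average communication budget."""
    total = sum(data_volumes)
    return [min(1.0, v / total * len(data_volumes) * avg_budget)
            for v in data_volumes]
```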
+
+
+
+
+ + ♻ ☆ Creating Temporally Correlated High-Resolution Profiles of Load + Injection Using Constrained Generative Adversarial Networks + + +
+ Traditional smart meters, which measure energy usage every 15 minutes or more +and report it at least a few hours later, lack the granularity needed for +real-time decision-making. To address this practical problem, we introduce a +new method using generative adversarial networks (GAN) that enforces temporal +consistency on its high-resolution outputs via hard inequality constraints +using convex optimization. A unique feature of our GAN model is that it is +trained solely on slow timescale aggregated historical energy data obtained +from smart meters. The results demonstrate that the model can successfully +create minute-by-minute temporally correlated profiles of power usage from +15-minute interval average power consumption information. This innovative +approach, emphasizing inter-neuron constraints, offers a promising avenue for +improved high-speed state estimation in distribution systems and enhances the +applicability of data-driven solutions for monitoring and subsequently +controlling such systems. + +
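The hard consistency requirement described above (minute-level outputs must agree with the observed 15-minute averages) can be illustrated with a simple projection; the paper enforces it through convex optimization and inter-neuron constraints, whereas the stand-in below just shifts each block by its residual.

```python
import torch

def project_to_block_means(minute_profile: torch.Tensor,
                           block_means: torch.Tensor,
                           block: int = 15) -> torch.Tensor:
    """Adjust a generated minute-level profile so every block of `block` minutes
    averages exactly to the corresponding smart-meter reading.

    minute_profile: shape (..., T) with T = block * block_means.shape[-1].
    """
    shaped = minute_profile.reshape(*minute_profile.shape[:-1], -1, block)
    residual = block_means.unsqueeze(-1) - shaped.mean(dim=-1, keepdim=True)
    return (shaped + residual).reshape(minute_profile.shape)
```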
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ Generalizing Graph Transformers Across Diverse Graphs and Tasks via + Pre-Training on Industrial-Scale Data + + +
+ Graph pre-training has so far concentrated on graph-level pre-training over
+small graphs (e.g., molecular graphs) or on learning node representations on a
+fixed graph. Extending graph pre-trained models to web-scale graphs with
+billions of nodes in industrial scenarios, while avoiding negative transfer
+across graphs or tasks, remains a challenge. We aim to develop a general graph
+pre-trained model with inductive ability that can make predictions for unseen
+new nodes and even new graphs. In this work, we introduce a scalable
+transformer-based graph pre-training framework called PGT (Pre-trained Graph
+Transformer). Specifically, we design a flexible and scalable graph transformer
+as the backbone network. Meanwhile, based on the masked autoencoder
+architecture, we design two pre-training tasks: one for reconstructing node
+features and the other for reconstructing local structures. Unlike the original
+autoencoder architecture where the pre-trained decoder is discarded, we propose
+a novel strategy that utilizes the decoder for feature augmentation. We have
+deployed our framework on Tencent's online game data. Extensive experiments have
+demonstrated that our framework can perform pre-training on real-world
+web-scale graphs with over 540 million nodes and 12 billion edges and
+generalizes effectively to unseen new graphs with different downstream tasks.
+We further conduct experiments on the publicly available ogbn-papers100M
+dataset, which consists of 111 million nodes and 1.6 billion edges. Our
+framework achieves state-of-the-art performance on both industrial datasets and
+public datasets, while also enjoying scalability and efficiency.
+
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Harmonizing Generalization and Personalization in Federated Prompt + Learning + + +
+ Federated Prompt Learning (FPL) incorporates large pre-trained
+Vision-Language Models (VLMs) into federated learning through prompt tuning. The
+transferable representations and remarkable generalization capacity of VLMs make
+them highly compatible with federated learning. Addressing data heterogeneity in
+federated learning requires personalization, but excessive focus on it across
+clients could compromise the model's ability to generalize effectively. To
+preserve the impressive generalization capability of VLMs, it is crucial to
+strike a balance between personalization and generalization in FPL. To tackle
+this challenge, we propose Federated Prompt Learning with CLIP Generalization
+and low-rank Personalization (FedPGP), which employs pre-trained CLIP to provide
+knowledge guidance on the global prompt for improved generalization and
+incorporates a low-rank adaptation term to personalize the global prompt.
+Further, FedPGP integrates a prompt-wise contrastive loss to achieve knowledge
+guidance and personalized adaptation simultaneously, enabling a harmonious
+balance between personalization and generalization in FPL. We conduct extensive
+experiments on various datasets to explore base-to-novel generalization in both
+category-level and domain-level scenarios with heterogeneous data, showing the
+superiority of FedPGP in balancing generalization and personalization.
+
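A minimal sketch of the "global prompt plus low-rank adaptation term" decomposition mentioned above; the token count, embedding size, and rank are arbitrary, which components are shared versus kept local is an assumption, and the CLIP knowledge-guidance and prompt-wise contrastive loss are not shown.

```python
import torch
import torch.nn as nn

class LowRankPersonalizedPrompt(nn.Module):
    """Personalized prompt = shared global prompt + client-specific low-rank term."""
    def __init__(self, n_tokens: int = 16, dim: int = 512, rank: int = 4):
        super().__init__()
        self.global_prompt = nn.Parameter(torch.randn(n_tokens, dim) * 0.02)  # shared across clients (assumption)
        self.A = nn.Parameter(torch.randn(n_tokens, rank) * 0.02)             # client-specific (assumption)
        self.B = nn.Parameter(torch.zeros(rank, dim))                          # client-specific (assumption)

    def forward(self) -> torch.Tensor:
        # Low-rank update A @ B personalizes the shared prompt.
        return self.global_prompt + self.A @ self.B
```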
+
+
+
+
+ + ♻ ☆ A Versatile Graph Learning Approach through LLM-based Agent + + +
+ Designing versatile graph learning approaches is important, considering the
+diverse graphs and tasks existing in real-world applications. Existing methods
+have attempted to achieve this target through automated machine learning
+techniques, pre-training and fine-tuning strategies, and large language models.
+However, these methods are not versatile enough for graph learning, as they
+work on either limited types of graphs or a single task. In this paper, we
+propose to explore versatile graph learning approaches with LLM-based agents,
+and the key insight is customizing the graph learning procedures for diverse
+graphs and tasks. To achieve this, we develop several LLM-based agents,
+equipped with diverse profiles, tools, functions and human experience. They
+collaborate to configure each procedure with task- and data-specific settings
+step by step towards versatile solutions, and the proposed method is dubbed
+GL-Agent. Evaluations on diverse tasks and graphs show that the agent produces
+correct results with comparable performance, showcasing the versatility of the
+proposed method, especially in complex scenarios. The low resource cost and the
+potential to use open-source LLMs highlight the efficiency of GL-Agent.
+
+
+
+
+
+ + ♻ ☆ MLRegTest: A Benchmark for the Machine Learning of Regular Languages + + +
+ Synthetic datasets constructed from formal languages allow fine-grained +examination of the learning and generalization capabilities of machine learning +systems for sequence classification. This article presents a new benchmark for +machine learning systems on sequence classification called MLRegTest, which +contains training, development, and test sets from 1,800 regular languages. +Different kinds of formal languages represent different kinds of long-distance +dependencies, and correctly identifying long-distance dependencies in sequences +is a known challenge for ML systems to generalize successfully. MLRegTest +organizes its languages according to their logical complexity (monadic second +order, first order, propositional, or monomial expressions) and the kind of +logical literals (string, tier-string, subsequence, or combinations thereof). +The logical complexity and choice of literal provides a systematic way to +understand different kinds of long-distance dependencies in regular languages, +and therefore to understand the capacities of different ML systems to learn +such long-distance dependencies. Finally, the performance of different neural +networks (simple RNN, LSTM, GRU, transformer) on MLRegTest is examined. The +main conclusion is that performance depends significantly on the kind of test +set, the class of language, and the neural network architecture. + +
+
+ comment: Accepted for publication in the Journal of Machine Learning Research. + Dataset available at https://doi.org/10.5061/dryad.dncjsxm4h , code available + at https://github.com/heinz-jeffrey/subregular-learning +
+
+
+
+
+ + ♻ ☆ SFR-GNN: Simple and Fast Robust GNNs against Structural Attacks + + +
+ Graph Neural Networks (GNNs) have demonstrated commendable performance for
+graph-structured data. Yet, GNNs are often vulnerable to adversarial structural
+attacks as embedding generation relies on graph topology. Existing efforts are
+dedicated to purifying the maliciously modified structure or applying adaptive
+aggregation, thereby enhancing the robustness against adversarial structural
+attacks. Because the defender lacks prior knowledge of which parts of the
+structure were modified, such defenses inevitably incur heavy computational
+costs. To this end, we propose an efficient defense method, called Simple and
+Fast Robust Graph Neural Network (SFR-GNN), supported by mutual information
+theory. SFR-GNN first pre-trains a GNN model using node attributes and then
+fine-tunes it over the modified graph in the manner of contrastive learning,
+which avoids purifying the modified structure and adaptive aggregation, thus
+achieving great efficiency gains. Consequently, SFR-GNN exhibits a 24%--162%
+speedup compared to advanced robust models while demonstrating superior
+robustness for node classification tasks.
+
+
+
+
+
+ + ♻ ☆ Finite-dimensional approximations of push-forwards on locally analytic + functionals + + +
+ This paper introduces a novel theoretical framework for investigating +analytic maps from finite discrete data. Our approach is to consider the +push-forward on the space of locally analytic functionals, instead of directly +handling the analytic map itself. We establish a methodology enabling +appropriate finite-dimensional approximation of the push-forward from finite +discrete data, through the theory of the Fourier--Borel transform and the Fock +space. Moreover, we prove a rigorous convergence result with a convergence +rate. As an application, we prove that it is not the least-squares polynomial, +but the polynomial obtained by truncating its higher-degree terms, that +approximates analytic functions and further allows for approximation beyond the +support of the data distribution. One advantage of our theory is that it +enables us to apply linear algebraic operations to the finite-dimensional +approximation of the push-forward. Utilizing this, we prove the convergence of +a method for approximating an analytic vector field from finite data of the +flow map of an ordinary differential equation. + +
+
+ comment: 32 pages, 2 figures. We modified results. Comments are welcome. +
+
+
+
+
+ + ♻ ☆ Multi-Fidelity Active Learning with GFlowNets + + +
+ In the last decades, the capacity to generate large amounts of data in +science and engineering applications has been growing steadily. Meanwhile, +machine learning has progressed to become a suitable tool to process and +utilise the available data. Nonetheless, many relevant scientific and +engineering problems present challenges where current machine learning methods +cannot yet efficiently leverage the available data and resources. For example, +in scientific discovery, we are often faced with the problem of exploring very +large, structured and high-dimensional spaces. Moreover, the high fidelity, +black-box objective function is often very expensive to evaluate. Progress in +machine learning methods that can efficiently tackle such challenges would help +accelerate currently crucial areas such as drug and materials discovery. In +this paper, we propose a multi-fidelity active learning algorithm with +GFlowNets as a sampler, to efficiently discover diverse, high-scoring +candidates where multiple approximations of the black-box function are +available at lower fidelity and cost. Our evaluation on molecular discovery +tasks shows that multi-fidelity active learning with GFlowNets can discover +high-scoring candidates at a fraction of the budget of its single-fidelity +counterpart while maintaining diversity, unlike RL-based alternatives. These +results open new avenues for multi-fidelity active learning to accelerate +scientific discovery and engineering design. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR) 07/2024 + https://openreview.net/forum?id=dLaazW9zuF +
+
+
+
+
+ + ♻ ☆ Bridging the Sim-to-Real Gap with Bayesian Inference + + +
+ We present SIM-FSVGD for learning robot dynamics from data. As opposed to +traditional methods, SIM-FSVGD leverages low-fidelity physical priors, e.g., in +the form of simulators, to regularize the training of neural network models. +While learning accurate dynamics already in the low data regime, SIM-FSVGD +scales and excels also when more data is available. We empirically show that +learning with implicit physical priors results in accurate mean model +estimation as well as precise uncertainty quantification. We demonstrate the +effectiveness of SIM-FSVGD in bridging the sim-to-real gap on a +high-performance RC racecar system. Using model-based RL, we demonstrate a +highly dynamic parking maneuver with drifting, using less than half the data +compared to the state of the art. + +
+
+
+
+
+ + ♻ ☆ Quantum Implicit Neural Representations + + +
+ Implicit neural representations have emerged as a powerful paradigm to +represent signals such as images and sounds. This approach aims to utilize +neural networks to parameterize the implicit function of the signal. However, +when representing implicit functions, traditional neural networks such as +ReLU-based multilayer perceptrons face challenges in accurately modeling +high-frequency components of signals. Recent research has begun to explore the +use of Fourier Neural Networks (FNNs) to overcome this limitation. In this +paper, we propose Quantum Implicit Representation Network (QIREN), a novel +quantum generalization of FNNs. Furthermore, through theoretical analysis, we +demonstrate that QIREN possesses a quantum advantage over classical FNNs. +Lastly, we conducted experiments in signal representation, image +superresolution, and image generation tasks to show the superior performance of +QIREN compared to state-of-the-art (SOTA) models. Our work not only +incorporates quantum advantages into implicit neural representations but also +uncovers a promising application direction for Quantum Neural Networks. + +
+
+ comment: This paper was accepted by ICML 2024 +
+
+
+
+
+ + ♻ ☆ Uncertainty Modeling in Graph Neural Networks via Stochastic + Differential Equations + + +
+ We address the problem of learning uncertainty-aware representations for +graph-structured data. While Graph Neural Ordinary Differential Equations +(GNODE) are effective in learning node representations, they fail to quantify +uncertainty. To address this, we introduce Latent Graph Neural Stochastic +Differential Equations (LGNSDE), which enhance GNODE by embedding randomness +through Brownian motion to quantify uncertainty. We provide theoretical +guarantees for LGNSDE and empirically show better performance in uncertainty +quantification. + +
+
+ comment: 9 pages including appendix +
+
+
+
+
+ + ♻ ☆ Downlink CCM Estimation via Representation Learning with Graph + Regularization + + +
+ In this paper, we propose an algorithm for downlink (DL) channel covariance +matrix (CCM) estimation for frequency division duplexing (FDD) massive +multiple-input multiple-output (MIMO) communication systems with base station +(BS) possessing a uniform linear array (ULA) antenna structure. We consider a +setting where the UL CCM is mapped to DL CCM by a mapping function. We first +present a theoretical error analysis of learning a nonlinear embedding by +constructing a mapping function, which points to the importance of the +Lipschitz regularity of the mapping function for achieving high estimation +performance. Then, based on the theoretical ground, we propose a representation +learning algorithm as a solution for the estimation problem, where Gaussian RBF +kernel interpolators are chosen to map UL CCMs to their DL counterparts. The +proposed algorithm is based on the optimization of an objective function that +fits a regression model between the DL CCM and UL CCM samples in the training +dataset and preserves the local geometric structure of the data in the UL CCM +space, while explicitly regulating the Lipschitz continuity of the mapping +function in light of our theoretical findings. The proposed algorithm surpasses +benchmark methods in terms of three error metrics as shown by simulations. + +
+
+
+
+
+ + ♻ ☆ Editing Personality for Large Language Models NLPCC 2024 + + +
+ This paper introduces an innovative task focused on editing the personality +traits of Large Language Models (LLMs). This task seeks to adjust the models' +responses to opinion-related questions on specified topics since an +individual's personality often manifests in the form of their expressed +opinions, thereby showcasing different personality traits. Specifically, we +construct PersonalityEdit, a new benchmark dataset to address this task. +Drawing on the theory in Social Psychology, we isolate three representative +traits, namely Neuroticism, Extraversion, and Agreeableness, as the foundation +for our benchmark. We then gather data using GPT-4, generating responses that +align with a specified topic and embody the targeted personality trait. We +conduct comprehensive experiments involving various baselines and discuss the +representation of personality behavior in LLMs. Our findings uncover potential +challenges of the proposed task, illustrating several remaining issues. We +anticipate that our work can stimulate further annotation in model editing and +personality-related research. Code is available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: NLPCC 2024 +
+
+
+
+
+ + ♻ ☆ UniHPF : Universal Healthcare Predictive Framework with Zero Domain + Knowledge ML4H + + +
+ Despite the abundance of Electronic Healthcare Records (EHR), its +heterogeneity restricts the utilization of medical data in building predictive +models. To address this challenge, we propose Universal Healthcare Predictive +Framework (UniHPF), which requires no medical domain knowledge and minimal +pre-processing for multiple prediction tasks. Experimental results demonstrate +that UniHPF is capable of building large-scale EHR models that can process any +form of medical data from distinct EHR systems. We believe that our findings +can provide helpful insights for further research on the multi-source learning +of EHRs. + +
+
+ comment: The original paper is published on Journal of Biomedical and Health + Informatics(JBHI) 2023, https://ieeexplore.ieee.org/document/10298642. + Extended Abstract presented at Machine Learning for Health (ML4H) symposium + 2022, November 28th, 2022, New Orleans, United States, 19 pages(main paper 6 + pages). arXiv admin note: substantial text overlap with arXiv:2207.09858 +
+
+
+
+
+ + ♻ ☆ Amplifying Training Data Exposure through Fine-Tuning with + Pseudo-Labeled Memberships + + +
+ Neural language models (LMs) are vulnerable to training data extraction +attacks due to data memorization. This paper introduces a novel attack scenario +wherein an attacker adversarially fine-tunes pre-trained LMs to amplify the +exposure of the original training data. This strategy differs from prior +studies by aiming to intensify the LM's retention of its pre-training dataset. +To achieve this, the attacker needs to collect generated texts that are closely +aligned with the pre-training data. However, without knowledge of the actual +dataset, quantifying the amount of pre-training data within generated texts is +challenging. To address this, we propose the use of pseudo-labels for these +generated texts, leveraging membership approximations indicated by +machine-generated probabilities from the target LM. We subsequently fine-tune +the LM to favor generations with higher likelihoods of originating from the +pre-training data, based on their membership probabilities. Our empirical +findings indicate a remarkable outcome: LMs with over 1B parameters exhibit a +four to eight-fold increase in training data exposure. We discuss potential +mitigations and suggest future research directions. + +
+
+ comment: 20 pages, 6 figures, 15 tables +
+
+
+
+
+ + ♻ ☆ AirPilot: Interpretable PPO-based DRL Auto-Tuned Nonlinear PID Drone + Controller for Robust Autonomous Flights + + +
+ Navigation precision, speed and stability are crucial for safe Unmanned
+Aerial Vehicle (UAV) flight maneuvers and effective flight mission execution
+in dynamic environments. Different flight missions may have varying objectives,
+such as minimizing energy consumption, achieving precise positioning, or
+maximizing speed. A controller that can adapt to different objectives on the
+fly is highly valuable. Proportional Integral Derivative (PID) controllers are
+among the most popular and widely used control algorithms for drones and other
+control systems, but their linear control algorithm fails to capture the
+nonlinear nature of dynamic wind conditions and the complex drone system.
+Manually tuning the PID gains for various missions can be time-consuming and
+requires significant expertise. This paper aims to revolutionize drone flight
+control by presenting AirPilot, a nonlinear Deep Reinforcement Learning
+(DRL)-enhanced Proportional Integral Derivative (PID) drone controller using
+Proximal Policy Optimization (PPO). The AirPilot controller combines the
+simplicity and effectiveness of traditional PID control with the adaptability,
+learning capability, and optimization potential of DRL. This makes it better
+suited for modern drone applications where the environment is dynamic and
+mission-specific performance demands are high. We employed a COEX Clover
+autonomous drone for training the DRL agent within the simulator and
+implemented it in a real-world lab setting, which marks a significant milestone
+as one of the first attempts to apply a DRL-based flight controller on an
+actual drone. AirPilot reduces the navigation error of the default PX4 PID
+position controller by 90%, improves the effective navigation speed of a
+fine-tuned PID controller by 21%, and reduces settling time and overshoot by
+17% and 16%, respectively.
+
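For reference, the controller being augmented is an ordinary PID loop; the toy class below exposes the three gains that a learned policy (PPO, in AirPilot's case) could retune online. No DRL code is included, and the structure is illustrative rather than the paper's implementation.

```python
class PID:
    """Textbook PID step; kp, ki, kd are the gains a learned agent would adjust."""
    def __init__(self, kp: float, ki: float, kd: float):
        self.kp, self.ki, self.kd = kp, ki, kd
        self.integral = 0.0
        self.prev_error = 0.0

    def set_gains(self, kp: float, ki: float, kd: float) -> None:
        """Apply gains produced by the policy (e.g., once per control cycle)."""
        self.kp, self.ki, self.kd = kp, ki, kd

    def step(self, error: float, dt: float) -> float:
        self.integral += error * dt
        derivative = (error - self.prev_error) / dt
        self.prev_error = error
        return self.kp * error + self.ki * self.integral + self.kd * derivative
```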
+
+ comment: 9 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ The AI Scientist: Towards Fully Automated Open-Ended Scientific + Discovery + + +
+ One of the grand challenges of artificial general intelligence is developing +agents capable of conducting scientific research and discovering new knowledge. +While frontier models have already been used as aides to human scientists, e.g. +for brainstorming ideas, writing code, or prediction tasks, they still conduct +only a small part of the scientific process. This paper presents the first +comprehensive framework for fully automatic scientific discovery, enabling +frontier large language models to perform research independently and +communicate their findings. We introduce The AI Scientist, which generates +novel research ideas, writes code, executes experiments, visualizes results, +describes its findings by writing a full scientific paper, and then runs a +simulated review process for evaluation. In principle, this process can be +repeated to iteratively develop ideas in an open-ended fashion, acting like the +human scientific community. We demonstrate its versatility by applying it to +three distinct subfields of machine learning: diffusion modeling, +transformer-based language modeling, and learning dynamics. Each idea is +implemented and developed into a full paper at a cost of less than $15 per +paper. To evaluate the generated papers, we design and validate an automated +reviewer, which we show achieves near-human performance in evaluating paper +scores. The AI Scientist can produce papers that exceed the acceptance +threshold at a top machine learning conference as judged by our automated +reviewer. This approach signifies the beginning of a new era in scientific +discovery in machine learning: bringing the transformative benefits of AI +agents to the entire research process of AI itself, and taking us closer to a +world where endless affordable creativity and innovation can be unleashed on +the world's most challenging problems. Our code is open-sourced at +https://github.com/SakanaAI/AI-Scientist + +
+
+
+
+
+ + ♻ ☆ Enhancing Training Efficiency Using Packing with Flash Attention + + +
+ Padding is often used when tuning LLMs by adding special tokens to shorter
+training examples to match the length of the longest sequence in each batch.
+While this ensures uniformity for batch processing, it introduces
+inefficiencies by including irrelevant padding tokens in the computation and
+wastes GPU resources. The Hugging Face SFT trainer has always offered the option
+to use packing to combine multiple training examples, allowing for maximal
+utilization of GPU resources. However, until now, it did not offer proper
+masking of each packed training example. This capability has been added to
+Hugging Face Transformers 4.44. We analyse this new feature and show the
+benefits across different variations of packing.
+
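A hand-rolled illustration of the packing idea discussed above: examples are concatenated without padding and position ids restart at every example boundary so that, combined with a boundary-aware attention kernel, packed examples do not attend to one another. Recent Hugging Face releases ship a data collator for this; the helper below is only a conceptual stand-in, not the library's API.

```python
from typing import Dict, List

def pack_examples(tokenized_examples: List[List[int]],
                  max_len: int = 4096) -> Dict[str, List[int]]:
    """Concatenate as many examples as fit, restarting position_ids per example."""
    input_ids: List[int] = []
    position_ids: List[int] = []
    labels: List[int] = []
    for ids in tokenized_examples:
        if len(input_ids) + len(ids) > max_len:
            break
        input_ids.extend(ids)
        position_ids.extend(range(len(ids)))   # reset positions at each boundary
        labels.extend(ids)                     # causal-LM labels; the model shifts them
    return {"input_ids": input_ids, "position_ids": position_ids, "labels": labels}
```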
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ MetaDigiHuman: Haptic Interfaces for Digital Humans in Metaverse + + +
+ The way we engage with digital spaces and the digital world has undergone +rapid changes in recent years, largely due to the emergence of the Metaverse. +As technology continues to advance, the demand for sophisticated and immersive +interfaces to interact with the Metaverse has become increasingly crucial. +Haptic interfaces have been developed to meet this need and provide users with +tactile feedback and realistic touch sensations. These interfaces play a vital +role in creating a more authentic and immersive experience within the +Metaverse. This article introduces the concept of MetaDigiHuman, a +groundbreaking framework that combines blended digital humans and haptic +interfaces. By harnessing cutting-edge technologies, MetaDigiHuman enables +seamless and immersive interaction within the Metaverse. Through this +framework, users can simulate the sensation of touching, feeling, and +interacting with digital beings as if they were physically present in the +environments, offering a more compelling and immersive experience within the +Metaverse. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Multimodal Multi-turn Conversation Stance Detection: A Challenge Dataset + and Effective Model ACM MM2024 + + +
+ Stance detection, which aims to identify public opinion towards specific
+targets using social media data, is an important yet challenging task. With the
+proliferation of diverse multimodal social media content, including text and
+images, multimodal stance detection (MSD) has become a crucial research area.
+However, existing MSD studies have focused on modeling stance within individual
+text-image pairs, overlooking the multi-party conversational contexts that
+naturally occur on social media. This limitation stems from a lack of datasets
+that authentically capture such conversational scenarios, hindering progress in
+conversational MSD. To address this, we introduce a new multimodal multi-turn
+conversational stance detection dataset (called MmMtCSD). To derive stances
+from this challenging dataset, we propose a novel multimodal large language
+model stance detection framework (MLLM-SD) that learns joint stance
+representations from textual and visual modalities. Experiments on MmMtCSD show
+state-of-the-art performance of our proposed MLLM-SD approach for multimodal
+stance detection. We believe that MmMtCSD will contribute to advancing
+real-world applications of stance detection research.
+
+
+ comment: ACM MM2024 +
+
+
+
+
+ + ♻ ☆ Harmonizing Attention: Training-free Texture-aware Geometry Transfer WACV2025 + + +
+ Extracting geometry features from photographic images independently of +surface texture and transferring them onto different materials remains a +complex challenge. In this study, we introduce Harmonizing Attention, a novel +training-free approach that leverages diffusion models for texture-aware +geometry transfer. Our method employs a simple yet effective modification of +self-attention layers, allowing the model to query information from multiple +reference images within these layers. This mechanism is seamlessly integrated +into the inversion process as Texture-aligning Attention and into the +generation process as Geometry-aligning Attention. This dual-attention approach +ensures the effective capture and transfer of material-independent geometry +features while maintaining material-specific textural continuity, all without +the need for model fine-tuning. + +
+
+ comment: Accepted at WACV2025 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 15 + +
+
+
+ + ♻ ☆ Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in + medicine + + +
+ Recent studies indicate that Generative Pre-trained Transformer 4 with Vision
+(GPT-4V) outperforms human physicians in medical challenge tasks. However,
+these evaluations primarily focused on the accuracy of multi-choice questions
+alone. Our study extends the current scope by conducting a comprehensive
+analysis of GPT-4V's rationales of image comprehension, recall of medical
+knowledge, and step-by-step multimodal reasoning when solving New England
+Journal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test
+the knowledge and diagnostic capabilities of medical professionals. Evaluation
+results confirmed that GPT-4V performs comparably to human physicians
+regarding multi-choice accuracy (81.6% vs. 77.8%). GPT-4V also performs well in
+cases that physicians answer incorrectly, with over 78% accuracy. However, we
+discovered that GPT-4V frequently presents flawed rationales in cases where it
+makes the correct final choices (35.5%), most prominently in image comprehension
+(27.2%). Despite GPT-4V's high accuracy in multi-choice questions, our
+findings emphasize the necessity for further in-depth evaluations of its
+rationales before integrating such multimodal AI models into clinical
+workflows.
+
+
+
+
+
+ + ♻ ☆ Diffusion Explainer: Visual Explanation for Text-to-image Stable + Diffusion + + +
+ Diffusion-based generative models' impressive ability to create convincing +images has garnered global attention. However, their complex structures and +operations often pose challenges for non-experts to grasp. We present Diffusion +Explainer, the first interactive visualization tool that explains how Stable +Diffusion transforms text prompts into images. Diffusion Explainer tightly +integrates a visual overview of Stable Diffusion's complex structure with +explanations of the underlying operations. By comparing image generation of +prompt variants, users can discover the impact of keyword changes on image +generation. A 56-participant user study demonstrates that Diffusion Explainer +offers substantial learning benefits to non-experts. Our tool has been used by +over 10,300 users from 124 countries at +https://poloclub.github.io/diffusion-explainer/. + +
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ The Foundational Capabilities of Large Language Models in Predicting + Postoperative Risks Using Clinical Notes + + +
+ Clinical notes recorded during a patient's perioperative journey hold
+immense informational value. Advances in large language models (LLMs) offer
+opportunities to leverage this information. Using 84,875 pre-operative notes and
+their associated surgical cases from 2018 to 2021, we examine the performance of
+LLMs in predicting six postoperative risks using various fine-tuning strategies.
+Pretrained LLMs outperformed traditional word embeddings by an absolute AUROC
+of 38.3% and AUPRC of 33.2%. Self-supervised fine-tuning further improved
+performance by 3.2% and 1.5%. Incorporating labels into training further
+increased AUROC by 1.8% and AUPRC by 2%. The highest performance was achieved
+with a unified foundation model, with improvements of 3.6% for AUROC and 2.6%
+for AUPRC compared to self-supervision, highlighting the foundational
+capabilities of LLMs in predicting postoperative risks, which could be
+potentially beneficial when deployed for perioperative care.
+
+
+ comment: Codes are publicly available at: + https://github.com/cja5553/LLMs_in_perioperative_care +
+
+
+
+
+ + ♻ ☆ A Survey of Neural Code Intelligence: Paradigms, Advances and Beyond + + +
+ Neural Code Intelligence -- leveraging deep learning to understand, generate, +and optimize code -- holds immense potential for transformative impacts on the +whole society. Bridging the gap between Natural Language and Programming +Language, this domain has drawn significant attention from researchers in both +research communities over the past few years. This survey presents a systematic +and chronological review of the advancements in code intelligence, encompassing +over 50 representative models and their variants, more than 20 categories of +tasks, and an extensive coverage of over 680 related works. We follow the +historical progression to trace the paradigm shifts across different research +phases (e.g., from modeling code with recurrent neural networks to the era of +Large Language Models). Concurrently, we highlight the major technical +transitions in models, tasks, and evaluations spanning through different +stages. For applications, we also observe a co-evolving shift. It spans from +initial endeavors to tackling specific scenarios, through exploring a diverse +array of tasks during its rapid expansion, to currently focusing on tackling +increasingly complex and varied real-world challenges. Building on our +examination of the developmental trajectories, we further investigate the +emerging synergies between code intelligence and broader machine intelligence, +uncovering new cross-domain opportunities and illustrating the substantial +influence of code intelligence across various domains. Finally, we delve into +both the opportunities and challenges associated with this field, alongside +elucidating our insights on the most promising research directions. An ongoing, +dynamically updated project and resources associated with this survey have been +released at https://github.com/QiushiSun/NCISurvey. + +
+
+ comment: 64 pages, 6 figures, 10 tables, 695 references +
+
+
+
+
+ + ♻ ☆ Characterizing Online Toxicity During the 2022 Mpox Outbreak: A + Computational Analysis of Topical and Network Dynamics + + +
+ Background: Online toxicity, encompassing behaviors such as harassment, +bullying, hate speech, and the dissemination of misinformation, has become a +pressing social concern in the digital age. The 2022 Mpox outbreak, initially +termed "Monkeypox" but subsequently renamed to mitigate associated stigmas and +societal concerns, serves as a poignant backdrop to this issue. Objective: In +this research, we undertake a comprehensive analysis of the toxic online +discourse surrounding the 2022 Mpox outbreak. Our objective is to dissect its +origins, characterize its nature and content, trace its dissemination patterns, +and assess its broader societal implications, with the goal of providing +insights that can inform strategies to mitigate such toxicity in future crises. +Methods: We collected more than 1.6 million unique tweets and analyzed them +from five dimensions, including context, extent, content, speaker, and intent. +Utilizing BERT-based topic modeling and social network community clustering, we +delineated the toxic dynamics on Twitter. Results: We identified five +high-level topic categories in the toxic online discourse on Twitter, including +disease (46.6%), health policy and healthcare (19.3%), homophobia (23.9%), +politics (6.0%), and racism (4.1%). Through the toxicity diffusion networks of +mentions, retweets, and the top users, we found that retweets of toxic content +were widespread, while influential users rarely engaged with or countered this +toxicity through retweets. Conclusions: By tracking topical dynamics, we can +track the changing popularity of toxic content online, providing a better +understanding of societal challenges. Network dynamics spotlight key social +media influencers and their intents, indicating that addressing these central +figures in toxic discourse can enhance crisis communication and inform +policy-making. + +
+
+ comment: 36 pages, 8 figure, and 12 tables +
+
+
+
+
+ + ♻ ☆ Is There No Such Thing as a Bad Question? H4R: HalluciBot For + Ratiocination, Rewriting, Ranking, and Routing + + +
+ Hallucination continues to be one of the most critical challenges in the +institutional adoption journey of Large Language Models (LLMs). While prior +studies have primarily focused on the post-generation analysis and refinement +of outputs, this paper centers on the effectiveness of queries in eliciting +accurate responses from LLMs. We present HalluciBot, a model that estimates the +query's propensity to hallucinate before generation, without invoking any LLMs +during inference. HalluciBot can serve as a proxy reward model for query +rewriting, offering a general framework to estimate query quality based on +accuracy and consensus. In essence, HalluciBot investigates how poorly +constructed queries can lead to erroneous outputs - moreover, by employing +query rewriting guided by HalluciBot's empirical estimates, we demonstrate that +95.7% output accuracy can be achieved for Multiple Choice questions. The +training procedure for HalluciBot consists of perturbing 369,837 queries n +times, employing n+1 independent LLM agents, sampling an output from each +query, conducting a Multi-Agent Monte Carlo simulation on the sampled outputs, +and training an encoder classifier. The idea of perturbation is the outcome of +our ablation studies that measures the increase in output diversity (+12.5 +agreement spread) by perturbing a query in lexically different but semantically +similar ways. Therefore, HalluciBot paves the way to ratiocinate (76.0% test F1 +score, 46.6% in saved computation on hallucinatory queries), rewrite (+30.2% +positive class transition from hallucinatory to non-hallucinatory), rank +(+50.6% positive class transition from hallucinatory to non-hallucinatory), and +route queries to effective pipelines. + +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models for Health-related Queries with + Presuppositions ACL 2024 + + +
+ As corporations rush to integrate large language models (LLMs) to their +search offerings, it is critical that they provide factually accurate +information that is robust to any presuppositions that a user may express. In +this work, we introduce UPHILL, a dataset consisting of health-related queries +with varying degrees of presuppositions. Using UPHILL, we evaluate the factual +accuracy and consistency of InstructGPT, ChatGPT, and BingChat models. We find +that while model responses rarely disagree with true health claims (posed as +questions), they often fail to challenge false claims: responses from +InstructGPT agree with 32% of the false claims, ChatGPT 26% and BingChat 23%. +As we increase the extent of presupposition in input queries, the responses +from InstructGPT and ChatGPT agree with the claim considerably more often, +regardless of its veracity. Responses from BingChat, which rely on retrieved +webpages, are not as susceptible. Given the moderate factual accuracy, and the +inability of models to consistently correct false assumptions, our work calls +for a careful assessment of current LLMs for use in high-stakes scenarios. + +
+
+ comment: Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Towards a theory of how the structure of language is acquired by deep + neural networks + + +
+ How much data is required to learn the structure of a language via next-token +prediction? We study this question for synthetic datasets generated via a +Probabilistic Context-Free Grammar (PCFG) -- a tree-like generative model that +captures many of the hierarchical structures found in natural languages. We +determine token-token correlations analytically in our model and show that they +can be used to build a representation of the grammar's hidden variables, the +longer the range the deeper the variable. In addition, a finite training set +limits the resolution of correlations to an effective range, whose size grows +with that of the training set. As a result, a Language Model trained with +increasingly many examples can build a deeper representation of the grammar's +structure, thus reaching good performance despite the high dimensionality of +the problem. We conjecture that the relationship between training set size and +effective range of correlations holds beyond our synthetic datasets. In +particular, our conjecture predicts how the scaling law for the test loss +behaviour with training set size depends on the length of the context window, +which we confirm empirically in Shakespeare's plays and Wikipedia articles. + +
+
+ comment: 9 pages, 4 figures (main) +
+
+
+
+
+ + ♻ ☆ Adversarial Representation with Intra-Modal and Inter-Modal Graph + Contrastive Learning for Multimodal Emotion Recognition + + +
+ With the release of an increasing number of open-source emotion recognition
+datasets on social media platforms and the rapid development of computing
+resources, multimodal emotion recognition (MER) tasks have begun to receive
+widespread research attention. The MER task extracts and fuses complementary
+semantic information from different modalities in order to classify the
+speaker's emotions. However, existing feature fusion methods have usually mapped
+the features of different modalities into the same feature space for information
+fusion, which cannot eliminate the heterogeneity between different modalities.
+This makes the subsequent learning of emotion class boundaries challenging. To
+tackle the above problems, we propose a novel Adversarial Representation with
+Intra-Modal and Inter-Modal Graph Contrastive Learning for Multimodal Emotion
+Recognition (AR-IIGCN) method. Firstly, we input video, audio, and text features
+into a multi-layer perceptron (MLP) to map them into separate feature spaces.
+Secondly, we build a generator and a discriminator for the three modal features
+through adversarial representation, which can achieve information interaction
+between modalities and eliminate heterogeneity among modalities. Thirdly, we
+introduce contrastive graph representation learning to capture intra-modal and
+inter-modal complementary semantic information and learn intra-class and
+inter-class boundary information of emotion categories. Specifically, we
+construct a graph structure for the three modal features and perform contrastive
+representation learning on nodes with different emotions in the same modality
+and the same emotion in different modalities, which can improve the feature
+representation ability of nodes. Extensive experiments show that the AR-IIGCN
+method can significantly improve emotion recognition accuracy on the IEMOCAP and
+MELD datasets.
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Efficient Long-distance Latent Relation-aware Graph Neural Network for + Multi-modal Emotion Recognition in Conversations + + +
+ The task of multi-modal emotion recognition in conversation (MERC) aims to +analyze the genuine emotional state of each utterance based on the multi-modal +information in the conversation, which is crucial for conversation +understanding. Existing methods focus on using graph neural networks (GNN) to +model conversational relationships and capture contextual latent semantic +relationships. However, due to the complexity of GNN, existing methods cannot +efficiently capture the potential dependencies between long-distance +utterances, which limits the performance of MERC. In this paper, we propose an +Efficient Long-distance Latent Relation-aware Graph Neural Network (ELR-GNN) +for multi-modal emotion recognition in conversations. Specifically, we first +use pre-extracted text, video and audio features as input to Bi-LSTM to capture +contextual semantic information and obtain low-level utterance features. Then, +we use low-level utterance features to construct a conversational emotion +interaction graph. To efficiently capture the potential dependencies between +long-distance utterances, we use the dilated generalized forward push algorithm +to precompute the emotional propagation between global utterances and design an +emotional relation-aware operator to capture the potential semantic +associations between different utterances. Furthermore, we combine early fusion +and adaptive late fusion mechanisms to fuse latent dependency information +between speaker relationship information and context. Finally, we obtain +high-level discourse features and feed them into MLP for emotion prediction. +Extensive experimental results show that ELR-GNN achieves state-of-the-art +performance on the benchmark datasets IEMOCAP and MELD, with running times +reduced by 52\% and 35\%, respectively. + +
+
+ comment: 11 pages, 3 tables +
+
+
+
+
+ + ♻ ☆ DER-GCN: Dialogue and Event Relation-Aware Graph Convolutional Neural + Network for Multimodal Dialogue Emotion Recognition + + +
+ With the continuous development of deep learning (DL), the task of multimodal
+dialogue emotion recognition (MDER), an essential branch of DL, has recently
+received extensive research attention. MDER aims to identify the emotional
+information contained in different modalities, e.g., text, video, and audio, in
+different dialogue scenes. However, existing research has focused on modeling
+contextual semantic information and dialogue relations between speakers while
+ignoring the impact of event relations on emotion. To tackle the above issues,
+we propose a novel Dialogue and Event Relation-Aware Graph Convolutional Neural
+Network for Multimodal Emotion Recognition (DER-GCN) method. It models dialogue
+relations between speakers and captures latent event relation information.
+Specifically, we construct a weighted multi-relationship graph to simultaneously
+capture the dependencies between speakers and event relations in a dialogue.
+Moreover, we also introduce a Self-Supervised Masked Graph Autoencoder (SMGAE)
+to improve the fusion representation ability of features and structures. Next,
+we design a new Multiple Information Transformer (MIT) to capture the
+correlation between different relations, which provides a better fusion of the
+multivariate information between relations. Finally, we propose a loss
+optimization strategy based on contrastive learning to enhance the
+representation learning ability of minority class features. We conduct extensive
+experiments on the IEMOCAP and MELD benchmark datasets, which verify the
+effectiveness of the DER-GCN model. The results demonstrate that our model
+significantly improves both the average accuracy and the F1 score of emotion
+recognition.
+
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Towards Tracing Trustworthiness Dynamics: Revisiting Pre-training Period + of Large Language Models ACL 2024 + + +
+ Ensuring the trustworthiness of large language models (LLMs) is crucial. Most
+studies concentrate on fully pre-trained LLMs to better understand and improve
+LLMs' trustworthiness. In this paper, to reveal the untapped potential of
+pre-training, we pioneer the exploration of LLMs' trustworthiness during this
+period, focusing on five key dimensions: reliability, privacy, toxicity,
+fairness, and robustness. To begin with, we apply linear probing to LLMs. The
+high probing accuracy suggests that LLMs in early pre-training can already
+distinguish concepts in each trustworthiness dimension. Therefore, to further
+uncover the hidden possibilities of pre-training, we extract steering vectors
+from an LLM's pre-training checkpoints to enhance the LLM's trustworthiness.
+Finally, inspired by Choi et al. (2023), who show that mutual information
+estimation is bounded by linear probing accuracy, we also probe LLMs with mutual
+information to investigate the dynamics of trustworthiness during pre-training.
+We are the first to observe a similar two-phase phenomenon: fitting and
+compression (Shwartz-Ziv and Tishby, 2017). This research provides an initial
+exploration of trustworthiness modeling during LLM pre-training, seeking to
+unveil new insights and spur further developments in the field. We will make our
+code publicly accessible at https://github.com/ChnQ/TracingLLM.
+
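Linear probing, as used above, amounts to freezing the model, extracting hidden states at a checkpoint, and fitting a linear classifier per trustworthiness dimension. The sketch below uses random arrays in place of real activations and concept labels.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

hidden_states = np.random.randn(500, 768)    # stand-in for frozen-LM activations
labels = np.random.randint(0, 2, size=500)   # stand-in concept labels (e.g., toxic vs. not)

X_train, X_test, y_train, y_test = train_test_split(hidden_states, labels,
                                                    random_state=0)
probe = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("probing accuracy:", probe.score(X_test, y_test))
```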
+
+ comment: Accepted at ACL 2024 +
+
+
+
+
+ + ♻ ☆ A Novel ICD Coding Method Based on Associated and Hierarchical Code + Description Distillation + + +
+ ICD (International Classification of Diseases) coding involves assigning ICD
+codes to patient visits based on their medical notes. ICD coding is a
+challenging multilabel text classification problem due to noisy medical
+document inputs. Recent advancements in automated ICD coding have enhanced
+performance by integrating additional data and knowledge bases with the
+encoding of medical notes and codes. However, most of them ignore the code
+hierarchy, leading to improper code assignments. To address these problems, we
+propose a novel framework based on associated and hierarchical code description
+distillation (AHDD) for better code representation learning and avoidance of
+improper code assignment. The framework leverages the code descriptions and the
+hierarchical structure inherent to the ICD codes, and the code descriptions are
+also applied to inform the attention layer and the output layer. Experimental
+results on the benchmark dataset show the superiority of the proposed framework
+over several state-of-the-art baselines.
+
+
+
+
+ + ♻ ☆ FENICE: Factuality Evaluation of summarization based on Natural language + Inference and Claim Extraction ACL 2024 + + +
+ Recent advancements in text summarization, particularly with the advent of +Large Language Models (LLMs), have shown remarkable performance. However, a +notable challenge persists as a substantial number of automatically-generated +summaries exhibit factual inconsistencies, such as hallucinations. In response +to this issue, various approaches for the evaluation of consistency for +summarization have emerged. Yet, these newly-introduced metrics face several +limitations, including lack of interpretability, focus on short document +summaries (e.g., news articles), and computational impracticality, especially +for LLM-based metrics. To address these shortcomings, we propose Factuality +Evaluation of summarization based on Natural language Inference and Claim +Extraction (FENICE), a more interpretable and efficient factuality-oriented +metric. FENICE leverages an NLI-based alignment between information in the +source document and a set of atomic facts, referred to as claims, extracted +from the summary. Our metric sets a new state of the art on AGGREFACT, the +de-facto benchmark for factuality evaluation. Moreover, we extend our +evaluation to a more challenging setting by conducting a human annotation +process of long-form summarization. In the hope of fostering research in +summarization factuality evaluation, we release the code of our metric and our +factuality annotations of long-form summarization at +https://github.com/Babelscape/FENICE. + +
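An NLI-based alignment of this kind can be approximated with an off-the-shelf MNLI cross-encoder; the checkpoint name, sentence-level splitting, and max-entailment aggregation below are illustrative assumptions rather than the exact FENICE pipeline (check model.config.id2label for the entailment index of whichever checkpoint is used).

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

name = "roberta-large-mnli"  # assumed off-the-shelf NLI checkpoint
tok = AutoTokenizer.from_pretrained(name)
nli = AutoModelForSequenceClassification.from_pretrained(name).eval()
ENTAIL = 2  # entailment label index for this checkpoint; verify via nli.config.id2label

def claim_score(source_sentences, claim):
    """Max entailment probability of a claim against any source sentence."""
    best = 0.0
    for premise in source_sentences:
        inputs = tok(premise, claim, return_tensors="pt", truncation=True)
        with torch.no_grad():
            probs = nli(**inputs).logits.softmax(-1)[0]
        best = max(best, probs[ENTAIL].item())
    return best

source = ["The company reported a 12% revenue increase in Q3.",
          "Its CEO attributed the growth to strong cloud sales."]
claims = ["Revenue grew by 12% in the third quarter.", "The CEO resigned in Q3."]
print([round(claim_score(source, c), 3) for c in claims])  # the second claim should score low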
+
+ comment: ACL 2024 camera ready. Code and data at + https://github.com/Babelscape/FENICE +
+
+
+
+
+ + ♻ ☆ ConSiDERS-The-Human Evaluation Framework: Rethinking Human Evaluation + for Generative Large Language Models ACL 2024 + + +
+ In this position paper, we argue that human evaluation of generative large +language models (LLMs) should be a multidisciplinary undertaking that draws +upon insights from disciplines such as user experience research and human +behavioral psychology to ensure that the experimental design and results are +reliable. The conclusions from these evaluations, thus, must consider factors +such as usability, aesthetics, and cognitive biases. We highlight how cognitive +biases can conflate fluent information and truthfulness, and how cognitive +uncertainty affects the reliability of rating scores such as Likert. +Furthermore, the evaluation should differentiate the capabilities and +weaknesses of increasingly powerful large language models -- which requires +effective test sets. The scalability of human evaluation is also crucial to +wider adoption. Hence, to design an effective human evaluation system in the +age of generative NLP, we propose the ConSiDERS-The-Human evaluation framework +consisting of 6 pillars -- Consistency, Scoring Criteria, Differentiating, User +Experience, Responsible, and Scalability. + +
+
+ comment: Accepted in ACL 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 17 + +
+
+
+ + ♻ ☆ Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in + medicine + + +
+ Recent studies indicate that Generative Pre-trained Transformer 4 with Vision +(GPT-4V) outperforms human physicians in medical challenge tasks. However, +these evaluations primarily focused on the accuracy of multi-choice questions +alone. Our study extends the current scope by conducting a comprehensive +analysis of GPT-4V's rationales of image comprehension, recall of medical +knowledge, and step-by-step multimodal reasoning when solving New England +Journal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test +the knowledge and diagnostic capabilities of medical professionals. Evaluation +results confirmed that GPT-4V performs comparatively to human physicians +regarding multi-choice accuracy (81.6% vs. 77.8%). GPT-4V also performs well in +cases where physicians incorrectly answer, with over 78% accuracy. However, we +discovered that GPT-4V frequently presents flawed rationales in cases where it +makes the correct final choices (35.5%), most prominent in image comprehension +(27.2%). Regardless of GPT-4V's high accuracy in multi-choice questions, our +findings emphasize the necessity for further in-depth evaluations of its +rationales before integrating such multimodal AI models into clinical +workflows. + +
+
+
+
+
+ + ♻ ☆ CURLing the Dream: Contrastive Representations for World Modeling in + Reinforcement Learning + + +
+ In this work, we present Curled-Dreamer, a novel reinforcement learning +algorithm that integrates contrastive learning into the DreamerV3 framework to +enhance performance in visual reinforcement learning tasks. By incorporating +the contrastive loss from the CURL algorithm and a reconstruction loss from +autoencoder, Curled-Dreamer achieves significant improvements in various +DeepMind Control Suite tasks. Our extensive experiments demonstrate that +Curled-Dreamer consistently outperforms state-of-the-art algorithms, achieving +higher mean and median scores across a diverse set of tasks. The results +indicate that the proposed approach not only accelerates learning but also +enhances the robustness of the learned policies. This work highlights the +potential of combining different learning paradigms to achieve superior +performance in reinforcement learning applications. + +
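The CURL-style contrastive term mentioned here is essentially an InfoNCE loss with a bilinear similarity between query and key encodings of two augmentations of the same observation; the sketch below is a generic version of that loss, not the authors' exact Curled-Dreamer objective.

import torch
import torch.nn.functional as F

def curl_infonce(z_q, z_k, W):
    """InfoNCE with bilinear similarity, as in CURL.
    z_q: (B, D) queries from augmentation 1; z_k: (B, D) keys from augmentation 2
    (keys typically come from a momentum encoder and are detached);
    W: (D, D) learnable bilinear matrix."""
    logits = z_q @ W @ z_k.t()                                  # similarity of every query/key pair
    logits = logits - logits.max(dim=1, keepdim=True).values   # numerical stability
    labels = torch.arange(z_q.size(0))                          # positives lie on the diagonal
    return F.cross_entropy(logits, labels)

B, D = 32, 50
loss = curl_infonce(torch.randn(B, D), torch.randn(B, D).detach(), torch.randn(D, D))
print(loss.item())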
+
+ comment: Paper accepted for 24th International Conference on Control, + Automation and Systems (ICCAS) +
+
+
+
+
+ + ♻ ☆ Explainable AI: Comparative Analysis of Normal and Dilated ResNet Models + for Fundus Disease Classification + + +
+ This paper presents dilated Residual Network (ResNet) models for disease +classification from retinal fundus images. Dilated convolution filters are used +to replace normal convolution filters in the higher layers of the ResNet model +(dilated ResNet) in order to improve the receptive field compared to the normal +ResNet model for disease classification. This study introduces +computer-assisted diagnostic tools that employ deep learning, enhanced with +explainable AI techniques. These techniques aim to make the tool's +decision-making process transparent, thereby enabling medical professionals to +understand and trust the AI's diagnostic decision. They are particularly +relevant in today's healthcare landscape, where there is a growing demand for +transparency in AI applications to ensure their reliability and ethical use. +The dilated ResNet is used as a replacement for the normal ResNet to enhance +the classification accuracy of retinal eye diseases and reduce the required +computing time. The dataset used in this work is the Ocular Disease Intelligent +Recognition (ODIR) dataset which is a structured ophthalmic database with eight +classes covering most of the common retinal eye diseases. The evaluation +metrics used in this work include precision, recall, accuracy, and F1 score. In +this work, a comparative study has been made between normal ResNet models and +dilated ResNet models on five variants namely ResNet-18, ResNet-34, ResNet-50, +ResNet-101, and ResNet-152. The dilated ResNet model shows promising results as +compared to normal ResNet with an average F1 score of 0.71, 0.70, 0.69, 0.67, +and 0.70 respectively for the above respective variants in ODIR multiclass +disease classification. + +
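In torchvision, swapping the stride of the deeper ResNet stages for dilation, which is one way to realize the "dilated ResNet" described here, is a one-line change; the variant, the 8-class head for ODIR, and the choice of dilated stages below are assumptions for illustration.

import torch
import torch.nn as nn
from torchvision.models import resnet50

# Dilate the last two stages instead of striding, enlarging the receptive field
# while keeping a higher-resolution feature map.
model = resnet50(weights=None, replace_stride_with_dilation=[False, True, True])
model.fc = nn.Linear(model.fc.in_features, 8)   # 8 ODIR disease classes (assumed head)

x = torch.randn(2, 3, 224, 224)
print(model(x).shape)   # torch.Size([2, 8])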
+
+ comment: Added authors' contributions +
+
+
+
+
+ + ♻ ☆ Convex Hull Prediction for Adaptive Video Streaming by Recurrent + Learning + + +
+ Adaptive video streaming relies on the construction of efficient bitrate +ladders to deliver the best possible visual quality to viewers under bandwidth +constraints. The traditional method of content dependent bitrate ladder +selection requires a video shot to be pre-encoded with multiple encoding +parameters to find the optimal operating points given by the convex hull of the +resulting rate-quality curves. However, this pre-encoding step is equivalent to +an exhaustive search process over the space of possible encoding parameters, +which causes significant overhead in terms of both computation and time +expenditure. To reduce this overhead, we propose a deep learning based method +of content aware convex hull prediction. We employ a recurrent convolutional +network (RCN) to implicitly analyze the spatiotemporal complexity of video +shots in order to predict their convex hulls. A two-step transfer learning +scheme is adopted to train our proposed RCN-Hull model, which ensures +sufficient content diversity to analyze scene complexity, while also making it +possible to capture the scene statistics of pristine source videos. Our +experimental results reveal that our proposed model yields better +approximations of the optimal convex hulls, and offers competitive time savings +as compared to existing approaches. On average, the pre-encoding time was +reduced by 53.8% by our method, while the average Bjontegaard delta bitrate +(BD-rate) of the predicted convex hulls against ground truth was 0.26%, and the +mean absolute deviation of the BD-rate distribution was 0.57%. + +
+
+
+
+
+ + ♻ ☆ Thinking Racial Bias in Fair Forgery Detection: Models, Datasets and + Evaluations + + +
+ Due to the successful development of deep image generation technology, +forgery detection plays a more important role in social and economic security. +Racial bias has not been explored thoroughly in the deep forgery detection +field. In the paper, we first contribute a dedicated dataset called the Fair +Forgery Detection (FairFD) dataset, where we prove the racial bias of public +state-of-the-art (SOTA) methods. Different from existing forgery detection +datasets, the self-constructed FairFD dataset contains a balanced racial ratio +and diverse forgery generation images with the largest-scale subjects. +Additionally, we identify the problems with naive fairness metrics when +benchmarking forgery detection models. To comprehensively evaluate fairness, we +design novel metrics including Approach Averaged Metric and Utility Regularized +Metric, which can avoid deceptive results. We also present an effective and +robust post-processing technique, Bias Pruning with Fair Activations (BPFA), +which improves fairness without requiring retraining or weight updates. +Extensive experiments conducted with 12 representative forgery detection models +demonstrate the value of the proposed dataset and the reasonability of the +designed fairness metrics. By applying the BPFA to the existing fairest +detector, we achieve a new SOTA. Furthermore, we conduct more in-depth analyses +to offer more insights to inspire researchers in the community. + +
+
+
+
+
+
+ ♻ ☆ S3E: A Multi-Robot Multimodal Dataset for Collaborative SLAM
+
+
+ The burgeoning demand for collaborative robotic systems to execute complex +tasks collectively has intensified the research community's focus on advancing +simultaneous localization and mapping (SLAM) in a cooperative context. Despite +this interest, the scalability and diversity of existing datasets for +collaborative trajectories remain limited, especially in scenarios with +constrained perspectives where the generalization capabilities of Collaborative +SLAM (C-SLAM) are critical for the feasibility of multi-agent missions. +Addressing this gap, we introduce S3E, an expansive multimodal dataset. +Captured by a fleet of unmanned ground vehicles traversing four distinct +collaborative trajectory paradigms, S3E encompasses 13 outdoor and 5 indoor +sequences. These sequences feature meticulously synchronized and spatially +calibrated data streams, including 360-degree LiDAR point cloud, +high-resolution stereo imagery, high-frequency inertial measurement units +(IMU), and Ultra-wideband (UWB) relative observations. Our dataset not only +surpasses previous efforts in scale, scene diversity, and data intricacy but +also provides a thorough analysis and benchmarks for both collaborative and +individual SLAM methodologies. For access to the dataset and the latest +information, please visit our repository at https://pengyu-team.github.io/S3E. + +
+
+
+
+
+ + ♻ ☆ CILF-CIAE: CLIP-driven Image-Language Fusion for Correcting Inverse Age + Estimation + + +
+ The age estimation task aims to predict the age of an individual by analyzing +facial features in an image. The development of age estimation can improve the +efficiency and accuracy of various applications (e.g., age verification and +secure access control, etc.). In recent years, contrastive language-image +pre-training (CLIP) has been widely used in various multimodal tasks and has +made some progress in the field of age estimation. However, existing CLIP-based +age estimation methods require high memory usage (quadratic complexity) when +globally modeling images, and lack an error feedback mechanism to prompt the +model about the quality of age prediction results. To tackle the above issues, +we propose a novel CLIP-driven Image-Language Fusion for Correcting Inverse Age +Estimation (CILF-CIAE). Specifically, we first introduce the CLIP model to +extract image features and text semantic information respectively, and map them +into a highly semantically aligned high-dimensional feature space. Next, we +designed a new Transformer architecture (i.e., FourierFormer) to achieve +channel evolution and spatial interaction of images, and to fuse image and text +semantic information. Compared with the quadratic complexity of the attention +mechanism, the proposed Fourierformer is of linear log complexity. To further +narrow the semantic gap between image and text features, we utilize an +efficient contrastive multimodal learning module that supervises the multimodal +fusion process of FourierFormer through contrastive loss for image-text +matching, thereby improving the interaction effect between different +modalities. Finally, we introduce reversible age estimation, which uses +end-to-end error feedback to reduce the error rate of age predictions. Through +extensive experiments on multiple data sets, CILF-CIAE has achieved better age +prediction results. + +
+
+ comment: 14 pages, 14 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Graph Information Bottleneck for Remote Sensing Segmentation + + +
+ Remote sensing segmentation has a wide range of applications in environmental +protection, and urban change detection, etc. Despite the success of deep +learning-based remote sensing segmentation methods (e.g., CNN and Transformer), +they are not flexible enough to model irregular objects. In addition, existing +graph contrastive learning methods usually adopt the way of maximizing mutual +information to keep the node representations consistent between different graph +views, which may cause the model to learn task-independent redundant +information. To tackle the above problems, this paper treats images as graph +structures and introduces a simple contrastive vision GNN (SC-ViG) architecture +for remote sensing segmentation. Specifically, we construct a node-masked and +edge-masked graph view to obtain an optimal graph structure representation, +which can adaptively learn whether to mask nodes and edges. Furthermore, this +paper innovatively introduces information bottleneck theory into graph +contrastive learning to maximize task-related information while minimizing +task-independent redundant information. Finally, we replace the convolutional +module in UNet with the SC-ViG module to complete the segmentation and +classification tasks of remote sensing images. Extensive experiments on +publicly available real datasets demonstrate that our method outperforms +state-of-the-art remote sensing image segmentation methods. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Palantir: Towards Efficient Super Resolution for Ultra-high-definition + Live Streaming + + +
+ Neural enhancement through super-resolution (SR) deep neural networks (DNNs) +opens up new possibilities for ultra-high-definition (UHD) live streaming over +existing encoding and networking infrastructure. Yet, the heavy SR DNN +inference overhead leads to severe deployment challenges. To reduce the +overhead, existing systems propose to apply DNN-based SR only on carefully +selected anchor frames while upscaling non-anchor frames via the lightweight +reusing-based SR approach. However, frame-level scheduling is coarse-grained +and fails to deliver optimal efficiency. In this work, we propose Palantir, the +first neural-enhanced UHD live streaming system with fine-grained patch-level +scheduling. Two novel techniques are incorporated into Palantir to select the +most beneficial anchor patches and support latency-sensitive UHD live streaming +applications. Firstly, under the guidance of our pioneering and theoretical +analysis, Palantir constructs a directed acyclic graph (DAG) for lightweight +yet accurate SR quality estimation under any possible anchor patch set. +Secondly, to further optimize the scheduling latency, Palantir improves +parallelizability by refactoring the computation subprocedure of the estimation +process into a sparse matrix-matrix multiplication operation. + The evaluation results suggest that Palantir incurs a negligible scheduling +latency accounting for less than 5.7% of the end-to-end latency requirement. +When compared to the naive method of applying DNN-based SR on all the frames, +Palantir can reduce the SR DNN inference overhead by 20 times (or 60 times) +while preserving 54.0-82.6% (or 32.8-64.0%) of the quality gain. When compared +to the state-of-the-art real-time frame-level scheduling strategy, Palantir can +reduce the SR DNN inference overhead by 80.1% at most (and 38.4% on average) +without sacrificing the video quality. + +
+
+
+
+
+ + ♻ ☆ Anticipating Future Object Compositions without Forgetting + + +
+ Despite the significant advancements in computer vision models, their ability +to generalize to novel object-attribute compositions remains limited. Existing +methods for Compositional Zero-Shot Learning (CZSL) mainly focus on image +classification. This paper aims to enhance CZSL in object detection without +forgetting prior learned knowledge. We use Grounding DINO and incorporate +Compositional Soft Prompting (CSP) into it and extend it with Compositional +Anticipation. We achieve a 70.5% improvement over CSP on the harmonic mean (HM) +between seen and unseen compositions on the CLEVR dataset. Furthermore, we +introduce Contrastive Prompt Tuning to incrementally address model confusion +between similar compositions. We demonstrate the effectiveness of this method +and achieve an increase of 14.5% in HM across the pretrain, increment, and +unseen sets. Collectively, these methods provide a framework for learning +various compositions with limited data, as well as improving the performance of +underperforming compositions when additional data becomes available. + +
+
+
+
+
+ + ♻ ☆ InfiniBench: A Comprehensive Benchmark for Large Multimodal Models in + Very Long Video Understanding + + +
+ Understanding long videos, ranging from tens of minutes to several hours,
+presents unique challenges in video comprehension. Despite the increasing
+importance of long-form video content, existing benchmarks primarily focus on
+shorter clips. To address this gap, we introduce InfiniBench, a comprehensive
+benchmark for very long video understanding, which presents: 1) the longest
+video duration, averaging 52.59 minutes per video; 2) the largest number of
+question-answer pairs, 108.2K; 3) diverse questions that examine nine different
+skills and include both multiple-choice and open-ended questions; and 4) a
+human-centric design, as the video sources come from movies and daily TV shows,
+with human-level question designs such as Movie Spoiler Questions that require
+critical thinking and comprehensive understanding. Using InfiniBench, we
+comprehensively evaluate existing Large Multi-Modality Models (LMMs) on each
+skill, including commercial models such as GPT-4o and Gemini 1.5 Flash as well
+as open-source models. The evaluation reveals the significant challenges posed
+by our benchmark: even leading models like GPT-4o and Gemini 1.5 Flash struggle
+to achieve high performance in long video understanding, with average
+accuracies of just 49.16\% and 42.72\%, and average scores of 3.22 and 2.71 out
+of 5, respectively. We hope this benchmark will stimulate the LMM community
+towards long video and human-level understanding. Our benchmark can be accessed
+at https://vision-cair.github.io/InfiniBench/
+
+ comment: 24 pages,25 figures +
+
+
+
+
+ + ♻ ☆ Translating Images to Road Network: A Sequence-to-Sequence Perspective ICCV 2023 + + +
+ The extraction of road network is essential for the generation of +high-definition maps since it enables the precise localization of road +landmarks and their interconnections. However, generating road network poses a +significant challenge due to the conflicting underlying combination of +Euclidean (e.g., road landmarks location) and non-Euclidean (e.g., road +topological connectivity) structures. Existing methods struggle to merge the +two types of data domains effectively, but few of them address it properly. +Instead, our work establishes a unified representation of both types of data +domain by projecting both Euclidean and non-Euclidean data into an integer +series called RoadNet Sequence. Further than modeling an auto-regressive +sequence-to-sequence Transformer model to understand RoadNet Sequence, we +decouple the dependency of RoadNet Sequence into a mixture of auto-regressive +and non-autoregressive dependency. Building on this, our proposed +non-autoregressive sequence-to-sequence approach leverages non-autoregressive +dependencies while fixing the gap towards auto-regressive dependencies, +resulting in success on both efficiency and accuracy. We further identify two +main bottlenecks in the current RoadNetTransformer on a non-overfitting split +of the dataset: poor landmark detection limited by the BEV Encoder and error +propagation to topology reasoning. Therefore, we propose Topology-Inherited +Training to inherit better topology knowledge into RoadNetTransformer. +Additionally, we collect SD-Maps from open-source map datasets and use this +prior information to significantly improve landmark detection and reachability. +Extensive experiments on nuScenes dataset demonstrate the superiority of +RoadNet Sequence representation and the non-autoregressive approach compared to +existing state-of-the-art alternatives. + +
+
+ comment: V1 is the ICCV 2023 conference version, and V2 is the extended + version +
+
+
+
+
+ + ♻ ☆ Geometric Prior Guided Feature Representation Learning for Long-Tailed + Classification + + +
+ Real-world data are long-tailed, the lack of tail samples leads to a +significant limitation in the generalization ability of the model. Although +numerous approaches of class re-balancing perform well for moderate class +imbalance problems, additional knowledge needs to be introduced to help the +tail class recover the underlying true distribution when the observed +distribution from a few tail samples does not represent its true distribution +properly, thus allowing the model to learn valuable information outside the +observed domain. In this work, we propose to leverage the geometric information +of the feature distribution of the well-represented head class to guide the +model to learn the underlying distribution of the tail class. Specifically, we +first systematically define the geometry of the feature distribution and the +similarity measures between the geometries, and discover four phenomena +regarding the relationship between the geometries of different feature +distributions. Then, based on four phenomena, feature uncertainty +representation is proposed to perturb the tail features by utilizing the +geometry of the head class feature distribution. It aims to make the perturbed +features cover the underlying distribution of the tail class as much as +possible, thus improving the model's generalization performance in the test +domain. Finally, we design a three-stage training scheme enabling feature +uncertainty modeling to be successfully applied. Experiments on +CIFAR-10/100-LT, ImageNet-LT, and iNaturalist2018 show that our proposed +approach outperforms other similar methods on most metrics. In addition, the +experimental phenomena we discovered are able to provide new perspectives and +theoretical foundations for subsequent studies. + +
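One simple reading of "perturbing tail features with the geometry of the head-class distribution" is to sample perturbation noise from the head class's empirical covariance; the NumPy sketch below illustrates that idea and is an interpretation under that assumption, not the paper's exact uncertainty-representation scheme.

import numpy as np

rng = np.random.default_rng(0)
head = rng.normal(size=(5000, 64)) @ rng.normal(size=(64, 64))  # many head-class features
tail = rng.normal(size=(20, 64))                                # few tail-class features

# Geometry of the head distribution: empirical covariance of its features.
cov_head = np.cov(head, rowvar=False)

# Perturb each tail feature with noise drawn from that geometry, so augmented
# tail samples spread out the way a well-represented class does.
noise = rng.multivariate_normal(np.zeros(64), cov_head, size=(10, tail.shape[0]))
augmented_tail = (tail[None, :, :] + 0.1 * noise).reshape(-1, 64)  # 10 perturbed copies
print(augmented_tail.shape)   # (200, 64)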
+
+ comment: This work was accepted by the IJCV 2024 +
+
+
+
+
+ + ♻ ☆ xGen-VideoSyn-1: High-fidelity Text-to-Video Synthesis with Compressed + Representations ECCV24 + + +
+ We present xGen-VideoSyn-1, a text-to-video (T2V) generation model capable of +producing realistic scenes from textual descriptions. Building on recent +advancements, such as OpenAI's Sora, we explore the latent diffusion model +(LDM) architecture and introduce a video variational autoencoder (VidVAE). +VidVAE compresses video data both spatially and temporally, significantly +reducing the length of visual tokens and the computational demands associated +with generating long-sequence videos. To further address the computational +costs, we propose a divide-and-merge strategy that maintains temporal +consistency across video segments. Our Diffusion Transformer (DiT) model +incorporates spatial and temporal self-attention layers, enabling robust +generalization across different timeframes and aspect ratios. We have devised a +data processing pipeline from the very beginning and collected over 13M +high-quality video-text pairs. The pipeline includes multiple steps such as +clipping, text detection, motion estimation, aesthetics scoring, and dense +captioning based on our in-house video-LLM model. Training the VidVAE and DiT +models required approximately 40 and 642 H100 days, respectively. Our model +supports over 14-second 720p video generation in an end-to-end way and +demonstrates competitive performance against state-of-the-art T2V models. + +
+
+ comment: Accepted by ECCV24 AI4VA +
+
+
+
+
+ + ♻ ☆ CMAB: A First National-Scale Multi-Attribute Building Dataset in China + Derived from Open Source Data and GeoAI + + +
+ Rapidly acquiring three-dimensional (3D) building data, including geometric +attributes like rooftop, height and orientations, as well as indicative +attributes like function, quality, and age, is essential for accurate urban +analysis, simulations, and policy updates. Current building datasets suffer +from incomplete coverage of building multi-attributes. This paper introduces a +geospatial artificial intelligence (GeoAI) framework for large-scale building +modeling, presenting the first national-scale Multi-Attribute Building dataset +(CMAB), covering 3,667 spatial cities, 29 million buildings, and 21.3 billion +square meters of rooftops with an F1-Score of 89.93% in OCRNet-based +extraction, totaling 337.7 billion cubic meters of building stock. We trained +bootstrap aggregated XGBoost models with city administrative classifications, +incorporating features such as morphology, location, and function. Using +multi-source data, including billions of high-resolution Google Earth images +and 60 million street view images (SVIs), we generated rooftop, height, +function, age, and quality attributes for each building. Accuracy was validated +through model benchmarks, existing similar products, and manual SVI validation, +mostly above 80%. Our dataset and results are crucial for global SDGs and urban +planning. + +
+
+ comment: 43 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video + Anomaly Detection WACV + + +
+ Skeleton-based video anomaly detection (SVAD) is a crucial task in computer +vision. Accurately identifying abnormal patterns or events enables operators to +promptly detect suspicious activities, thereby enhancing safety. Achieving this +demands a comprehensive understanding of human motions, both at body and region +levels, while also accounting for the wide variations of performing a single +action. However, existing studies fail to simultaneously address these crucial +properties. This paper introduces a novel, practical and lightweight framework, +namely Graph-Jigsaw Conditioned Diffusion Model for Skeleton-based Video +Anomaly Detection (GiCiSAD) to overcome the challenges associated with SVAD. +GiCiSAD consists of three novel modules: the Graph Attention-based Forecasting +module to capture the spatio-temporal dependencies inherent in the data, the +Graph-level Jigsaw Puzzle Maker module to distinguish subtle region-level +discrepancies between normal and abnormal motions, and the Graph-based +Conditional Diffusion model to generate a wide spectrum of human motions. +Extensive experiments on four widely used skeleton-based video datasets show +that GiCiSAD outperforms existing methods with significantly fewer training +parameters, establishing it as the new state-of-the-art. + +
+
+ comment: Accepted at the Winter Conference on Applications of Computer Vision + (WACV). 17 pages, 6 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ A Survey for Foundation Models in Autonomous Driving + + +
+ The advent of foundation models has revolutionized the fields of natural +language processing and computer vision, paving the way for their application +in autonomous driving (AD). This survey presents a comprehensive review of more +than 40 research papers, demonstrating the role of foundation models in +enhancing AD. Large language models contribute to planning and simulation in +AD, particularly through their proficiency in reasoning, code generation and +translation. In parallel, vision foundation models are increasingly adapted for +critical tasks such as 3D object detection and tracking, as well as creating +realistic driving scenarios for simulation and testing. Multi-modal foundation +models, integrating diverse inputs, exhibit exceptional visual understanding +and spatial reasoning, crucial for end-to-end AD. This survey not only provides +a structured taxonomy, categorizing foundation models based on their modalities +and functionalities within the AD domain but also delves into the methods +employed in current research. It identifies the gaps between existing +foundation models and cutting-edge AD approaches, thereby charting future +research directions and proposing a roadmap for bridging these gaps. + +
+
+
+
+
+
+
+
+ + Information Retrieval 3 + +
+
+
+ + ☆ PSLF: A PID Controller-incorporated Second-order Latent Factor Analysis + Model for Recommender System + + +
+ A second-order-based latent factor (SLF) analysis model demonstrates superior +performance in graph representation learning, particularly for high-dimensional +and incomplete (HDI) interaction data, by incorporating the curvature +information of the loss landscape. However, its objective function is commonly +bi-linear and non-convex, causing the SLF model to suffer from a low +convergence rate. To address this issue, this paper proposes a PID +controller-incorporated SLF (PSLF) model, leveraging two key strategies: a) +refining learning error estimation by incorporating the PID controller +principles, and b) acquiring second-order information insights through +Hessian-vector products. Experimental results on multiple HDI datasets indicate +that the proposed PSLF model outperforms four state-of-the-art latent factor +models based on advanced optimizers regarding convergence rates and +generalization performance. + +
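The idea of refining the learning error with PID-controller principles can be illustrated on a plain SGD-style latent-factor update: instead of the instantaneous prediction error alone, the update is driven by a proportional-integral-derivative combination of the error signal. The gains, toy data, and update rule below are illustrative assumptions, not the PSLF model itself.

import numpy as np

rng = np.random.default_rng(0)
n_users, n_items, k = 50, 40, 8
R = rng.random((n_users, n_items))            # toy HDI-style interaction matrix
mask = rng.random((n_users, n_items)) < 0.1   # only ~10% of entries observed
U = 0.1 * rng.standard_normal((n_users, k))
V = 0.1 * rng.standard_normal((n_items, k))

Kp, Ki, Kd, lr, lam = 1.0, 0.01, 0.1, 0.02, 0.05
integral = np.zeros_like(R)
prev_err = np.zeros_like(R)

for epoch in range(30):
    err = np.where(mask, R - U @ V.T, 0.0)                       # instantaneous (proportional) error
    integral += err                                              # accumulated (integral) term
    pid_err = Kp * err + Ki * integral + Kd * (err - prev_err)   # PID-refined error signal
    prev_err = err
    U += lr * (pid_err @ V - lam * U)                            # gradient-style updates driven by PID error
    V += lr * (pid_err.T @ U - lam * V)

print("observed RMSE:", float(np.sqrt((err[mask] ** 2).mean())))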
+
+
+
+
+ + ☆ An Enhanced Batch Query Architecture in Real-time Recommendation CIKM 2024 + + +
+ In industrial recommendation systems on websites and apps, it is essential to +recall and predict top-n results relevant to user interests from a content pool +of billions within milliseconds. To cope with continuous data growth and +improve real-time recommendation performance, we have designed and implemented +a high-performance batch query architecture for real-time recommendation +systems. Our contributions include optimizing hash structures with a +cacheline-aware probing method to enhance coalesced hashing, as well as the +implementation of a hybrid storage key-value service built upon it. Our +experiments indicate this approach significantly surpasses conventional hash +tables in batch query throughput, achieving up to 90% of the query throughput +of random memory access when incorporating parallel optimization. The support +for NVMe, integrating two-tier storage for hot and cold data, notably reduces +resource consumption. Additionally, the system facilitates dynamic updates, +automated sharding of attributes and feature embedding tables, and introduces +innovative protocols for consistency in batch queries, thereby enhancing the +effectiveness of real-time incremental learning updates. This architecture has +been deployed and in use in the bilibili recommendation system for over a year, +a video content community with hundreds of millions of users, supporting 10x +increase in model computation with minimal resource growth, improving outcomes +while preserving the system's real-time performance. + +
+
+ comment: 8 pages, 10 figures, CIKM 2024 Applied Research Paper +
+
+
+
+
+ + ♻ ☆ Decoding Knowledge Claims: The Evaluation of Scientific Publication + Contributions through Semantic Analysis + + +
+ The surge in scientific publications challenges the use of publication counts +as a measure of scientific progress, requiring alternative metrics that +emphasize the quality and novelty of scientific contributions rather than sheer +quantity. This paper proposes the use of Relaxed Word Mover's Distance (RWMD), +a semantic text similarity measure, to evaluate the novelty of scientific +papers. We hypothesize that RWMD can more effectively gauge the growth of +scientific knowledge. To test such an assumption, we apply RWMD to evaluate +seminal papers, with Hirsch's H-Index paper as a primary case study. We compare +RWMD results across three groups: 1) H-Index-related papers, 2) scientometric +studies, and 3) unrelated papers, aiming to discern redundant literature and +hype from genuine innovations. Findings suggest that emphasizing knowledge +claims offers a deeper insight into scientific contributions, marking RWMD as a +promising alternative method to traditional citation metrics, thus better +tracking significant scientific breakthroughs. + +
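Relaxed Word Mover's Distance drops the one-to-one transport constraint of WMD: each word in one document is matched to its nearest word in the other, and the final score is the larger of the two directed relaxations. The small NumPy sketch below shows that computation; the random embeddings and uniform word weights are placeholders.

import numpy as np

def rwmd(emb_a, emb_b, w_a=None, w_b=None):
    """Relaxed Word Mover's Distance between two documents.
    emb_a: (n, d) word embeddings of doc A; emb_b: (m, d) of doc B;
    w_a / w_b: normalized word weights (uniform if None)."""
    w_a = np.full(len(emb_a), 1 / len(emb_a)) if w_a is None else w_a
    w_b = np.full(len(emb_b), 1 / len(emb_b)) if w_b is None else w_b
    dists = np.linalg.norm(emb_a[:, None, :] - emb_b[None, :, :], axis=-1)  # (n, m)
    a_to_b = float(w_a @ dists.min(axis=1))   # each A-word moves to its closest B-word
    b_to_a = float(w_b @ dists.min(axis=0))   # each B-word moves to its closest A-word
    return max(a_to_b, b_to_a)                # tighter of the two lower bounds on WMD

rng = np.random.default_rng(0)
print(rwmd(rng.normal(size=(6, 50)), rng.normal(size=(9, 50))))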
+
+
+
+
+
+
+
+ + Machine Learning 22 + +
+
+
+ + ♻ ☆ Adversarial Domain Adaptation for Cross-user Activity Recognition Using + Diffusion-based Noise-centred Learning + + +
+ Human Activity Recognition (HAR) plays a crucial role in various applications +such as human-computer interaction and healthcare monitoring. However, +challenges persist in HAR models due to the data distribution differences +between training and real-world data distributions, particularly evident in +cross-user scenarios. This paper introduces a novel framework, termed +Diffusion-based Noise-centered Adversarial Learning Domain Adaptation +(Diff-Noise-Adv-DA), designed to address these challenges by leveraging +generative diffusion modeling and adversarial learning techniques. Traditional +HAR models often struggle with the diversity of user behaviors and sensor data +distributions. Diff-Noise-Adv-DA innovatively integrates the inherent noise +within diffusion models, harnessing its latent information to enhance domain +adaptation. Specifically, the framework transforms noise into a critical +carrier of activity and domain class information, facilitating robust +classification across different user domains. Experimental evaluations +demonstrate the effectiveness of Diff-Noise-Adv-DA in improving HAR model +performance across different users, surpassing traditional domain adaptation +methods. The framework not only mitigates distribution mismatches but also +enhances data quality through noise-based denoising techniques. + +
+
+
+
+
+ + ♻ ☆ CURLing the Dream: Contrastive Representations for World Modeling in + Reinforcement Learning + + +
+ In this work, we present Curled-Dreamer, a novel reinforcement learning +algorithm that integrates contrastive learning into the DreamerV3 framework to +enhance performance in visual reinforcement learning tasks. By incorporating +the contrastive loss from the CURL algorithm and a reconstruction loss from +autoencoder, Curled-Dreamer achieves significant improvements in various +DeepMind Control Suite tasks. Our extensive experiments demonstrate that +Curled-Dreamer consistently outperforms state-of-the-art algorithms, achieving +higher mean and median scores across a diverse set of tasks. The results +indicate that the proposed approach not only accelerates learning but also +enhances the robustness of the learned policies. This work highlights the +potential of combining different learning paradigms to achieve superior +performance in reinforcement learning applications. + +
+
+ comment: Paper accepted for 24th International Conference on Control, + Automation and Systems (ICCAS) +
+
+
+
+
+ + ♻ ☆ Online-Score-Aided Federated Learning: Taming the Resource Constraints + in Wireless Networks + + +
+ While FL is a widely popular distributed ML strategy that protects data +privacy, time-varying wireless network parameters and heterogeneous system +configurations of the wireless device pose significant challenges. Although the +limited radio and computational resources of the network and the clients, +respectively, are widely acknowledged, two critical yet often ignored aspects +are (a) wireless devices can only dedicate a small chunk of their limited +storage for the FL task and (b) new training samples may arrive in an online +manner in many practical wireless applications. Therefore, we propose a new FL +algorithm called OSAFL, specifically designed to learn tasks relevant to +wireless applications under these practical considerations. Since it has long +been proven that under extreme resource constraints, clients may perform an +arbitrary number of local training steps, which may lead to client drift under +statistically heterogeneous data distributions, we leverage normalized gradient +similarities and exploit weighting clients' updates based on optimized scores +that facilitate the convergence rate of the proposed OSAFL algorithm. Our +extensive simulation results on two different tasks -- each with three +different datasets -- with four popular ML models validate the effectiveness of +OSAFL compared to six existing state-of-the-art FL baselines. + +
+
+ comment: Under review for possible publication in IEEE Transactions on + Communications +
+
+
+
+
+ + ♻ ☆ Kolmogorov-Arnold Network for Online Reinforcement Learning + + +
+ Kolmogorov-Arnold Networks (KANs) have shown potential as an alternative to +Multi-Layer Perceptrons (MLPs) in neural networks, providing universal function +approximation with fewer parameters and reduced memory usage. In this paper, we +explore the use of KANs as function approximators within the Proximal Policy +Optimization (PPO) algorithm. We evaluate this approach by comparing its +performance to the original MLP-based PPO using the DeepMind Control Proprio +Robotics benchmark. Our results indicate that the KAN-based reinforcement +learning algorithm can achieve comparable performance to its MLP-based +counterpart, often with fewer parameters. These findings suggest that KANs may +offer a more efficient option for reinforcement learning models. + +
+
+ comment: Paper accepted at 24th International Conference on Control, + Automation and Systems (ICCAS) +
+
+
+
+
+ + ♻ ☆ Diffusion Explainer: Visual Explanation for Text-to-image Stable + Diffusion + + +
+ Diffusion-based generative models' impressive ability to create convincing +images has garnered global attention. However, their complex structures and +operations often pose challenges for non-experts to grasp. We present Diffusion +Explainer, the first interactive visualization tool that explains how Stable +Diffusion transforms text prompts into images. Diffusion Explainer tightly +integrates a visual overview of Stable Diffusion's complex structure with +explanations of the underlying operations. By comparing image generation of +prompt variants, users can discover the impact of keyword changes on image +generation. A 56-participant user study demonstrates that Diffusion Explainer +offers substantial learning benefits to non-experts. Our tool has been used by +over 10,300 users from 124 countries at +https://poloclub.github.io/diffusion-explainer/. + +
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Do Concept Bottleneck Models Respect Localities? + + +
+ Concept-based methods explain model predictions using human-understandable +concepts. These models require accurate concept predictors, yet the +faithfulness of existing concept predictors to their underlying concepts is +unclear. In this paper, we investigate the faithfulness of Concept Bottleneck +Models (CBMs), a popular family of concept-based architectures, by looking at +whether they respect "localities" in datasets. Localities involve using only +relevant features when predicting a concept's value. When localities are not +considered, concepts may be predicted based on spuriously correlated features, +degrading performance and robustness. This work examines how CBM predictions +change when perturbing model inputs, and reveals that CBMs may not capture +localities, even when independent concepts are localised to non-overlapping +feature subsets. Our empirical and theoretical results demonstrate that +datasets with correlated concepts may lead to accurate but uninterpretable +models that fail to learn localities. Overall, we find that CBM +interpretability is fragile, as CBMs occasionally rely upon spurious features, +necessitating further research into the robustness of concept predictors. + +
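A locality check of the kind described can be phrased as: perturb only the input features that are irrelevant to a concept and see whether that concept's prediction moves. The sketch below runs this test on a toy linear concept predictor; the feature split, predictor, and perturbation scale are illustrative assumptions.

import numpy as np

rng = np.random.default_rng(0)
n, d = 1000, 20
relevant = np.arange(5)                  # features the concept should depend on
X = rng.normal(size=(n, d))
w_true = np.zeros(d)
w_true[relevant] = 1.0
c = (X @ w_true > 0).astype(float)       # ground-truth concept value

# A (deliberately) non-local concept predictor that also uses irrelevant features.
w_hat = w_true + 0.5 * rng.normal(size=d) * (w_true == 0)
predict = lambda X_: (X_ @ w_hat > 0).astype(float)

# Locality test: perturb only the irrelevant features and measure prediction flips.
X_pert = X.copy()
X_pert[:, 5:] += rng.normal(scale=3.0, size=(n, d - 5))
flip_rate = float((predict(X) != predict(X_pert)).mean())
print(f"concept predictions changed on {flip_rate:.1%} of inputs "
      "despite only irrelevant features being perturbed")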
+
+ comment: Previous Version Accepted at NeurIPs 23 XAI in Action Workshop +
+
+
+
+
+ + ♻ ☆ Discovery of Small Ultra-short-period Planets Orbiting KG Dwarfs in + Kepler Survey Using GPU Phase Folding and Deep Learning Detection System + + +
+ Since the discovery of the first hot Jupiter orbiting a solar-type star, 51 +Peg, in 1995, more than 4000 exoplanets have been identified using various +observational techniques. The formation process of these sub-Earths remains +elusive, and acquiring additional samples is essential for investigating this +unique population. In our study, we employ a novel GPU Phase Folding algorithm +combined with a Convolutional Neural Network, termed the GPFC method, on Kepler +photometry data. This method enhances the transit search speed significantly +over the traditional Box-fitting Least Squares method, allowing a complete +search of the known KOI photometry data within hours using a commercial GPU +card. To date, we have identified five promising sub-Earth short-period +candidates: K00446.c, K01821.b, K01522.c, K03404.b, and K04978.b. A closer +analysis reveals the following characteristics: K00446.c orbits a K dwarf on a +0.645091-day period. With a radius of $0.461R_\oplus$, it ranks as the second +smallest USP discovered to date. K01821.b is a sub-Earth with a radius of +$0.648R_\oplus$, orbiting a G dwarf over a 0.91978-day period. It is the second +smallest USP among all confirmed USPs orbiting G dwarfs in the NASA Archive. +K01522.c has a radius of $0.704 R_\oplus$ and completes an orbit around a +Sun-like G dwarf in 0.64672 days; K03404.b, with a radius of $0.738 R_\oplus$, +orbits a G dwarf on a 0.68074-day period; and K04978.b, with its planetary +radius of $0.912 R_\oplus$, orbits a G dwarf, completing an orbit every 0.94197 +days. Three of our finds, K01821.b, K01522.c and K03404.b, rank as the smallest +planets among all confirmed USPs orbiting G dwarfs in the Kepler dataset. The +discovery of these small exoplanets underscores the promising capability of the +GPFC method for searching for small, new transiting exoplanets in photometry +data from Kepler, TESS, and future space transit missions. + +
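Phase folding itself, the operation the GPU pipeline accelerates before a CNN sees the folded light curve, is simple to express; the sketch below folds a toy light curve at a trial period and bins it with NumPy, as a CPU analogue of the operation rather than the authors' GPU kernel.

import numpy as np

rng = np.random.default_rng(0)
t = np.sort(rng.uniform(0, 90, 20000))          # days of (toy) photometry
flux = 1 + 1e-4 * rng.standard_normal(t.size)
period, depth, dur = 0.645091, 5e-4, 0.04       # injected USP-like transit (assumed values)
in_transit = ((t % period) / period) < (dur / period)
flux[in_transit] -= depth

def fold_and_bin(t, flux, trial_period, n_bins=200):
    phase = (t % trial_period) / trial_period    # fold onto [0, 1)
    bins = np.minimum((phase * n_bins).astype(int), n_bins - 1)
    binned = np.bincount(bins, weights=flux, minlength=n_bins)
    counts = np.bincount(bins, minlength=n_bins)
    return binned / np.maximum(counts, 1)        # mean flux per phase bin

folded = fold_and_bin(t, flux, period)
print("deepest bin depth ~", float(1 - folded.min()))  # recovers ~5e-4 at the right period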
+
+ comment: 24 pages, 40 figures; To be published in the Monthly Notices of the + Royal Astronomical Society (MNRAS) +
+
+
+
+
+ + ♻ ☆ Convex Hull Prediction for Adaptive Video Streaming by Recurrent + Learning + + +
+ Adaptive video streaming relies on the construction of efficient bitrate +ladders to deliver the best possible visual quality to viewers under bandwidth +constraints. The traditional method of content dependent bitrate ladder +selection requires a video shot to be pre-encoded with multiple encoding +parameters to find the optimal operating points given by the convex hull of the +resulting rate-quality curves. However, this pre-encoding step is equivalent to +an exhaustive search process over the space of possible encoding parameters, +which causes significant overhead in terms of both computation and time +expenditure. To reduce this overhead, we propose a deep learning based method +of content aware convex hull prediction. We employ a recurrent convolutional +network (RCN) to implicitly analyze the spatiotemporal complexity of video +shots in order to predict their convex hulls. A two-step transfer learning +scheme is adopted to train our proposed RCN-Hull model, which ensures +sufficient content diversity to analyze scene complexity, while also making it +possible to capture the scene statistics of pristine source videos. Our +experimental results reveal that our proposed model yields better +approximations of the optimal convex hulls, and offers competitive time savings +as compared to existing approaches. On average, the pre-encoding time was +reduced by 53.8% by our method, while the average Bjontegaard delta bitrate +(BD-rate) of the predicted convex hulls against ground truth was 0.26%, and the +mean absolute deviation of the BD-rate distribution was 0.57%. + +
+
+
+
+
+ + ♻ ☆ Public Transit Arrival Prediction: a Seq2Seq RNN Approach + + +
+ Arrival/Travel times for public transit exhibit variability on account of +factors like seasonality, dwell times at bus stops, traffic signals, travel +demand fluctuation etc. The developing world in particular is plagued by +additional factors like lack of lane discipline, excess vehicles, diverse modes +of transport and so on. This renders the bus arrival time prediction (BATP) to +be a challenging problem especially in the developing world. A novel +data-driven model based on recurrent neural networks (RNNs) is proposed for +BATP (in real-time) in the current work. The model intelligently incorporates +both spatial and temporal correlations in a unique (non-linear) fashion +distinct from existing approaches. In particular, we propose a Gated Recurrent +Unit (GRU) based Encoder-Decoder(ED) OR Seq2Seq RNN model (originally +introduced for language translation) for BATP. The geometry of the dynamic real +time BATP problem enables a nice fit with the Encoder-Decoder based RNN +structure. We feed relevant additional synchronized inputs (from previous +trips) at each step of the decoder (a feature classically unexplored in machine +translation applications). Further motivated from accurately modelling +congestion influences on travel time prediction, we additionally propose to use +a bidirectional layer at the decoder (something unexplored in other time-series +based ED application contexts). The effectiveness of the proposed algorithms is +demonstrated on real field data collected from challenging traffic conditions. +Our experiments indicate that the proposed method outperforms diverse existing +state-of-art data-driven approaches proposed for the same problem. + +
+
+
+
+
+ + ♻ ☆ Is There No Such Thing as a Bad Question? H4R: HalluciBot For + Ratiocination, Rewriting, Ranking, and Routing + + +
+ Hallucination continues to be one of the most critical challenges in the +institutional adoption journey of Large Language Models (LLMs). While prior +studies have primarily focused on the post-generation analysis and refinement +of outputs, this paper centers on the effectiveness of queries in eliciting +accurate responses from LLMs. We present HalluciBot, a model that estimates the +query's propensity to hallucinate before generation, without invoking any LLMs +during inference. HalluciBot can serve as a proxy reward model for query +rewriting, offering a general framework to estimate query quality based on +accuracy and consensus. In essence, HalluciBot investigates how poorly +constructed queries can lead to erroneous outputs - moreover, by employing +query rewriting guided by HalluciBot's empirical estimates, we demonstrate that +95.7% output accuracy can be achieved for Multiple Choice questions. The +training procedure for HalluciBot consists of perturbing 369,837 queries n +times, employing n+1 independent LLM agents, sampling an output from each +query, conducting a Multi-Agent Monte Carlo simulation on the sampled outputs, +and training an encoder classifier. The idea of perturbation is the outcome of +our ablation studies that measures the increase in output diversity (+12.5 +agreement spread) by perturbing a query in lexically different but semantically +similar ways. Therefore, HalluciBot paves the way to ratiocinate (76.0% test F1 +score, 46.6% in saved computation on hallucinatory queries), rewrite (+30.2% +positive class transition from hallucinatory to non-hallucinatory), rank +(+50.6% positive class transition from hallucinatory to non-hallucinatory), and +route queries to effective pipelines. + +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models for Health-related Queries with + Presuppositions ACL 2024 + + +
+ As corporations rush to integrate large language models (LLMs) to their +search offerings, it is critical that they provide factually accurate +information that is robust to any presuppositions that a user may express. In +this work, we introduce UPHILL, a dataset consisting of health-related queries +with varying degrees of presuppositions. Using UPHILL, we evaluate the factual +accuracy and consistency of InstructGPT, ChatGPT, and BingChat models. We find +that while model responses rarely disagree with true health claims (posed as +questions), they often fail to challenge false claims: responses from +InstructGPT agree with 32% of the false claims, ChatGPT 26% and BingChat 23%. +As we increase the extent of presupposition in input queries, the responses +from InstructGPT and ChatGPT agree with the claim considerably more often, +regardless of its veracity. Responses from BingChat, which rely on retrieved +webpages, are not as susceptible. Given the moderate factual accuracy, and the +inability of models to consistently correct false assumptions, our work calls +for a careful assessment of current LLMs for use in high-stakes scenarios. + +
+
+ comment: Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Towards a theory of how the structure of language is acquired by deep + neural networks + + +
+ How much data is required to learn the structure of a language via next-token +prediction? We study this question for synthetic datasets generated via a +Probabilistic Context-Free Grammar (PCFG) -- a tree-like generative model that +captures many of the hierarchical structures found in natural languages. We +determine token-token correlations analytically in our model and show that they +can be used to build a representation of the grammar's hidden variables, the +longer the range the deeper the variable. In addition, a finite training set +limits the resolution of correlations to an effective range, whose size grows +with that of the training set. As a result, a Language Model trained with +increasingly many examples can build a deeper representation of the grammar's +structure, thus reaching good performance despite the high dimensionality of +the problem. We conjecture that the relationship between training set size and +effective range of correlations holds beyond our synthetic datasets. In +particular, our conjecture predicts how the scaling law for the test loss +behaviour with training set size depends on the length of the context window, +which we confirm empirically in Shakespeare's plays and Wikipedia articles. + +
+
+ comment: 9 pages, 4 figures (main) +
+
+
+
+
+ + ♻ ☆ Enhanced Federated Optimization: Adaptive Unbiased Client Sampling with + Reduced Variance + + +
+ Federated Learning (FL) is a distributed learning paradigm to train a global +model across multiple devices without collecting local data. In FL, a server +typically selects a subset of clients for each training round to optimize +resource usage. Central to this process is the technique of unbiased client +sampling, which ensures a representative selection of clients. Current methods +primarily utilize a random sampling procedure which, despite its effectiveness, +achieves suboptimal efficiency owing to the loose upper bound caused by the +sampling variance. In this work, by adopting an independent sampling procedure, +we propose a federated optimization framework focused on adaptive unbiased +client sampling, improving the convergence rate via an online variance +reduction strategy. In particular, we present the first adaptive client +sampler, K-Vib, employing an independent sampling procedure. K-Vib achieves a +linear speed-up on the regret bound +$\tilde{\mathcal{O}}\big(N^{\frac{1}{3}}T^{\frac{2}{3}}/K^{\frac{4}{3}}\big)$ +within a set communication budget $K$. Empirical studies indicate that K-Vib +doubles the speed compared to baseline algorithms, demonstrating significant +potential in federated optimization. + +
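Unbiased aggregation under independent, non-uniform client sampling typically relies on inverse-probability weighting: each sampled client's update is scaled by 1/p_i so the aggregate matches the full-participation average in expectation. The sketch below demonstrates that estimator; the probabilities and "updates" are synthetic, and the adaptive tuning of probabilities that K-Vib performs is not shown.

import numpy as np

rng = np.random.default_rng(0)
N, d = 100, 10
updates = rng.normal(size=(N, d))                 # one pseudo-gradient per client
p = rng.uniform(0.05, 0.5, size=N)                # independent inclusion probabilities

full_avg = updates.mean(axis=0)                   # target: full-participation average

est = np.zeros(d)
rounds = 5000
for _ in range(rounds):
    sampled = rng.random(N) < p                   # independent Bernoulli sampling
    # Horvitz-Thompson style estimator: reweight each sampled update by 1/p_i.
    est += (updates[sampled] / p[sampled, None]).sum(axis=0) / N
est /= rounds

print("bias of the estimator:", float(np.abs(est - full_avg).max()))  # ~0 over many rounds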
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ An experimental evaluation of Deep Reinforcement Learning algorithms for + HVAC control + + +
+ Heating, Ventilation, and Air Conditioning (HVAC) systems are a major driver +of energy consumption in commercial and residential buildings. Recent studies +have shown that Deep Reinforcement Learning (DRL) algorithms can outperform +traditional reactive controllers. However, DRL-based solutions are generally +designed for ad hoc setups and lack standardization for comparison. To fill +this gap, this paper provides a critical and reproducible evaluation, in terms +of comfort and energy consumption, of several state-of-the-art DRL algorithms +for HVAC control. The study examines the controllers' robustness, adaptability, +and trade-off between optimization goals by using the Sinergym framework. The +results obtained confirm the potential of DRL algorithms, such as SAC and TD3, +in complex scenarios and reveal several challenges related to generalization +and incremental learning. + +
+
+
+
+
+
+ ♻ ☆ Second-Order Fine-Tuning without Pain for LLMs: A Hessian Informed
+ Zeroth-Order Optimizer
+
+
+ Fine-tuning large language models (LLMs) with classic first-order optimizers +entails prohibitive GPU memory due to the backpropagation process. Recent works +have turned to zeroth-order optimizers for fine-tuning, which save substantial +memory by using two forward passes. However, these optimizers are plagued by +the heterogeneity of parameter curvatures across different dimensions. In this +work, we propose HiZOO, a diagonal Hessian informed zeroth-order optimizer +which is the first work to leverage the diagonal Hessian to enhance +zeroth-order optimizer for fine-tuning LLMs. What's more, HiZOO avoids the +expensive memory cost and only increases one forward pass per step. Extensive +experiments on various models (350M~66B parameters) indicate that HiZOO +improves model convergence, significantly reducing training steps and +effectively enhancing model accuracy. Moreover, we visualize the optimization +trajectories of HiZOO on test functions, illustrating its effectiveness in +handling heterogeneous curvatures. Lastly, we provide theoretical proofs of +convergence for HiZOO. Code is publicly available at +https://anonymous.4open.science/r/HiZOO27F8. + +
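The two-forward-pass idea behind this family of optimizers is a central-difference (SPSA-style) gradient estimate along a random direction; the sketch below shows the plain zeroth-order step on a toy objective. HiZOO's diagonal-Hessian preconditioning is the paper's addition and is not reproduced here; the toy loss and hyperparameters are assumptions.

import numpy as np

rng = np.random.default_rng(0)

def loss(theta):                     # stand-in for a forward pass returning the training loss
    return float(np.sum((theta - 3.0) ** 2))

theta = np.zeros(50)                 # in fine-tuning this would be the model parameters
mu, lr = 1e-3, 1e-2
for step in range(2000):
    z = rng.standard_normal(theta.shape)              # random perturbation direction
    # Two forward passes only -- no backpropagation, so no activation/gradient memory.
    g_scalar = (loss(theta + mu * z) - loss(theta - mu * z)) / (2 * mu)
    theta -= lr * g_scalar * z                        # projected-gradient update along z

print("final loss:", loss(theta))                     # decreases toward 0 on this toy problem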
+
+
+
+
+ + ♻ ☆ A rapid approach to urban traffic noise mapping with a generative + adversarial network + + +
+ With rapid urbanisation and the accompanying increase in traffic density,
+traffic noise has become a major concern in urban planning. However,
+traditional grid noise mapping methods have limitations in terms of time
+consumption, software costs, and a lack of parameter integration interfaces.
+These limitations hinder their ability to meet the need for iterative updates
+and rapid performance feedback in the early design stages of street-scale urban
+planning. Herein, we developed a rapid urban traffic noise mapping technique
+that leverages generative adversarial networks (GANs) as a surrogate model.
+This approach enables the rapid assessment of urban traffic noise distribution
+by using urban elements such as roads and buildings as the input. The mean
+values for the root mean square error (RMSE) and structural similarity index
+(SSIM) are 0.3024 dB(A) and 0.8528, respectively, for the validation dataset.
+The trained model is integrated into Grasshopper as a tool, facilitating the
+rapid generation of traffic noise maps. This integration allows urban designers
+and planners, even those without expertise in acoustics, to easily anticipate
+changes in acoustic impact caused by design decisions in the early design
+stages.
+
+ comment: Accepted by Applied Acoustics +
+
+
+
+
+ + ♻ ☆ Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model + Predictive Control + + +
+ Model-based control requires an accurate model of the system dynamics for +precisely and safely controlling the robot in complex and dynamic environments. +Moreover, in the presence of variations in the operating conditions, the model +should be continuously refined to compensate for dynamics changes. In this +paper, we present a self-supervised learning approach that actively models the +dynamics of nonlinear robotic systems. We combine offline learning from past +experience and online learning from current robot interaction with the unknown +environment. These two ingredients enable a highly sample-efficient and +adaptive learning process, capable of accurately inferring model dynamics in +real-time even in operating regimes that greatly differ from the training +distribution. Moreover, we design an uncertainty-aware model predictive +controller that is heuristically conditioned to the aleatoric (data) +uncertainty of the learned dynamics. This controller actively chooses the +optimal control actions that (i) optimize the control performance and (ii) +improve the efficiency of online learning sample collection. We demonstrate the +effectiveness of our method through a series of challenging real-world +experiments using a quadrotor system. Our approach showcases high resilience +and generalization capabilities by consistently adapting to unseen flight +conditions, while it significantly outperforms classical and adaptive control +baselines. + +
+
+
+
+
+ + ♻ ☆ Primal-dual extrapolation methods for monotone inclusions under local + Lipschitz continuity + + +
+ In this paper we consider a class of monotone inclusion (MI) problems of +finding a zero of the sum of two monotone operators, in which one operator is +maximal monotone while the other is {\it locally Lipschitz} continuous. We +propose primal-dual extrapolation methods to solve them using a point and +operator extrapolation technique, whose parameters are chosen by a backtracking +line search scheme. The proposed methods enjoy an operation complexity of +${\cal O}(\log \epsilon^{-1})$ and ${\cal O}(\epsilon^{-1}\log \epsilon^{-1})$, +measured by the number of fundamental operations consisting only of evaluations +of one operator and resolvent of the other operator, for finding an +$\varepsilon$-residual solution of strongly and non-strongly MI problems, +respectively. The latter complexity significantly improves the previously best +operation complexity ${\cal O}(\varepsilon^{-2})$. As a byproduct, complexity +results of the primal-dual extrapolation methods are also obtained for finding +an $\varepsilon$-KKT or $\varepsilon$-residual solution of convex conic +optimization, conic constrained saddle point, and variational inequality +problems under {\it local Lipschitz} continuity. We provide preliminary +numerical results to demonstrate the performance of the proposed methods. + +
+
+ comment: To appear in Mathematics of Operations Research +
+
+
+
+
+ + ♻ ☆ Efficient Long-distance Latent Relation-aware Graph Neural Network for + Multi-modal Emotion Recognition in Conversations + + +
+ The task of multi-modal emotion recognition in conversation (MERC) aims to +analyze the genuine emotional state of each utterance based on the multi-modal +information in the conversation, which is crucial for conversation +understanding. Existing methods focus on using graph neural networks (GNN) to +model conversational relationships and capture contextual latent semantic +relationships. However, due to the complexity of GNN, existing methods cannot +efficiently capture the potential dependencies between long-distance +utterances, which limits the performance of MERC. In this paper, we propose an +Efficient Long-distance Latent Relation-aware Graph Neural Network (ELR-GNN) +for multi-modal emotion recognition in conversations. Specifically, we first +use pre-extracted text, video and audio features as input to Bi-LSTM to capture +contextual semantic information and obtain low-level utterance features. Then, +we use low-level utterance features to construct a conversational emotion +interaction graph. To efficiently capture the potential dependencies between +long-distance utterances, we use the dilated generalized forward push algorithm +to precompute the emotional propagation between global utterances and design an +emotional relation-aware operator to capture the potential semantic +associations between different utterances. Furthermore, we combine early fusion +and adaptive late fusion mechanisms to fuse latent dependency information +between speaker relationship information and context. Finally, we obtain +high-level discourse features and feed them into MLP for emotion prediction. +Extensive experimental results show that ELR-GNN achieves state-of-the-art +performance on the benchmark datasets IEMOCAP and MELD, with running times +reduced by 52\% and 35\%, respectively. + +
+
+ comment: 11 pages, 3 tables +
+
+
+
+
+ + ♻ ☆ VA-learning as a more efficient alternative to Q-learning ICML 2023 + + +
+ In reinforcement learning, the advantage function is critical for policy +improvement, but is often extracted from a learned Q-function. A natural +question is: Why not learn the advantage function directly? In this work, we +introduce VA-learning, which directly learns advantage function and value +function using bootstrapping, without explicit reference to Q-functions. +VA-learning learns off-policy and enjoys similar theoretical guarantees as +Q-learning. Thanks to the direct learning of advantage function and value +function, VA-learning improves the sample efficiency over Q-learning both in +tabular implementations and deep RL agents on Atari-57 games. We also identify +a close connection between VA-learning and the dueling architecture, which +partially explains why a simple architectural change to DQN agents tends to +improve performance. + +
+
+ comment: Accepted to ICML 2023 as a conference paper +
+
+
+
+
+ + ♻ ☆ Adaptive Split Balancing for Optimal Random Forest + + +
+ In this paper, we propose a new random forest algorithm that constructs the
+trees using a novel adaptive split-balancing method. Rather than relying on the
+widely-used random feature selection, we propose a permutation-based balanced
+splitting criterion. The adaptive split balancing forest (ASBF) achieves
+minimax optimality under the Lipschitz class. Its localized version, which fits
+local regressions at the leaf level, attains the minimax rate under the broad
+H\"older class $\mathcal{H}^{q,\beta}$ of problems for any $q\in\mathbb{N}$ and
+$\beta\in(0,1]$. We identify that over-reliance on auxiliary randomness in tree
+construction may compromise the approximation power of trees, leading to
+suboptimal results. Conversely, the proposed less random, permutation-based
+approach demonstrates optimality over a wide range of models. Although random
+forests are known to perform well empirically, their theoretical convergence
+rates are slow. Simplified versions that construct trees without data
+dependence offer faster rates but lack adaptability during tree growth. Our
+proposed method achieves optimality in simple, smooth scenarios while
+adaptively learning the tree structure from the data. Additionally, we
+establish uniform upper bounds and demonstrate that ASBF improves
+dimensionality dependence in average treatment effect estimation problems.
+Simulation studies and real-world applications demonstrate our methods'
+superior performance over existing random forests.
+
+
+
+
+ + ♻ ☆ A Survey for Foundation Models in Autonomous Driving + + +
+ The advent of foundation models has revolutionized the fields of natural +language processing and computer vision, paving the way for their application +in autonomous driving (AD). This survey presents a comprehensive review of more +than 40 research papers, demonstrating the role of foundation models in +enhancing AD. Large language models contribute to planning and simulation in +AD, particularly through their proficiency in reasoning, code generation and +translation. In parallel, vision foundation models are increasingly adapted for +critical tasks such as 3D object detection and tracking, as well as creating +realistic driving scenarios for simulation and testing. Multi-modal foundation +models, integrating diverse inputs, exhibit exceptional visual understanding +and spatial reasoning, crucial for end-to-end AD. This survey not only provides +a structured taxonomy, categorizing foundation models based on their modalities +and functionalities within the AD domain but also delves into the methods +employed in current research. It identifies the gaps between existing +foundation models and cutting-edge AD approaches, thereby charting future +research directions and proposing a roadmap for bridging these gaps. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Comparative Analysis of Modality Fusion Approaches for Audio-Visual + Person Identification and Verification + + +
+ Multimodal learning involves integrating information from various modalities
+to enhance learning and comprehension. We compare three modality fusion
+strategies in person identification and verification by processing two
+modalities: voice and face. In this paper, a one-dimensional convolutional
+neural network is employed for x-vector extraction from voice, while the
+pre-trained VGGFace2 network and transfer learning are utilized for the face
+modality. In addition, gammatonegrams are used as the speech representation in
+combination with the pre-trained Darknet19 network. The proposed systems are
+evaluated using the K-fold cross-validation technique on the 118 speakers of
+the test set of the VoxCeleb2 dataset. The comparative evaluations cover the
+single-modality systems and the three proposed multimodal strategies under
+identical conditions. Results demonstrate that the feature fusion strategy of
+gammatonegram and facial features achieves the highest performance, with an
+accuracy of 98.37% in the person identification task. In the verification task,
+concatenating facial features with the x-vector achieves an equal error rate
+(EER) of 0.62%.
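+ A minimal sketch of feature-level fusion as referenced above (the embedding sizes and normalisation are assumptions, not the paper's exact pipeline): each modality vector is L2-normalised and concatenated before a joint classifier.
+ import numpy as np
+ 
+ def fuse_features(face_embedding, voice_embedding):
+     # L2-normalise each modality so neither dominates, then concatenate.
+     f = face_embedding / (np.linalg.norm(face_embedding) + 1e-9)
+     v = voice_embedding / (np.linalg.norm(voice_embedding) + 1e-9)
+     return np.concatenate([f, v])
+ 
+ fused = fuse_features(np.random.randn(512), np.random.randn(128))
+ print(fused.shape)  # (640,) -- fed to a joint identification/verification classifier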
+
+ comment: This paper has been submitted to a conference +
+
+
+
+
+ + ☆ Digit Recognition using Multimodal Spiking Neural Networks + + +
+ Spiking neural networks (SNNs) are the third generation of neural networks +that are biologically inspired to process data in a fashion that emulates the +exchange of signals in the brain. Within the Computer Vision community SNNs +have garnered significant attention due in large part to the availability of +event-based sensors that produce a spatially resolved spike train in response +to changes in scene radiance. SNNs are used to process event-based data due to +their neuromorphic nature. The proposed work examines the neuromorphic +advantage of fusing multiple sensory inputs in classification tasks. +Specifically we study the performance of a SNN in digit classification by +passing in a visual modality branch (Neuromorphic-MNIST [N-MNIST]) and an +auditory modality branch (Spiking Heidelberg Digits [SHD]) from datasets that +were created using event-based sensors to generate a series of time-dependent +events. It is observed that multi-modal SNNs outperform unimodal visual and +unimodal auditory SNNs. Furthermore, it is observed that the process of sensory +fusion is insensitive to the depth at which the visual and auditory branches +are combined. This work achieves a 98.43% accuracy on the combined N-MNIST and +SHD dataset using a multimodal SNN that concatenates the visual and auditory +branches at a late depth. + +
+
+ comment: 4 pages, 2 figures, submitted to 2025 IEEE International Conference + on Acoustics, Speech, and Signal Processing +
+
+
+
+
+ + ☆ Multi-scale Multi-instance Visual Sound Localization and Segmentation + + +
+ Visual sound localization is a typical and challenging problem that predicts +the location of objects corresponding to the sound source in a video. Previous +methods mainly used the audio-visual association between global audio and +one-scale visual features to localize sounding objects in each image. Despite +their promising performance, they omitted multi-scale visual features of the +corresponding image, and they cannot learn discriminative regions compared to +ground truths. To address this issue, we propose a novel multi-scale +multi-instance visual sound localization framework, namely M2VSL, that can +directly learn multi-scale semantic features associated with sound sources from +the input image to localize sounding objects. Specifically, our M2VSL leverages +learnable multi-scale visual features to align audio-visual representations at +multi-level locations of the corresponding image. We also introduce a novel +multi-scale multi-instance transformer to dynamically aggregate multi-scale +cross-modal representations for visual sound localization. We conduct extensive +experiments on VGGSound-Instruments, VGG-Sound Sources, and AVSBench +benchmarks. The results demonstrate that the proposed M2VSL can achieve +state-of-the-art performance on sounding object localization and segmentation. + +
+
+
+
+
+ + ♻ ☆ Palantir: Towards Efficient Super Resolution for Ultra-high-definition + Live Streaming + + +
+ Neural enhancement through super-resolution (SR) deep neural networks (DNNs) +opens up new possibilities for ultra-high-definition (UHD) live streaming over +existing encoding and networking infrastructure. Yet, the heavy SR DNN +inference overhead leads to severe deployment challenges. To reduce the +overhead, existing systems propose to apply DNN-based SR only on carefully +selected anchor frames while upscaling non-anchor frames via the lightweight +reusing-based SR approach. However, frame-level scheduling is coarse-grained +and fails to deliver optimal efficiency. In this work, we propose Palantir, the +first neural-enhanced UHD live streaming system with fine-grained patch-level +scheduling. Two novel techniques are incorporated into Palantir to select the +most beneficial anchor patches and support latency-sensitive UHD live streaming +applications. Firstly, under the guidance of our pioneering and theoretical +analysis, Palantir constructs a directed acyclic graph (DAG) for lightweight +yet accurate SR quality estimation under any possible anchor patch set. +Secondly, to further optimize the scheduling latency, Palantir improves +parallelizability by refactoring the computation subprocedure of the estimation +process into a sparse matrix-matrix multiplication operation. + The evaluation results suggest that Palantir incurs a negligible scheduling +latency accounting for less than 5.7% of the end-to-end latency requirement. +When compared to the naive method of applying DNN-based SR on all the frames, +Palantir can reduce the SR DNN inference overhead by 20 times (or 60 times) +while preserving 54.0-82.6% (or 32.8-64.0%) of the quality gain. When compared +to the state-of-the-art real-time frame-level scheduling strategy, Palantir can +reduce the SR DNN inference overhead by 80.1% at most (and 38.4% on average) +without sacrificing the video quality. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 55 + +
+
+
+ + ☆ Bridging Episodes and Semantics: A Novel Framework for Long-Form Video + Understanding ECCV'24 + + +
+ While existing research often treats long-form videos as extended short +videos, we propose a novel approach that more accurately reflects human +cognition. This paper introduces BREASE: BRidging Episodes And SEmantics for +Long-Form Video Understanding, a model that simulates episodic memory +accumulation to capture action sequences and reinforces them with semantic +knowledge dispersed throughout the video. Our work makes two key contributions: +First, we develop an Episodic COmpressor (ECO) that efficiently aggregates +crucial representations from micro to semi-macro levels. Second, we propose a +Semantics reTRiever (SeTR) that enhances these aggregated representations with +semantic information by focusing on the broader context, dramatically reducing +feature dimensionality while preserving relevant macro-level information. +Extensive experiments demonstrate that BREASE achieves state-of-the-art +performance across multiple long video understanding benchmarks in both +zero-shot and fully-supervised settings. The project page and code are at: +https://joslefaure.github.io/assets/html/hermes.html. + +
+
+ comment: Accepted to the EVAL-FoMo Workshop at ECCV'24. Project page: + https://joslefaure.github.io/assets/html/hermes.html +
+
+
+
+
+ + ☆ SYNTHEVAL: Hybrid Behavioral Testing of NLP Models with Synthetic + CheckLists + + +
+ Traditional benchmarking in NLP typically involves using static held-out test
+sets. However, this approach often results in an overestimation of performance
+and lacks the ability to offer comprehensive, interpretable, and dynamic
+assessments of NLP models. Recently, works like DynaBench (Kiela et al., 2021)
+and CheckList (Ribeiro et al., 2020) have addressed these limitations through
+behavioral testing of NLP models with test types generated by a multistep
+human-annotated pipeline. Unfortunately, manually creating a variety of test
+types requires much human labor, often at prohibitive cost. In this work, we
+propose SYNTHEVAL, a hybrid behavioral testing framework that leverages large
+language models (LLMs) to generate a wide range of test types for a
+comprehensive evaluation of NLP models. SYNTHEVAL first generates sentences via
+LLMs using controlled generation, and then identifies challenging examples by
+comparing the predictions made by LLMs with task-specific NLP models. In the
+last stage, human experts investigate the challenging examples, manually design
+templates, and identify the types of failures the task-specific models
+consistently exhibit. We apply SYNTHEVAL to two classification tasks, sentiment
+analysis and toxic language detection, and show that our framework is effective
+in identifying weaknesses of strong models on these tasks. We share our code at
+https://github.com/Loreley99/SynthEval_CheckList.
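+ The step of identifying challenging examples can be pictured as a simple disagreement filter between the LLM and the task-specific model; the two predictor lambdas below are hypothetical stand-ins, not SYNTHEVAL components:
+ def find_challenging(sentences, llm_predict, task_model_predict):
+     # Keep generated sentences on which the two systems disagree; these are the
+     # candidates later inspected by human experts.
+     return [s for s in sentences if llm_predict(s) != task_model_predict(s)]
+ 
+ llm_predict = lambda s: "toxic" if "hate" in s.lower() else "non-toxic"    # toy stand-in
+ task_model_predict = lambda s: "toxic" if "!" in s else "non-toxic"        # toy stand-in
+ print(find_challenging(["I hate this", "Nice day!", "Nice day"],
+                        llm_predict, task_model_predict))  # first two disagree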
+
+
+
+
+ + ☆ CLOCR-C: Context Leveraging OCR Correction with Pre-trained Language + Models + + +
+ The digitisation of historical print media archives is crucial for increasing +accessibility to contemporary records. However, the process of Optical +Character Recognition (OCR) used to convert physical records to digital text is +prone to errors, particularly in the case of newspapers and periodicals due to +their complex layouts. This paper introduces Context Leveraging OCR Correction +(CLOCR-C), which utilises the infilling and context-adaptive abilities of +transformer-based language models (LMs) to improve OCR quality. The study aims +to determine if LMs can perform post-OCR correction, improve downstream NLP +tasks, and the value of providing the socio-cultural context as part of the +correction process. Experiments were conducted using seven LMs on three +datasets: the 19th Century Serials Edition (NCSE) and two datasets from the +Overproof collection. The results demonstrate that some LMs can significantly +reduce error rates, with the top-performing model achieving over a 60% +reduction in character error rate on the NCSE dataset. The OCR improvements +extend to downstream tasks, such as Named Entity Recognition, with increased +Cosine Named Entity Similarity. Furthermore, the study shows that providing +socio-cultural context in the prompts improves performance, while misleading +prompts lower performance. In addition to the findings, this study releases a +dataset of 91 transcribed articles from the NCSE, containing a total of 40 +thousand words, to support further research in this area. The findings suggest +that CLOCR-C is a promising approach for enhancing the quality of existing +digital archives by leveraging the socio-cultural information embedded in the +LMs and the text requiring correction. + +
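+ Since the headline result is a reduction in character error rate (CER), a small reference implementation of that metric may help; this is the standard Levenshtein-based definition, not code from the paper:
+ def cer(reference: str, hypothesis: str) -> float:
+     # Character error rate: Levenshtein edit distance divided by reference length.
+     m, n = len(reference), len(hypothesis)
+     d = list(range(n + 1))
+     for i in range(1, m + 1):
+         prev, d[0] = d[0], i
+         for j in range(1, n + 1):
+             cur = d[j]
+             cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
+             d[j] = min(d[j] + 1, d[j - 1] + 1, prev + cost)
+             prev = cur
+     return d[n] / max(m, 1)
+ 
+ print(cer("the quick brown fox", "tne qu1ck brown fox"))  # 2 errors over 19 chars, about 0.105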
+
+ comment: 13 pages, 3 figures, currently under peer review +
+
+
+
+
+ + ☆ NDP: Next Distribution Prediction as a More Broad Target + + +
+ Large language models (LLMs) trained with the next-token prediction (NTP)
+paradigm have demonstrated powerful capabilities. However, the existing NTP
+paradigm contains several limitations, particularly related to complications in
+planning tasks and error propagation during inference. In our work, we extend
+the critique of NTP, highlighting a further limitation due to training with a
+narrow objective: the prediction of a sub-optimal one-hot distribution. To
+support this critique, we conducted a pre-experiment treating the output
+distribution from powerful LLMs as efficient world data compression. By
+evaluating the similarity between the $n$-gram distribution and the one-hot
+distribution with LLMs, we observed that the $n$-gram distributions align more
+closely with the output distribution of LLMs. Based on this insight, we
+introduce Next Distribution Prediction (NDP), which uses $n$-gram distributions
+to replace the one-hot targets, enhancing learning without extra online
+training time. We conducted experiments across translation, general tasks,
+language transfer, and medical domain adaptation. Compared to NTP, NDP achieves
+up to a +2.97 COMET improvement in translation tasks, a +0.61 average
+improvement in general tasks, and a remarkable +10.75 average improvement in
+the medical domain. This demonstrates the concrete benefits of addressing the
+target-narrowing problem, pointing to a new direction for future work on
+improving NTP.
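+ A minimal sketch of training against an n-gram target distribution instead of a one-hot target (the tensor shapes, smoothing term, and toy counts are assumptions; the paper's exact formulation may differ):
+ import torch
+ import torch.nn.functional as F
+ 
+ def soft_target_loss(logits, ngram_counts, smoothing=0.0):
+     # logits:       (batch, vocab) model outputs for the next token
+     # ngram_counts: (batch, vocab) counts of continuations observed for each context
+     target = ngram_counts.float() + smoothing
+     target = target / target.sum(dim=-1, keepdim=True)   # normalise counts to a distribution
+     log_probs = F.log_softmax(logits, dim=-1)
+     return -(target * log_probs).sum(dim=-1).mean()       # cross-entropy against the soft target
+ 
+ # toy example: vocab of 4; for this context, token 2 was seen 3 times and token 0 once
+ logits = torch.randn(1, 4)
+ counts = torch.tensor([[1.0, 0.0, 3.0, 0.0]])
+ print(soft_target_loss(logits, counts))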
+
+ comment: 8 pages,5 figures +
+
+
+
+
+ + ☆ Assessing Generative Language Models in Classification Tasks: + Performance and Self-Evaluation Capabilities in the Environmental and Climate + Change Domain + + +
+ This paper examines the performance of two Large Language Models (LLMs),
+GPT3.5 and Llama2, and one Small Language Model (SLM), Gemma, across three
+different classification tasks within the climate change (CC) and environmental
+domain. Employing BERT-based models as a baseline, we compare their efficacy
+against these generative models. Additionally, we assess the models'
+self-evaluation capabilities by analyzing the calibration of verbalized
+confidence scores in these text classification tasks. Our findings reveal that
+while BERT-based models generally outperform both the LLMs and the SLM, the
+performance of the large generative models is still noteworthy. Furthermore,
+our calibration analysis reveals that although Gemma is well-calibrated in
+initial tasks, it thereafter produces inconsistent results; Llama is reasonably
+calibrated, and GPT consistently exhibits strong calibration. Through this
+research, we aim to contribute to the ongoing discussion on the utility and
+effectiveness of generative LMs in addressing some of the planet's most urgent
+issues, highlighting their strengths and limitations in the context of ecology
+and CC.
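+ The calibration analysis mentioned above is typically summarised with the expected calibration error (ECE); a small reference implementation follows (the bin count and toy inputs are assumptions, not values from the paper):
+ import numpy as np
+ 
+ def expected_calibration_error(confidences, correct, n_bins=10):
+     # ECE: bin-weight-averaged |accuracy - mean confidence| over equal-width confidence bins.
+     confidences = np.asarray(confidences)
+     correct = np.asarray(correct, dtype=float)
+     bins = np.linspace(0.0, 1.0, n_bins + 1)
+     ece = 0.0
+     for lo, hi in zip(bins[:-1], bins[1:]):
+         mask = (confidences > lo) & (confidences <= hi)
+         if mask.any():
+             ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
+     return ece
+ 
+ print(expected_calibration_error([0.9, 0.8, 0.6, 0.95], [1, 1, 0, 0]))  # about 0.46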
+
+ comment: 11 pages, to be published in NLDB 2024 +
+
+
+
+
+ + ☆ Impact of ChatGPT on the writing style of condensed matter physicists + + +
+ We apply a state-of-the-art difference-in-differences approach to estimate +the impact of ChatGPT's release on the writing style of condensed matter papers +on arXiv. Our analysis reveals a statistically significant improvement in the +English quality of abstracts written by non-native English speakers. +Importantly, this improvement remains robust even after accounting for other +potential factors, confirming that it can be attributed to the release of +ChatGPT. This indicates widespread adoption of the tool. Following the release +of ChatGPT, there is a significant increase in the use of unique words, while +the frequency of rare words decreases. Across language families, the changes in +writing style are significant for authors from the Latin and Ural-Altaic +groups, but not for those from the Germanic or other Indo-European groups. + +
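+ For readers unfamiliar with the estimator, the classic two-group, two-period difference-in-differences contrast is sketched below; the paper's state-of-the-art variant is more elaborate, and the toy numbers here are invented:
+ import numpy as np
+ 
+ def did_estimate(y_treat_pre, y_treat_post, y_ctrl_pre, y_ctrl_post):
+     # Treatment effect = (change in the treated group) - (change in the control group).
+     return ((np.mean(y_treat_post) - np.mean(y_treat_pre))
+             - (np.mean(y_ctrl_post) - np.mean(y_ctrl_pre)))
+ 
+ # toy scores: non-native speakers (treated) vs native speakers (control), pre/post release
+ print(did_estimate([0.55, 0.60], [0.70, 0.74], [0.80, 0.82], [0.81, 0.83]))  # about 0.135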
+
+ comment: 9 pages, 1 figure, 7 tables +
+
+
+
+
+ + ☆ Modularity in Transformers: Investigating Neuron Separability & + Specialization + + +
+ Transformer models are increasingly prevalent in various applications, yet +our understanding of their internal workings remains limited. This paper +investigates the modularity and task specialization of neurons within +transformer architectures, focusing on both vision (ViT) and language (Mistral +7B) models. Using a combination of selective pruning and MoEfication clustering +techniques, we analyze the overlap and specialization of neurons across +different tasks and data subsets. Our findings reveal evidence of task-specific +neuron clusters, with varying degrees of overlap between related tasks. We +observe that neuron importance patterns persist to some extent even in randomly +initialized models, suggesting an inherent structure that training refines. +Additionally, we find that neuron clusters identified through MoEfication +correspond more strongly to task-specific neurons in earlier and later layers +of the models. This work contributes to a more nuanced understanding of +transformer internals and offers insights into potential avenues for improving +model interpretability and efficiency. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Investigating Neuron Ablation in Attention Heads: The Case for Peak + Activation Centering + + +
+ The use of transformer-based models is growing rapidly throughout society. +With this growth, it is important to understand how they work, and in +particular, how the attention mechanisms represent concepts. Though there are +many interpretability methods, many look at models through their neuronal +activations, which are poorly understood. We describe different lenses through +which to view neuron activations, and investigate the effectiveness in language +models and vision transformers through various methods of neural ablation: zero +ablation, mean ablation, activation resampling, and a novel approach we term +'peak ablation'. Through experimental analysis, we find that in different +regimes and models, each method can offer the lowest degradation of model +performance compared to other methods, with resampling usually causing the most +significant performance deterioration. We make our code available at +https://github.com/nickypro/investigating-ablation. + +
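+ Zero and mean ablation of a single neuron can be implemented with a forward hook, as sketched below; the layer, neuron index, and mean value are placeholders, and the paper's 'peak ablation' would substitute a different replacement value:
+ import torch
+ import torch.nn as nn
+ 
+ def ablate(layer, neuron_idx, mode="zero", mean_activation=None):
+     # Returns a hook handle; the hook overwrites one neuron's activation on every forward pass.
+     def hook(module, inputs, output):
+         output = output.clone()
+         output[..., neuron_idx] = 0.0 if mode == "zero" else mean_activation
+         return output
+     return layer.register_forward_hook(hook)
+ 
+ layer = nn.Linear(8, 8)
+ handle = ablate(layer, neuron_idx=3, mode="mean", mean_activation=0.25)
+ print(layer(torch.randn(2, 8))[:, 3])   # column 3 is now fixed at 0.25
+ handle.remove()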
+
+ comment: 9 pages, 2 figures, XAI World Conference 2024 Late-Breaking Work +
+
+
+
+
+ + ☆ Bridging Domain Knowledge and Process Discovery Using Large Language + Models + + +
+ Discovering good process models is essential for different process analysis +tasks such as conformance checking and process improvements. Automated process +discovery methods often overlook valuable domain knowledge. This knowledge, +including insights from domain experts and detailed process documentation, +remains largely untapped during process discovery. This paper leverages Large +Language Models (LLMs) to integrate such knowledge directly into process +discovery. We use rules derived from LLMs to guide model construction, ensuring +alignment with both domain knowledge and actual process executions. By +integrating LLMs, we create a bridge between process knowledge expressed in +natural language and the discovery of robust process models, advancing process +discovery methodologies significantly. To showcase the usability of our +framework, we conducted a case study with the UWV employee insurance agency, +demonstrating its practical benefits and effectiveness. + +
+
+ comment: This paper is accepted at the AI4BPM 2024 workshop and to be + published in their proceedings +
+
+
+
+
+ + ☆ Towards Tailored Recovery of Lexical Diversity in Literary Machine + Translation + + +
+ Machine translations are found to be lexically poorer than human +translations. The loss of lexical diversity through MT poses an issue in the +automatic translation of literature, where it matters not only what is written, +but also how it is written. Current methods for increasing lexical diversity in +MT are rigid. Yet, as we demonstrate, the degree of lexical diversity can vary +considerably across different novels. Thus, rather than aiming for the rigid +increase of lexical diversity, we reframe the task as recovering what is lost +in the machine translation process. We propose a novel approach that consists +of reranking translation candidates with a classifier that distinguishes +between original and translated text. We evaluate our approach on 31 +English-to-Dutch book translations, and find that, for certain books, our +approach retrieves lexical diversity scores that are close to human +translation. + +
+
+ comment: Accepted to EAMT 2024 +
+
+
+
+
+ + ☆ Flexible and Effective Mixing of Large Language Models into a Mixture of + Domain Experts + + +
+ We present a toolkit for creating low-cost Mixture-of-Domain-Experts (MOE) +from trained models. The toolkit can be used for creating a mixture from models +or from adapters. We perform extensive tests and offer guidance on defining the +architecture of the resulting MOE using the toolkit. A public repository is +available. + +
+
+
+
+
+ + ☆ Improving Extraction of Clinical Event Contextual Properties from + Electronic Health Records: A Comparative Study + + +
+ Electronic Health Records are large repositories of valuable clinical data, +with a significant portion stored in unstructured text format. This textual +data includes clinical events (e.g., disorders, symptoms, findings, medications +and procedures) in context that if extracted accurately at scale can unlock +valuable downstream applications such as disease prediction. Using an existing +Named Entity Recognition and Linking methodology, MedCAT, these identified +concepts need to be further classified (contextualised) for their relevance to +the patient, and their temporal and negated status for example, to be useful +downstream. This study performs a comparative analysis of various natural +language models for medical text classification. Extensive experimentation +reveals the effectiveness of transformer-based language models, particularly +BERT. When combined with class imbalance mitigation techniques, BERT +outperforms Bi-LSTM models by up to 28% and the baseline BERT model by up to +16% for recall of the minority classes. The method has been implemented as part +of CogStack/MedCAT framework and made available to the community for further +research. + +
+
+
+
+
+ + ☆ Codec Does Matter: Exploring the Semantic Shortcoming of Codec for Audio + Language Model + + +
+ Recent advancements in audio generation have been significantly propelled by +the capabilities of Large Language Models (LLMs). The existing research on +audio LLM has primarily focused on enhancing the architecture and scale of +audio language models, as well as leveraging larger datasets, and generally, +acoustic codecs, such as EnCodec, are used for audio tokenization. However, +these codecs were originally designed for audio compression, which may lead to +suboptimal performance in the context of audio LLM. Our research aims to +address the shortcomings of current audio LLM codecs, particularly their +challenges in maintaining semantic integrity in generated audio. For instance, +existing methods like VALL-E, which condition acoustic token generation on text +transcriptions, often suffer from content inaccuracies and elevated word error +rates (WER) due to semantic misinterpretations of acoustic tokens, resulting in +word skipping and errors. To overcome these issues, we propose a +straightforward yet effective approach called X-Codec. X-Codec incorporates +semantic features from a pre-trained semantic encoder before the Residual +Vector Quantization (RVQ) stage and introduces a semantic reconstruction loss +after RVQ. By enhancing the semantic ability of the codec, X-Codec +significantly reduces WER in speech synthesis tasks and extends these benefits +to non-speech applications, including music and sound generation. Our +experiments in text-to-speech, music continuation, and text-to-sound tasks +demonstrate that integrating semantic information substantially improves the +overall performance of language models in audio generation. Our code and demo +are available (Demo: https://x-codec-audio.github.io Code: +https://github.com/zhenye234/xcodec) + +
+
+
+
+
+ + ☆ MaFeRw: Query Rewriting with Multi-Aspect Feedbacks for + Retrieval-Augmented Large Language Models + + +
+ In a real-world RAG system, the current query often involves spoken ellipses +and ambiguous references from dialogue contexts, necessitating query rewriting +to better describe user's information needs. However, traditional context-based +rewriting has minimal enhancement on downstream generation tasks due to the +lengthy process from query rewriting to response generation. Some researchers +try to utilize reinforcement learning with generation feedback to assist the +rewriter, but these sparse rewards provide little guidance in most cases, +leading to unstable training and generation results. We find that user's needs +are also reflected in the gold document, retrieved documents and ground truth. +Therefore, by feeding back these multi-aspect dense rewards to query rewriting, +more stable and satisfactory responses can be achieved. In this paper, we +propose a novel query rewriting method MaFeRw, which improves RAG performance +by integrating multi-aspect feedback from both the retrieval process and +generated results. Specifically, we first use manual data to train a T5 model +for the rewriter initialization. Next, we design three metrics as reinforcement +learning feedback: the similarity between the rewritten query and the gold +document, the ranking metrics, and ROUGE between the generation and the ground +truth. Inspired by RLAIF, we train three kinds of reward models for the above +metrics to achieve more efficient training. Finally, we combine the scores of +these reward models as feedback, and use PPO algorithm to explore the optimal +query rewriting strategy. Experimental results on two conversational RAG +datasets demonstrate that MaFeRw achieves superior generation metrics and more +stable training compared to baselines. + +
+
+
+
+
+ + ☆ Novel-WD: Exploring acquisition of Novel World Knowledge in LLMs Using + Prefix-Tuning + + +
+ Teaching new information to pre-trained large language models (PLM) is a +crucial but challenging task. Model adaptation techniques, such as fine-tuning +and parameter-efficient training have been shown to store new facts at a slow +rate; continual learning is an option but is costly and prone to catastrophic +forgetting. This work studies and quantifies how PLM may learn and remember new +world knowledge facts that do not occur in their pre-training corpus, which +only contains world knowledge up to a certain date. To that purpose, we first +propose Novel-WD, a new dataset consisting of sentences containing novel facts +extracted from recent Wikidata updates, along with two evaluation tasks in the +form of causal language modeling and multiple choice questions (MCQ). We make +this dataset freely available to the community, and release a procedure to +later build new versions of similar datasets with up-to-date information. We +also explore the use of prefix-tuning for novel information learning, and +analyze how much information can be stored within a given prefix. We show that +a single fact can reliably be encoded within a single prefix, and that the +prefix capacity increases with its length and with the base model size. + +
+
+
+
+
+ + ☆ From Text to Emotion: Unveiling the Emotion Annotation Capabilities of + LLMs + + +
+ Training emotion recognition models has relied heavily on human-annotated
+data, which presents diversity, quality, and cost challenges. In this paper, we
+explore the potential of Large Language Models (LLMs), specifically GPT-4, in
+automating or assisting emotion annotation. We compare GPT-4 with supervised
+models and/or humans in three aspects: agreement with human annotations,
+alignment with human perception, and impact on model training. We find that
+common metrics that use aggregated human annotations as ground truth can
+underestimate the performance of GPT-4, and our human evaluation experiment
+reveals a consistent preference for GPT-4 annotations over humans across
+multiple datasets and evaluators. Further, we investigate the impact of using
+GPT-4 as an annotation filtering process to improve model training. Together,
+our findings highlight the great potential of LLMs in emotion annotation tasks
+and underscore the need for refined evaluation methodologies.
+
+ comment: to be published in Interspeech 2024 +
+
+
+
+
+ + ☆ InkubaLM: A small language model for low-resource African languages + + +
+ High-resource language models often fall short in the African context, where +there is a critical need for models that are efficient, accessible, and locally +relevant, even amidst significant computing and data constraints. This paper +introduces InkubaLM, a small language model with 0.4 billion parameters, which +achieves performance comparable to models with significantly larger parameter +counts and more extensive training data on tasks such as machine translation, +question-answering, AfriMMLU, and the AfriXnli task. Notably, InkubaLM +outperforms many larger models in sentiment analysis and demonstrates +remarkable consistency across multiple languages. This work represents a +pivotal advancement in challenging the conventional paradigm that effective +language models must rely on substantial resources. Our model and datasets are +publicly available \footnote{\url{https://huggingface.co/lelapa}} to encourage +research and development on low-resource languages. + +
+
+
+
+
+ + ☆ Dynamic Self-Consistency: Leveraging Reasoning Paths for Efficient LLM + Sampling + + +
+ Self-Consistency (SC) is a widely used method to mitigate hallucinations in
+Large Language Models (LLMs) by sampling the LLM multiple times and outputting
+the most frequent solution. Despite its benefits, SC results in significant
+computational costs proportional to the number of samples generated. Previous
+early-stopping approaches, such as Early Stopping Self Consistency and Adaptive
+Consistency, have aimed to reduce these costs by considering output
+consistency, but they do not analyze the quality of the reasoning paths (RPs)
+themselves. To address this issue, we propose Reasoning-Aware Self-Consistency
+(RASC), an innovative early-stopping framework that dynamically adjusts the
+number of sample generations by considering both the output answer and the RPs
+from Chain of Thought (CoT) prompting. RASC assigns confidence scores
+sequentially to the generated samples, stops when certain criteria are met, and
+then employs weighted majority voting to optimize sample usage and enhance
+answer reliability. We comprehensively test RASC with multiple LLMs across
+varied QA datasets. RASC outperforms existing methods, reducing sample usage by
+an average of 80% while maintaining or improving accuracy by up to 5% compared
+to the original SC.
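+ The sampling loop described above can be pictured roughly as follows; the confidence values, stopping margin, and toy sampler are placeholder assumptions rather than RASC's actual scoring and stopping criteria:
+ import random
+ from collections import defaultdict
+ 
+ def early_stop_weighted_vote(sample_fn, max_samples=40, margin=3.0):
+     # Draw (answer, confidence) pairs one at a time; stop once the leading answer's
+     # accumulated confidence is far enough ahead, then return the weighted winner.
+     scores = defaultdict(float)
+     for n in range(1, max_samples + 1):
+         answer, confidence = sample_fn()
+         scores[answer] += confidence
+         ranked = sorted(scores.values(), reverse=True)
+         lead = ranked[0] - (ranked[1] if len(ranked) > 1 else 0.0)
+         if lead >= margin:
+             break
+     return max(scores, key=scores.get), n
+ 
+ sampler = lambda: ("42", 0.9) if random.random() < 0.8 else ("41", 0.5)   # toy CoT sampler
+ print(early_stop_weighted_vote(sampler))   # usually ('42', ~4-6) instead of 40 samples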
+
+
+
+
+ + ☆ Tool-Assisted Agent on SQL Inspection and Refinement in Real-World + Scenarios + + +
+ Recent Text-to-SQL methods leverage large language models (LLMs) by +incorporating feedback from the database management system. While these methods +effectively address execution errors in SQL queries, they struggle with +database mismatches -- errors that do not trigger execution exceptions. +Database mismatches include issues such as condition mismatches and stricter +constraint mismatches, both of which are more prevalent in real-world +scenarios. To address these challenges, we propose a tool-assisted agent +framework for SQL inspection and refinement, equipping the LLM-based agent with +two specialized tools: a retriever and a detector, designed to diagnose and +correct SQL queries with database mismatches. These tools enhance the +capability of LLMs to handle real-world queries more effectively. We also +introduce Spider-Mismatch, a new dataset specifically constructed to reflect +the condition mismatch problems encountered in real-world scenarios. +Experimental results demonstrate that our method achieves the highest +performance on the averaged results of the Spider and Spider-Realistic datasets +in few-shot settings, and it significantly outperforms baseline methods on the +more realistic dataset, Spider-Mismatch. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ MemLong: Memory-Augmented Retrieval for Long Text Modeling + + +
+ Recent advancements in Large Language Models (LLMs) have yielded remarkable +success across diverse fields. However, handling long contexts remains a +significant challenge for LLMs due to the quadratic time and space complexity +of attention mechanisms and the growing memory consumption of the key-value +cache during generation. This work introduces MemLong: Memory-Augmented +Retrieval for Long Text Generation, a method designed to enhance the +capabilities of long-context language modeling by utilizing an external +retriever for historical information retrieval. MemLong combines a +non-differentiable ``ret-mem'' module with a partially trainable decoder-only +language model and introduces a fine-grained, controllable retrieval attention +mechanism that leverages semantic-level relevant chunks. Comprehensive +evaluations on multiple long-context language modeling benchmarks demonstrate +that MemLong consistently outperforms other state-of-the-art LLMs. More +importantly, MemLong can extend the context length on a single 3090 GPU from 4k +up to 80k. Our code is available at https://github.com/Bui1dMySea/MemLong + +
+
+
+
+
+ + ☆ UserSumBench: A Benchmark Framework for Evaluating User Summarization + Approaches + + +
+ Large language models (LLMs) have shown remarkable capabilities in generating
+user summaries from a long list of raw user activity data. These summaries
+capture essential user information such as preferences and interests, and
+therefore are invaluable for LLM-based personalization applications, such as
+explainable recommender systems. However, the development of new summarization
+techniques is hindered by the lack of ground-truth labels, the inherent
+subjectivity of user summaries, and the often costly and time-consuming nature
+of human evaluation. To address these challenges, we introduce UserSumBench, a
+benchmark framework designed to facilitate iterative development of LLM-based
+summarization approaches. This framework offers two key components: (1) A
+reference-free summary quality metric. We show that this metric is effective
+and aligned with human preferences across three diverse datasets (MovieLens,
+Yelp and Amazon Review). (2) A novel robust summarization method that leverages
+a time-hierarchical summarizer and a self-critique verifier to produce
+high-quality summaries while eliminating hallucination. This method serves as a
+strong baseline for further innovation in summarization techniques.
+
+
+
+
+ + ♻ ☆ Evaluating Named Entity Recognition: A comparative analysis of mono- and + multilingual transformer models on a novel Brazilian corporate earnings call + transcripts dataset + + +
+ Since 2018, when the Transformer architecture was introduced, Natural +Language Processing has gained significant momentum with pre-trained +Transformer-based models that can be fine-tuned for various tasks. Most models +are pre-trained on large English corpora, making them less applicable to other +languages, such as Brazilian Portuguese. In our research, we identified two +models pre-trained in Brazilian Portuguese (BERTimbau and PTT5) and two +multilingual models (mBERT and mT5). BERTimbau and mBERT use only the Encoder +module, while PTT5 and mT5 use both the Encoder and Decoder. Our study aimed to +evaluate their performance on a financial Named Entity Recognition (NER) task +and determine the computational requirements for fine-tuning and inference. To +this end, we developed the Brazilian Financial NER (BraFiNER) dataset, +comprising sentences from Brazilian banks' earnings calls transcripts annotated +using a weakly supervised approach. Additionally, we introduced a novel +approach that reframes the token classification task as a text generation +problem. After fine-tuning the models, we evaluated them using performance and +error metrics. Our findings reveal that BERT-based models consistently +outperform T5-based models. While the multilingual models exhibit comparable +macro F1-scores, BERTimbau demonstrates superior performance over PTT5. In +terms of error metrics, BERTimbau outperforms the other models. We also +observed that PTT5 and mT5 generated sentences with changes in monetary and +percentage values, highlighting the importance of accuracy and consistency in +the financial domain. Our findings provide insights into the differing +performance of BERT- and T5-based models for the NER task. + +
+
+
+
+
+ + ♻ ☆ Exploring Group and Symmetry Principles in Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated impressive performance across +a wide range of applications; however, assessing their reasoning capabilities +remains a significant challenge. In this paper, we introduce a framework +grounded in group and symmetry principles, which have played a crucial role in +fields such as physics and mathematics, and offer another way to evaluate their +capabilities. While the proposed framework is general, to showcase the benefits +of employing these properties, we focus on arithmetic reasoning and investigate +the performance of these models on four group properties: closure, identity, +inverse, and associativity. Our findings reveal that LLMs studied in this work +struggle to preserve group properties across different test regimes. In the +closure test, we observe biases towards specific outputs and an abrupt +degradation in their performance from 100% to 0% after a specific sequence +length. They also perform poorly in the identity test, which represents adding +irrelevant information in the context, and show sensitivity when subjected to +inverse test, which examines the robustness of the model with respect to +negation. In addition, we demonstrate that breaking down problems into smaller +steps helps LLMs in the associativity test that we have conducted. To support +these tests we have developed a synthetic dataset which will be released. + +
+
+
+
+
+ + ♻ ☆ Hoaxpedia: A Unified Wikipedia Hoax Articles Dataset + + +
+ Hoaxes are a recognised form of disinformation created deliberately, with
+potentially serious implications for the credibility of reference knowledge
+resources such as Wikipedia. What makes detecting Wikipedia hoaxes hard is that
+they are often written according to the official style guidelines. In this
+work, we first provide a systematic analysis of similarities and discrepancies
+between legitimate and hoax Wikipedia articles, and introduce Hoaxpedia, a
+collection of 311 hoax articles (from existing literature and official
+Wikipedia lists), together with semantically similar legitimate articles, which
+together form a binary text classification dataset aimed at fostering research
+in automated hoax detection. We report results after analyzing several language
+models, hoax-to-legit ratios, and the amount of text the classifiers are
+exposed to (full article vs the article's definition alone). Our results
+suggest that detecting deceitful content in Wikipedia based on content alone is
+hard but feasible. We complement our analysis with a study of the differences
+in edit-history distributions, and find that this feature yields better
+classification results than article content alone.
+
+
+
+
+ + ♻ ☆ DualKanbaFormer: Kolmogorov-Arnold Networks and State Space Model + Transformer for Multimodal Aspect-based Sentiment Analysis + + +
+ Multimodal aspect-based sentiment analysis (MABSA) enhances sentiment
+detection by combining text with other data types like images. However, despite
+setting significant benchmarks, attention mechanisms exhibit limitations in
+efficiently modelling long-range dependencies between aspect and opinion
+targets within the text. They also face challenges in capturing global-context
+dependencies for visual representations. To this end, we propose
+Kolmogorov-Arnold Networks (KANs) and Selective State Space model (Mamba)
+transformer (DualKanbaFormer), a novel architecture to address the above
+issues. We leverage the power of Mamba to capture global context dependencies,
+Multi-head Attention (MHA) to capture local context dependencies, and KANs to
+capture non-linear modelling patterns for both textual representations (textual
+KanbaFormer) and visual representations (visual KanbaFormer). Furthermore, we
+fuse the textual KanbaFormer and visual KanbaFormer with a gated fusion layer
+to capture the inter-modality dynamics. According to extensive experimental
+results, our model outperforms some state-of-the-art (SOTA) studies on two
+public datasets.
+
+ comment: 10 pages, 2 figures, and 3 tables +
+
+
+
+
+ + ♻ ☆ Question-Based Retrieval using Atomic Units for Enterprise RAG + + +
+ Enterprise retrieval augmented generation (RAG) offers a highly flexible +framework for combining powerful large language models (LLMs) with internal, +possibly temporally changing, documents. In RAG, documents are first chunked. +Relevant chunks are then retrieved for a user query, which are passed as +context to a synthesizer LLM to generate the query response. However, the +retrieval step can limit performance, as incorrect chunks can lead the +synthesizer LLM to generate a false response. This work applies a zero-shot +adaptation of standard dense retrieval steps for more accurate chunk recall. +Specifically, a chunk is first decomposed into atomic statements. A set of +synthetic questions are then generated on these atoms (with the chunk as the +context). Dense retrieval involves finding the closest set of synthetic +questions, and associated chunks, to the user query. It is found that retrieval +with the atoms leads to higher recall than retrieval with chunks. Further +performance gain is observed with retrieval using the synthetic questions +generated over the atoms. Higher recall at the retrieval step enables higher +performance of the enterprise LLM using the RAG pipeline. + +
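+ The pipeline above (chunk to atomic statements, to synthetic questions, to dense retrieval) can be sketched as below; decompose, generate_questions, and embed are hypothetical callables standing in for LLM prompts and an embedding model, not this paper's code:
+ import numpy as np
+ 
+ def build_question_index(chunks, decompose, generate_questions, embed):
+     # Index synthetic questions generated over each chunk's atomic statements,
+     # keeping a pointer back to the source chunk.
+     index = []
+     for chunk in chunks:
+         for atom in decompose(chunk):
+             for question in generate_questions(atom, chunk):
+                 index.append((embed(question), chunk))
+     return index
+ 
+ def retrieve_chunks(query, index, embed, k=3):
+     query_vec = embed(query)
+     ranked = sorted(index, key=lambda item: -float(np.dot(query_vec, item[0])))
+     hits = []
+     for _, chunk in ranked:                     # deduplicate chunks, keep the top k
+         if chunk not in hits:
+             hits.append(chunk)
+         if len(hits) == k:
+             break
+     return hits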
+
+ comment: 14 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Beyond One-Size-Fits-All: Multi-Domain, Multi-Task Framework for + Embedding Model Selection + + +
+ This position paper proposes a systematic approach towards developing a +framework to help select the most effective embedding models for natural +language processing (NLP) tasks, addressing the challenge posed by the +proliferation of both proprietary and open-source encoder models. + +
+
+ comment: It was an initial idea - we plan to work on a detailed version +
+
+
+
+
+ + ♻ ☆ Docling Technical Report + + +
+ This technical report introduces Docling, an easy to use, self-contained, +MIT-licensed open-source package for PDF document conversion. It is powered by +state-of-the-art specialized AI models for layout analysis (DocLayNet) and +table structure recognition (TableFormer), and runs efficiently on commodity +hardware in a small resource budget. The code interface allows for easy +extensibility and addition of new features and models. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study of Retrieval Augmented Generation with + Chain-of-Thought SC + + +
+ Since the launch of ChatGPT at the end of 2022, generative dialogue models +represented by ChatGPT have quickly become essential tools in daily life. As +user expectations increase, enhancing the capability of generative dialogue +models to solve complex problems has become a focal point of current research. +This paper delves into the effectiveness of the RAFT (Retrieval Augmented +Fine-Tuning) method in improving the performance of Generative dialogue models. +RAFT combines chain-of-thought with model supervised fine-tuning (SFT) and +retrieval augmented generation (RAG), which significantly enhanced the model's +information extraction and logical reasoning abilities. We evaluated the RAFT +method across multiple datasets and analysed its performance in various +reasoning tasks, including long-form QA and short-form QA tasks, tasks in both +Chinese and English, and supportive and comparison reasoning tasks. Notably, it +addresses the gaps in previous research regarding long-form QA tasks and +Chinese datasets. Moreover, we also evaluate the benefit of the +chain-of-thought (CoT) in the RAFT method. This work offers valuable insights +for studies focused on enhancing the performance of generative dialogue models. + +
+
+ comment: Accepted by ISCSLP 2024 +
+
+
+
+
+ + ♻ ☆ Language models align with human judgments on key grammatical + constructions + + +
+ Do large language models (LLMs) make human-like linguistic generalizations? +Dentella et al. (2023) ("DGL") prompt several LLMs ("Is the following sentence +grammatically correct in English?") to elicit grammaticality judgments of 80 +English sentences, concluding that LLMs demonstrate a "yes-response bias" and a +"failure to distinguish grammatical from ungrammatical sentences". We +re-evaluate LLM performance using well-established practices and find that +DGL's data in fact provide evidence for just how well LLMs capture human +behaviors. Models not only achieve high accuracy overall, but also capture +fine-grained variation in human linguistic judgments. + +
+
+ comment: Published in PNAS at https://www.pnas.org/doi/10.1073/pnas.2400917121 + as response to Dentella et al. (2023) +
+
+
+
+
+ + ♻ ☆ Diversifying the Mixture-of-Experts Representation for Language Models + with Orthogonal Optimizer ECAI 2024 + + +
+ The Mixture of Experts (MoE) has emerged as a highly successful technique in
+deep learning, based on the principle of divide-and-conquer to maximize model
+capacity without significant additional computational cost. Even in the era of
+large-scale language models (LLMs), MoE continues to play a crucial role, as
+some researchers have indicated that GPT-4 adopts the MoE structure to ensure
+diverse inference results. However, MoE is susceptible to performance
+degeneracy, particularly evident in the issues of imbalance and homogeneous
+representation among experts. While previous studies have extensively addressed
+the problem of imbalance, the challenge of homogeneous representation remains
+unresolved. In this study, we shed light on the homogeneous representation
+problem, wherein experts in the MoE fail to specialize and lack diversity,
+leading to frustratingly high similarities in their representations (up to 99\%
+in a well-performing MoE model). This problem restricts the expressive power of
+the MoE and, we argue, contradicts its original intention. To tackle this
+issue, we propose a straightforward yet highly effective solution: OMoE, an
+orthogonal expert optimizer. Additionally, we introduce an alternating training
+strategy that encourages each expert to update in a direction orthogonal to the
+subspace spanned by other experts. Our algorithm facilitates MoE training in
+two key ways: firstly, it explicitly enhances representation diversity, and
+secondly, it implicitly fosters interaction between experts during orthogonal
+weights computation. Through extensive experiments, we demonstrate that our
+proposed optimization algorithm significantly improves the performance of
+fine-tuning the MoE model on the GLUE benchmark, SuperGLUE benchmark,
+question-answering task, and named entity recognition tasks.
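+ The goal of reducing similarity between expert representations can be illustrated with a simple pairwise-similarity penalty; this is only a generic diversity regulariser under assumed tensor shapes, not the paper's alternating orthogonal optimizer:
+ import torch
+ import torch.nn.functional as F
+ 
+ def expert_similarity_penalty(expert_reps):
+     # expert_reps: (num_experts, hidden) mean representation produced by each expert.
+     z = F.normalize(expert_reps, dim=-1)
+     sim = z @ z.t()                                   # pairwise cosine similarities
+     off_diag = sim - torch.diag(torch.diag(sim))      # drop self-similarity
+     n = z.shape[0]
+     return (off_diag ** 2).sum() / (n * (n - 1))
+ 
+ reps = torch.randn(8, 64)
+ print(expert_similarity_penalty(reps))   # add this, suitably weighted, to the task loss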
+
+ comment: ECAI 2024 +
+
+
+
+
+ + ♻ ☆ EUvsDisinfo: A Dataset for Multilingual Detection of Pro-Kremlin + Disinformation in News Articles CIKM 2024 + + +
+ This work introduces EUvsDisinfo, a multilingual dataset of disinformation +articles originating from pro-Kremlin outlets, along with trustworthy articles +from credible / less biased sources. It is sourced directly from the debunk +articles written by experts leading the EUvsDisinfo project. Our dataset is the +largest to-date resource in terms of the overall number of articles and +distinct languages. It also provides the largest topical and temporal coverage. +Using this dataset, we investigate the dissemination of pro-Kremlin +disinformation across different languages, uncovering language-specific +patterns targeting certain disinformation topics. We further analyse the +evolution of topic distribution over an eight-year period, noting a significant +surge in disinformation content before the full-scale invasion of Ukraine in +2022. Lastly, we demonstrate the dataset's applicability in training models to +effectively distinguish between disinformation and trustworthy content in +multilingual settings. + +
+
+ comment: Published at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Jailbreak Attacks and Defenses Against Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have performed exceptionally in various +text-generative tasks, including question answering, translation, code +completion, etc. However, the over-assistance of LLMs has raised the challenge +of "jailbreaking", which induces the model to generate malicious responses +against the usage policy and society by designing adversarial prompts. With the +emergence of jailbreak attack methods exploiting different vulnerabilities in +LLMs, the corresponding safety alignment measures are also evolving. In this +paper, we propose a comprehensive and detailed taxonomy of jailbreak attack and +defense methods. For instance, the attack methods are divided into black-box +and white-box attacks based on the transparency of the target model. Meanwhile, +we classify defense methods into prompt-level and model-level defenses. +Additionally, we further subdivide these attack and defense methods into +distinct sub-classes and present a coherent diagram illustrating their +relationships. We also conduct an investigation into the current evaluation +methods and compare them from different perspectives. Our findings aim to +inspire future research and practical implementations in safeguarding LLMs +against adversarial attacks. Above all, although jailbreak remains a +significant concern within the community, we believe that our work enhances the +understanding of this domain and provides a foundation for developing more +secure LLMs. + +
+
+
+
+
+ + ♻ ☆ Expert-Token Resonance: Redefining MoE Routing through Affinity-Driven + Active Selection + + +
+ Mixture-of-Experts (MoE) architectures have emerged as a paradigm-shifting approach for large language models (LLMs), offering unprecedented computational efficiency. However, these architectures grapple with challenges of token distribution imbalance and expert homogenization, impeding optimal semantic generalization. We introduce a novel framework that redefines MoE routing through affinity-driven active selection. The innovations of the framework encompass: (1) a rigorous formulation of expert-token affinity metrics; (2) an adaptive bidirectional selection mechanism leveraging resonance between experts and tokens; (3) theoretical derivation and experimental evidence of reduced expert capacity bounds under dynamic token distribution evolution. The framework is also integrated with an orthogonal feature extraction module and an optimized loss function for expert localization. Our theoretical analysis demonstrates that this approach mitigates expert homogenization while enabling substantial capacity boundary reduction. Experimental validation corroborates these findings: it achieves a 40% reduction in the number of tokens processed by each expert without compromising model convergence or efficacy. When coupled with communication optimizations, training efficiency improvements of 5.4% to 46.6% can be observed. After supervised fine-tuning, it exhibits performance gains of 9.7% to 14.1% across GDAD, C-Eval, and TeleQnA benchmarks.
+
+
+
+
+ + ♻ ☆ TaSL: Task Skill Localization and Consolidation for Language Model + Continual Learning ACL 2024 + + +
+ Language model continual learning (CL) has recently attracted significant +interest for its ability to adapt large language models (LLMs) to dynamic +real-world scenarios without retraining. A major challenge in this domain is +catastrophic forgetting, where models lose previously acquired knowledge upon +learning new tasks. Existing approaches commonly utilize multiple +parameter-efficient fine-tuning (PEFT) blocks to acquire task-specific +knowledge, yet these methods are inefficient and fail to leverage potential +knowledge transfer across tasks. In this paper, we introduce a novel CL +framework for language models, named Task Skill Localization and Consolidation +(TaSL), which boosts knowledge transfer without depending on memory replay. +TaSL initially segregates the model into 'skill units' based on parameter +dependencies, allowing for more precise control. Subsequently, it employs a +novel group-wise skill localization technique to ascertain the importance +distribution of skill units for a new task. By comparing this importance +distribution with those from previous tasks, we implement a fine-grained skill +consolidation strategy that retains task-specific knowledge, thereby preventing +forgetting, and updates task-shared knowledge, which facilitates bi-directional +knowledge transfer. As a result, TaSL achieves an optimal balance between +retaining prior knowledge and excelling in new tasks. TaSL also demonstrates +strong generalizability, making it suitable for various base models and +adaptable to PEFT methods like LoRA. Furthermore, it offers notable +extensibility, supporting enhancements through integration with memory replay +techniques. Comprehensive experiments conducted on two CL benchmarks, involving +models ranging from 220M to 7B parameters, affirm the effectiveness of TaSL and +its variants across different settings. + +
+
+ comment: Extension of ACL 2024 paper titled: Continual Dialog State Tracking + via Task Skill Localization and Consolidation +
+
+
+
+
+ + ♻ ☆ ConCodeEval: Evaluating Large Language Models for Code Constraints in + Domain-Specific Languages + + +
+ Recent work shows Large Language Models (LLMs) struggle to understand natural language constraints for various text generation tasks in zero- and few-shot settings. In the code domain, however, constraints are widely expressed in code format to maintain the integrity of code written in Domain-Specific Languages (DSLs) like JSON and YAML, which are widely used for system-level programming tasks in enterprises. Given that LLMs are increasingly used for system-level code tasks, evaluating whether they can comprehend these code constraints is crucial. However, no work has been done to evaluate their controllability over code constraints. Hence, we introduce ConCodeEval, a first-of-its-kind benchmark with two novel tasks for code constraints across five representations. Our findings suggest that language models struggle with code constraints. Code languages that perform excellently for normal code tasks do not perform well when the same languages represent fine-grained constraints.
+
+
+
+
+ + ♻ ☆ Contextualized Automatic Speech Recognition with Dynamic Vocabulary + + +
+ Deep biasing (DB) enhances the performance of end-to-end automatic speech recognition (E2E-ASR) models for rare words or contextual phrases using a bias list. However, most existing methods treat bias phrases as sequences of subwords in a predefined static vocabulary. This naive sequence decomposition produces unnatural token patterns, significantly lowering their occurrence probability. More advanced techniques address this problem by expanding the vocabulary with additional modules, such as external language model shallow fusion or rescoring; however, these additional modules increase the workload. This paper proposes a dynamic vocabulary where bias tokens can be added during inference. Each entry in a bias list is represented as a single token, rather than as a sequence of existing subword tokens. This approach eliminates the need to learn subword dependencies within the bias phrases. The method is easily applied to various architectures because it only expands the embedding and output layers in common E2E-ASR architectures. Experimental results demonstrate that the proposed method improves the bias phrase WER on English and Japanese datasets by 3.1 -- 4.9 points compared with the conventional DB method.
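+ The core mechanism described above, expanding only the embedding and output layers with phrase-level bias tokens, can be sketched as follows. The random initialisation of the new rows and the function and variable names are assumptions of this sketch, not the paper's implementation (which would derive the new rows from the bias-phrase text).

```python
import torch
import torch.nn as nn

def expand_for_bias_phrases(embedding: nn.Embedding, output: nn.Linear,
                            num_bias_tokens: int):
    """Append rows for dynamic bias tokens to the embedding and output layers."""
    dim = embedding.embedding_dim
    new_emb = nn.Embedding(embedding.num_embeddings + num_bias_tokens, dim)
    new_emb.weight.data[: embedding.num_embeddings] = embedding.weight.data

    new_out = nn.Linear(dim, output.out_features + num_bias_tokens, bias=False)
    new_out.weight.data[: output.out_features] = output.weight.data
    return new_emb, new_out

# Toy usage: a 1000-token static vocabulary extended with 5 bias phrases.
emb, out = nn.Embedding(1000, 64), nn.Linear(64, 1000, bias=False)
emb2, out2 = expand_for_bias_phrases(emb, out, num_bias_tokens=5)
print(emb2.num_embeddings, out2.out_features)   # 1005 1005
```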
+
+
+
+
+ + ♻ ☆ Causal-Guided Active Learning for Debiasing Large Language Models ACL 2024 + + +
+ Despite their promising performance, recent analyses show that current generative large language models (LLMs) may still capture dataset biases and utilize them for generation, leading to poor generalizability and harmfulness of LLMs. However, due to the diversity of dataset biases and the over-optimization problem, previous prior-knowledge-based debiasing methods and fine-tuning-based debiasing methods may not be suitable for current LLMs. To address this issue, we explore combining active learning with causal mechanisms and propose a causal-guided active learning (CAL) framework, which utilizes LLMs themselves to automatically and autonomously identify informative biased samples and induce the bias patterns. Then a cost-effective and efficient in-context-learning-based method is employed to prevent LLMs from utilizing dataset biases during generation. Experimental results show that CAL can effectively recognize typical biased instances and induce various bias patterns for debiasing LLMs.
+
+ comment: Accepted to the ACL 2024 main conference & awarded Outstanding Paper
+
+
+
+
+ + ♻ ☆ Towards Achieving Human Parity on End-to-end Simultaneous Speech + Translation via LLM Agent + + +
+ In this paper, we present Cross Language Agent -- Simultaneous +Interpretation, CLASI, a high-quality and human-like Simultaneous Speech +Translation (SiST) System. Inspired by professional human interpreters, we +utilize a novel data-driven read-write strategy to balance the translation +quality and latency. To address the challenge of translating in-domain +terminologies, CLASI employs a multi-modal retrieving module to obtain relevant +information to augment the translation. Supported by LLMs, our approach can +generate error-tolerated translation by considering the input audio, historical +context, and retrieved information. Experimental results show that our system +outperforms other systems by significant margins. Aligned with professional +human interpreters, we evaluate CLASI with a better human evaluation metric, +valid information proportion (VIP), which measures the amount of information +that can be successfully conveyed to the listeners. In the real-world +scenarios, where the speeches are often disfluent, informal, and unclear, CLASI +achieves VIP of 81.3% and 78.0% for Chinese-to-English and English-to-Chinese +translation directions, respectively. In contrast, state-of-the-art commercial +or open-source systems only achieve 35.4% and 41.6%. On the extremely hard +dataset, where other systems achieve under 13% VIP, CLASI can still achieve 70% +VIP. + +
+
+ comment: Authors are listed in alphabetical order by last name. Demonstrations + and human-annotated test sets are available at + https://byteresearchcla.github.io/clasi +
+
+
+
+
+ + ♻ ☆ SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding + + +
+ Scientific literature understanding is crucial for extracting targeted information and garnering insights, thereby significantly advancing scientific discovery. Despite the remarkable success of Large Language Models (LLMs), they face challenges in scientific literature understanding, primarily due to (1) a lack of scientific knowledge and (2) unfamiliarity with specialized scientific tasks. To develop an LLM specialized in scientific literature understanding, we propose a hybrid strategy that integrates continual pre-training (CPT) and supervised fine-tuning (SFT), to simultaneously infuse scientific domain knowledge and enhance instruction-following capabilities for domain-specific tasks. In this process, we identify two key challenges: (1) constructing high-quality CPT corpora, and (2) generating diverse SFT instructions. We address these challenges through a meticulous pipeline, including PDF text extraction, parsing content error correction, quality filtering, and synthetic instruction creation. Applying this strategy, we present a suite of LLMs: SciLitLLM, specialized in scientific literature understanding. These models demonstrate promising performance on scientific literature understanding benchmarks. Our contributions are threefold: (1) We present an effective framework that integrates CPT and SFT to adapt LLMs to scientific literature understanding, which can also be easily adapted to other domains. (2) We propose an LLM-based synthesis method to generate diverse and high-quality scientific instructions, resulting in a new instruction set -- SciLitIns -- for supervised fine-tuning in less-represented scientific domains. (3) SciLitLLM achieves promising performance improvements on scientific literature understanding benchmarks.
+
+
+
+
+ + ♻ ☆ AgentsCourt: Building Judicial Decision-Making Agents with Court Debate + Simulation and Legal Knowledge Augmentation ACL + + +
+ With the development of deep learning, natural language processing technology has effectively improved the efficiency of various aspects of the traditional judicial industry. However, most current efforts focus on tasks within individual judicial stages, making it difficult to handle complex tasks that span multiple stages. Autonomous agents powered by large language models are becoming increasingly smart and able to make complex decisions in real-world settings, offering new insights for judicial intelligence. In this paper, (1) we propose a novel multi-agent framework, AgentsCourt, for judicial decision-making. Our framework follows the classic court trial process, consisting of court debate simulation, legal resource retrieval and decision-making refinement to simulate the decision-making of a judge. (2) We introduce SimuCourt, a judicial benchmark that encompasses 420 Chinese judgment documents, spanning the three most common types of judicial cases. Furthermore, to support this task, we construct a large-scale legal knowledge base, Legal-KB, with multi-resource legal knowledge. (3) Extensive experiments show that our framework outperforms the existing advanced methods in various aspects, especially in generating legal articles, where our model achieves significant improvements of 8.6% and 9.1% F1 score in the first and second instance settings, respectively.
+
+ comment: This paper was first submitted to ACL ARR 2024 April (Under review) +
+
+
+
+
+ + ♻ ☆ Does CLIP Bind Concepts? Probing Compositionality in Large Image Models + + +
+ Large-scale neural network models combining text and images have made +incredible progress in recent years. However, it remains an open question to +what extent such models encode compositional representations of the concepts +over which they operate, such as correctly identifying "red cube" by reasoning +over the constituents "red" and "cube". In this work, we focus on the ability +of a large pretrained vision and language model (CLIP) to encode compositional +concepts and to bind variables in a structure-sensitive way (e.g., +differentiating "cube behind sphere" from "sphere behind cube"). To inspect the +performance of CLIP, we compare several architectures from research on +compositional distributional semantics models (CDSMs), a line of research that +attempts to implement traditional compositional linguistic structures within +embedding spaces. We benchmark them on three synthetic datasets - +single-object, two-object, and relational - designed to test concept binding. +We find that CLIP can compose concepts in a single-object setting, but in +situations where concept binding is needed, performance drops dramatically. At +the same time, CDSMs also perform poorly, with best performance at chance +level. + +
+
+ comment: Lewis and Nayak contributed equally +
+
+
+
+
+ + ♻ ☆ Measuring Dimensions of Self-Presentation in Twitter Bios and their + Links to Misinformation Sharing + + +
+ Social media platforms provide users with a profile description field, commonly known as a "bio," where they can present themselves to the world. A growing literature shows that text in these bios can improve our understanding of online self-presentation and behavior, but existing work relies exclusively on keyword-based approaches to do so. Here, we propose and evaluate a suite of simple, effective, and theoretically motivated approaches to embed bios in spaces that capture salient dimensions of social meaning, such as age and partisanship. We evaluate our methods on four tasks, showing that the strongest one outperforms several practical baselines. We then show the utility of our method in helping understand associations between self-presentation and the sharing of URLs from low-quality news sites on Twitter, with a particular focus on exploring the interactions between age and partisanship and on the effects of self-presentations of religiosity. Our work provides new tools to help computational social scientists make use of information in bios, and provides new insights into how misinformation sharing may be perceived on Twitter.
+
+
+
+
+ + ♻ ☆ Token-level Direct Preference Optimization + + +
+ Fine-tuning pre-trained Large Language Models (LLMs) is essential to align them with human values and intentions. This process often utilizes methods like pairwise comparisons and KL divergence against a reference LLM, focusing on the evaluation of full answers generated by the models. However, the generation of these responses occurs at the token level, in a sequential, auto-regressive fashion. In this paper, we introduce Token-level Direct Preference Optimization (TDPO), a novel approach to align LLMs with human preferences by optimizing the policy at the token level. Unlike previous methods, which face challenges in divergence efficiency, TDPO incorporates forward KL divergence constraints for each token, improving alignment and diversity. Utilizing the Bradley-Terry model for a token-based reward system, TDPO enhances the regulation of KL divergence, while preserving simplicity without the need for explicit reward modeling. Experimental results across various text tasks demonstrate TDPO's superior performance in balancing alignment with generation diversity. Notably, fine-tuning with TDPO strikes a better balance than DPO on the controlled sentiment generation and single-turn dialogue datasets, and significantly improves the quality of generated responses compared to both DPO and PPO-based RLHF methods. Our code is open-sourced at https://github.com/Vance0124/Token-level-Direct-Preference-Optimization.
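+ A schematic of a token-level preference objective in the spirit described above (a DPO-style margin combined with a per-token forward KL penalty) is sketched below. The weighting, the placement of the KL term, and the omission of padding masks are assumptions of this sketch; it is not the authors' released implementation.

```python
import torch
import torch.nn.functional as F

def token_level_dpo_loss(pol_logps_w, pol_logps_l, ref_logps_w, ref_logps_l,
                         pol_logits_l, ref_logits_l, beta=0.1, alpha=0.5):
    """Schematic token-level preference loss (padding/masking omitted).

    *_logps_{w,l}: (B, T) per-token log-probs of the chosen/rejected responses
    under the policy (pol) and a frozen reference (ref).
    *_logits_l:    (B, T, V) full-vocabulary logits on the rejected response,
    used for a per-token forward KL(ref || policy) penalty.
    """
    # Sequence-level implicit reward margin, as in standard DPO.
    margin = (pol_logps_w.sum(-1) - ref_logps_w.sum(-1)) \
           - (pol_logps_l.sum(-1) - ref_logps_l.sum(-1))

    # Per-token forward KL between reference and policy distributions.
    ref_probs = F.softmax(ref_logits_l, dim=-1)
    kl = (ref_probs * (F.log_softmax(ref_logits_l, dim=-1)
                       - F.log_softmax(pol_logits_l, dim=-1))).sum(-1)  # (B, T)

    return (-F.logsigmoid(beta * (margin - alpha * kl.mean(-1)))).mean()
```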
+
+
+
+
+ + ♻ ☆ Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming + + +
+ Recent advances in language models have achieved significant progress. +GPT-4o, as a new milestone, has enabled real-time conversations with humans, +demonstrating near-human natural fluency. Such human-computer interaction +necessitates models with the capability to perform reasoning directly with the +audio modality and generate output in streaming. However, this remains beyond +the reach of current academic models, as they typically depend on extra TTS +systems for speech synthesis, resulting in undesirable latency. This paper +introduces the Mini-Omni, an audio-based end-to-end conversational model, +capable of real-time speech interaction. To achieve this capability, we propose +a text-instructed speech generation method, along with batch-parallel +strategies during inference to further boost the performance. Our method also +helps to retain the original model's language capabilities with minimal +degradation, enabling other works to establish real-time interaction +capabilities. We call this training method "Any Model Can Talk". We also +introduce the VoiceAssistant-400K dataset to fine-tune models optimized for +speech output. To our best knowledge, Mini-Omni is the first fully end-to-end, +open-source model for real-time speech interaction, offering valuable potential +for future research. + +
+
+ comment: Technical report, work in progress. Demo and code: + https://github.com/gpt-omni/mini-omni +
+
+
+
+
+ + ♻ ☆ Advancing Chinese biomedical text mining with community challenges + + +
+ Objective: This study aims to review the recent advances in community challenges for biomedical text mining in China. Methods: We collected information on evaluation tasks released in community challenges of biomedical text mining, including task description, dataset description, data source, task type and related links. A systematic summary and comparative analysis were conducted on various biomedical natural language processing tasks, such as named entity recognition, entity normalization, attribute extraction, relation extraction, event extraction, text classification, text similarity, knowledge graph construction, question answering, text generation, and large language model evaluation. Results: We identified 39 evaluation tasks from 6 community challenges that spanned from 2017 to 2023. Our analysis revealed the diverse range of evaluation task types and data sources in biomedical text mining. We explored the potential clinical applications of these community challenge tasks from a translational biomedical informatics perspective. We compared them with their English counterparts, and discussed the contributions, limitations, lessons and guidelines of these community challenges, while highlighting future directions in the era of large language models. Conclusion: Community challenge evaluation competitions have played a crucial role in promoting technology innovation and fostering interdisciplinary collaboration in the field of biomedical text mining. These challenges provide valuable platforms for researchers to develop state-of-the-art solutions.
+
+
+
+
+ + ♻ ☆ Etalon: Holistic Performance Evaluation Framework for LLM Inference + Systems + + +
+ Serving large language models (LLMs) in production can incur substantial +costs, which has prompted recent advances in inference system optimizations. +Today, these systems are evaluated against conventional latency and throughput +metrics (eg. TTFT, TBT, Normalised Latency and TPOT). However, these metrics +fail to fully capture the nuances of LLM inference, leading to an incomplete +assessment of user-facing performance crucial for real-time applications such +as chat and translation. In this paper, we first identify the pitfalls of +current performance metrics in evaluating LLM inference systems. We then +propose Etalon, a comprehensive performance evaluation framework that includes +fluidity-index -- a novel metric designed to reflect the intricacies of the LLM +inference process and its impact on real-time user experience. Finally, we +evaluate various existing open-source platforms and model-as-a-service +offerings using Etalon, discussing their strengths and weaknesses. Etalon is +available at https://github.com/project-etalon/etalon. + +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised 3D Visual Grounding based on Visual Linguistic + Alignment + + +
+ Learning to ground natural language queries to target objects or regions in +3D point clouds is quite essential for 3D scene understanding. Nevertheless, +existing 3D visual grounding approaches require a substantial number of +bounding box annotations for text queries, which is time-consuming and +labor-intensive to obtain. In this paper, we propose 3D-VLA, a weakly +supervised approach for 3D visual grounding based on Visual Linguistic +Alignment. Our 3D-VLA exploits the superior ability of current large-scale +vision-language models (VLMs) on aligning the semantics between texts and 2D +images, as well as the naturally existing correspondences between 2D images and +3D point clouds, and thus implicitly constructs correspondences between texts +and 3D point clouds with no need for fine-grained box annotations in the +training procedure. During the inference stage, the learned text-3D +correspondence will help us ground the text queries to the 3D target objects +even without 2D images. To the best of our knowledge, this is the first work to +investigate 3D visual grounding in a weakly supervised manner by involving +large scale vision-language models, and extensive experiments on ReferIt3D and +ScanRefer datasets demonstrate that our 3D-VLA achieves comparable and even +superior results over the fully supervised methods. + +
+
+
+
+
+ + ♻ ☆ Improving Relational Database Interactions with Large Language Models: + Column Descriptions and Their Impact on Text-to-SQL Performance + + +
+ Relational databases often suffer from uninformative descriptors of table +contents, such as ambiguous columns and hard-to-interpret values, impacting +both human users and Text-to-SQL models. This paper explores the use of large +language models (LLMs) to generate informative column descriptions as a +semantic layer for relational databases. Using the BIRD-Bench development set, +we created ColSQL, a dataset with gold-standard column descriptions generated +and refined by LLMs and human annotators. We evaluated several +instruction-tuned models, finding that GPT-4o and Command R+ excelled in +generating high-quality descriptions. Additionally, we applied an +LLM-as-a-judge to evaluate model performance. Although this method does not +align well with human evaluations, we included it to explore its potential and +to identify areas for improvement. More work is needed to improve the +reliability of automatic evaluations for this task. We also find that detailed +column descriptions significantly improve Text-to-SQL execution accuracy, +especially when columns are uninformative. This study establishes LLMs as +effective tools for generating detailed metadata, enhancing the usability of +relational databases. + +
+
+
+
+
+ + ♻ ☆ Rasa: Building Expressive Speech Synthesis Systems for Indian Languages + in Low-resource Settings INTERSPEECH 2024 + + +
+ We release Rasa, the first multilingual expressive TTS dataset for any Indian +language, which contains 10 hours of neutral speech and 1-3 hours of expressive +speech for each of the 6 Ekman emotions covering 3 languages: Assamese, +Bengali, & Tamil. Our ablation studies reveal that just 1 hour of neutral and +30 minutes of expressive data can yield a Fair system as indicated by MUSHRA +scores. Increasing neutral data to 10 hours, with minimal expressive data, +significantly enhances expressiveness. This offers a practical recipe for +resource-constrained languages, prioritizing easily obtainable neutral data +alongside smaller amounts of expressive data. We show the importance of +syllabically balanced data and pooling emotions to enhance expressiveness. We +also highlight challenges in generating specific emotions, e.g., fear and +surprise. + +
+
+ comment: Accepted at INTERSPEECH 2024. First two authors listed contributed + equally +
+
+
+
+
+ + ♻ ☆ Phi-3 Technical Report: A Highly Capable Language Model Locally on Your + Phone + + +
+ We introduce phi-3-mini, a 3.8 billion parameter language model trained on 3.3 trillion tokens, whose overall performance, as measured by both academic benchmarks and internal testing, rivals that of models such as Mixtral 8x7B and GPT-3.5 (e.g., phi-3-mini achieves 69% on MMLU and 8.38 on MT-bench), despite being small enough to be deployed on a phone. Our training dataset is a scaled-up version of the one used for phi-2, composed of heavily filtered publicly available web data and synthetic data. The model is also further aligned for robustness, safety, and chat format. We also provide parameter-scaling results with 7B and 14B models trained for 4.8T tokens, called phi-3-small and phi-3-medium, both significantly more capable than phi-3-mini (e.g., respectively 75%, 78% on MMLU, and 8.7, 8.9 on MT-bench). To enhance multilingual, multimodal, and long-context capabilities, we introduce three models in the phi-3.5 series: phi-3.5-mini, phi-3.5-MoE, and phi-3.5-Vision. The phi-3.5-MoE, a 16 x 3.8B MoE model with 6.6 billion active parameters, achieves superior performance in language reasoning, math, and code tasks compared to other open-source models of similar scale, such as Llama 3.1 and the Mixtral series, and on par with Gemini-1.5-Flash and GPT-4o-mini. Meanwhile, phi-3.5-Vision, a 4.2 billion parameter model derived from phi-3.5-mini, excels in reasoning tasks and is adept at handling both single-image and text prompts, as well as multi-image and text prompts.
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ The optimal placement of the head in the noun phrase. The case of + demonstrative, numeral, adjective and noun + + +
+ The word order of a sentence is shaped by multiple principles. The principle of syntactic dependency distance minimization is in conflict with the principle of surprisal minimization (or predictability maximization) in single-head syntactic dependency structures: while the former predicts that the head should be placed at the center of the linear arrangement, the latter predicts that the head should be placed at one of the ends (either first or last). A critical question is when surprisal minimization (or predictability maximization) should surpass syntactic dependency distance minimization. In the context of single-head structures, it has been predicted that this is more likely to happen when two conditions are met, i.e. (a) fewer words are involved and (b) words are shorter. Here we test the prediction on the noun phrase when it is composed of a demonstrative, a numeral, an adjective and a noun. We find that, across preferred orders in languages, the noun tends to be placed at one of the ends, confirming the theoretical prediction. We also show evidence of anti-locality effects: syntactic dependency distances in preferred orders are longer than expected by chance.
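+ The tension between the two principles can be made concrete with a toy calculation: in a four-word noun phrase with a single head, placing the head centrally minimises the summed head-dependent distances, while placing it at an edge (the position favoured by surprisal minimisation) increases them. The ordering labels below are illustrative, not data from the paper.

```python
def total_dependency_distance(order, head):
    """Sum of linear distances from the head to every other word."""
    h = order.index(head)
    return sum(abs(h - i) for i, w in enumerate(order) if w != head)

# A noun phrase with head N and dependents Dem, Num, Adj (single-head structure).
centre = ["Dem", "N", "Num", "Adj"]      # head near the centre
edge   = ["Dem", "Num", "Adj", "N"]      # head at one end

print(total_dependency_distance(centre, "N"))  # 1 + 1 + 2 = 4
print(total_dependency_distance(edge, "N"))    # 3 + 2 + 1 = 6
```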
+
+ comment: typos corrected +
+
+
+
+
+ + ♻ ☆ Training Language Models to Generate Text with Citations via + Fine-grained Rewards ACL 2024 + + +
+ While recent Large Language Models (LLMs) have proven useful in answering +user queries, they are prone to hallucination, and their responses often lack +credibility due to missing references to reliable sources. An intuitive +solution to these issues would be to include in-text citations referring to +external documents as evidence. While previous works have directly prompted +LLMs to generate in-text citations, their performances are far from +satisfactory, especially when it comes to smaller LLMs. In this work, we +propose an effective training framework using fine-grained rewards to teach +LLMs to generate highly supportive and relevant citations, while ensuring the +correctness of their responses. We also conduct a systematic analysis of +applying these fine-grained rewards to common LLM training strategies, +demonstrating its advantage over conventional practices. We conduct extensive +experiments on Question Answering (QA) datasets taken from the ALCE benchmark +and validate the model's generalizability using EXPERTQA. On LLaMA-2-7B, the +incorporation of fine-grained rewards achieves the best performance among the +baselines, even surpassing that of GPT-3.5-turbo. + +
+
+ comment: Accepted by ACL 2024 +
+
+
+
+
+ + ♻ ☆ Conversation Disentanglement with Bi-Level Contrastive Learning EMNLP 2022 + + +
+ Conversation disentanglement aims to group utterances into detached sessions, which is a fundamental task in processing multi-party conversations. Existing methods have two main drawbacks. First, they overemphasize pairwise utterance relations but pay inadequate attention to utterance-to-context relation modeling. Second, a huge amount of human-annotated data is required for training, which is expensive to obtain in practice. To address these issues, we propose a general disentanglement model based on bi-level contrastive learning. It brings utterances in the same session closer while encouraging each utterance to be near its clustered session prototypes in the representation space. Unlike existing approaches, our disentanglement model works in both the supervised setting with labeled data and the unsupervised setting when no such data is available. The proposed method achieves new state-of-the-art performance in both settings across several public datasets.
+
+ comment: Accepted by EMNLP 2022 Findings +
+
+
+
+
+ + ♻ ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in information retrieval. ColBERT's late interaction scoring approximates the joint query-document attention seen in cross-encoders while maintaining inference efficiency closer to traditional dense retrieval models, thanks to its bi-encoder architecture and recent optimizations in indexing and search. In this paper, we introduce a novel architecture and a training framework to support long context windows and multilingual retrieval. Leveraging Matryoshka Representation Loss, we further demonstrate that reducing the embedding dimensionality from 128 to 64 has an insignificant impact on the model's retrieval performance and cuts storage requirements by up to 50%. Our new model, Jina-ColBERT-v2, demonstrates strong performance across a range of English and multilingual retrieval tasks.
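+ The Matryoshka idea referenced above, training nested prefixes of the embedding so that truncated vectors (e.g. 64 of 128 dimensions) remain usable, can be sketched with a placeholder in-batch contrastive loss. The actual Jina-ColBERT-v2 objective operates on late-interaction (multi-vector) scores and is not reproduced here; dimensions and temperature are illustrative.

```python
import torch
import torch.nn.functional as F

def matryoshka_contrastive_loss(query, doc, dims=(64, 128), temperature=0.05):
    """In-batch contrastive loss averaged over nested embedding prefixes.

    query, doc: (B, D) embeddings; each prefix length in `dims` is trained so
    that truncated vectors still retrieve the matching document.
    """
    total = 0.0
    for d in dims:
        q = F.normalize(query[:, :d], dim=-1)
        k = F.normalize(doc[:, :d], dim=-1)
        logits = q @ k.T / temperature                   # (B, B) similarities
        labels = torch.arange(q.size(0), device=q.device)
        total = total + F.cross_entropy(logits, labels)
    return total / len(dims)

q, d = torch.randn(16, 128), torch.randn(16, 128)
print(matryoshka_contrastive_loss(q, d).item())
```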
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 110 + +
+
+
+ + ☆ Bridging Episodes and Semantics: A Novel Framework for Long-Form Video + Understanding ECCV'24 + + +
+ While existing research often treats long-form videos as extended short +videos, we propose a novel approach that more accurately reflects human +cognition. This paper introduces BREASE: BRidging Episodes And SEmantics for +Long-Form Video Understanding, a model that simulates episodic memory +accumulation to capture action sequences and reinforces them with semantic +knowledge dispersed throughout the video. Our work makes two key contributions: +First, we develop an Episodic COmpressor (ECO) that efficiently aggregates +crucial representations from micro to semi-macro levels. Second, we propose a +Semantics reTRiever (SeTR) that enhances these aggregated representations with +semantic information by focusing on the broader context, dramatically reducing +feature dimensionality while preserving relevant macro-level information. +Extensive experiments demonstrate that BREASE achieves state-of-the-art +performance across multiple long video understanding benchmarks in both +zero-shot and fully-supervised settings. The project page and code are at: +https://joslefaure.github.io/assets/html/hermes.html. + +
+
+ comment: Accepted to the EVAL-FoMo Workshop at ECCV'24. Project page: + https://joslefaure.github.io/assets/html/hermes.html +
+
+
+
+
+ + ☆ DARES: Depth Anything in Robotic Endoscopic Surgery with Self-supervised + Vector-LoRA of the Foundation Model + + +
+ Robotic-assisted surgery (RAS) relies on accurate depth estimation for 3D +reconstruction and visualization. While foundation models like Depth Anything +Models (DAM) show promise, directly applying them to surgery often yields +suboptimal results. Fully fine-tuning on limited surgical data can cause +overfitting and catastrophic forgetting, compromising model robustness and +generalization. Although Low-Rank Adaptation (LoRA) addresses some adaptation +issues, its uniform parameter distribution neglects the inherent feature +hierarchy, where earlier layers, learning more general features, require more +parameters than later ones. To tackle this issue, we introduce Depth Anything +in Robotic Endoscopic Surgery (DARES), a novel approach that employs a new +adaptation technique, Vector Low-Rank Adaptation (Vector-LoRA) on the DAM V2 to +perform self-supervised monocular depth estimation in RAS scenes. To enhance +learning efficiency, we introduce Vector-LoRA by integrating more parameters in +earlier layers and gradually decreasing parameters in later layers. We also +design a reprojection loss based on the multi-scale SSIM error to enhance depth +perception by better tailoring the foundation model to the specific +requirements of the surgical environment. The proposed method is validated on +the SCARED dataset and demonstrates superior performance over recent +state-of-the-art self-supervised monocular depth estimation techniques, +achieving an improvement of 13.3% in the absolute relative error metric. The +code and pre-trained weights are available at +https://github.com/mobarakol/DARES. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ CinePreGen: Camera Controllable Video Previsualization via + Engine-powered Diffusion + + +
+ With advancements in video generative AI models (e.g., SORA), creators are +increasingly using these techniques to enhance video previsualization. However, +they face challenges with incomplete and mismatched AI workflows. Existing +methods mainly rely on text descriptions and struggle with camera placement, a +key component of previsualization. To address these issues, we introduce +CinePreGen, a visual previsualization system enhanced with engine-powered +diffusion. It features a novel camera and storyboard interface that offers +dynamic control, from global to local camera adjustments. This is combined with +a user-friendly AI rendering workflow, which aims to achieve consistent results +through multi-masked IP-Adapter and engine simulation guidelines. In our +comprehensive evaluation study, we demonstrate that our system reduces +development viscosity (i.e., the complexity and challenges in the development +process), meets users' needs for extensive control and iteration in the design +process, and outperforms other AI video production workflows in cinematic +camera movement, as shown by our experiments and a within-subjects user study. +With its intuitive camera controls and realistic rendering of camera motion, +CinePreGen shows great potential for improving video production for both +individual creators and industry professionals. + +
+
+
+
+
+ + ☆ Open-vocabulary Temporal Action Localization using VLMs + + +
+ Video action localization aims to find timings of a specific action from a +long video. Although existing learning-based approaches have been successful, +those require annotating videos that come with a considerable labor cost. This +paper proposes a learning-free, open-vocabulary approach based on emerging +vision-language models (VLM). The challenge stems from the fact that VLMs are +neither designed to process long videos nor tailored for finding actions. We +overcome these problems by extending an iterative visual prompting technique. +Specifically, we sample video frames into a concatenated image with frame index +labels, making a VLM guess a frame that is considered to be closest to the +start/end of the action. Iterating this process by narrowing a sampling time +window results in finding a specific frame of start and end of an action. We +demonstrate that this sampling technique yields reasonable results, +illustrating a practical extension of VLMs for understanding videos. + +
+
+ comment: 7 pages, 5 figures, 4 tables. Last updated on August 30th, 2024 +
+
+
+
+
+ + ☆ Generative AI Enables Medical Image Segmentation in Ultra Low-Data + Regimes + + +
+ Semantic segmentation of medical images is pivotal in applications like disease diagnosis and treatment planning. While deep learning has excelled in automating this task, a major hurdle is the need for numerous annotated segmentation masks, which are resource-intensive to produce due to the required expertise and time. This scenario often leads to ultra low-data regimes, where annotated images are extremely limited, posing significant challenges for the generalization of conventional deep learning methods on test images. To address this, we introduce a generative deep learning framework, which uniquely generates high-quality paired segmentation masks and medical images, serving as auxiliary data for training robust models in data-scarce environments. Unlike traditional generative models that treat data generation and segmentation model training as separate processes, our method employs multi-level optimization for end-to-end data generation. This approach allows segmentation performance to directly influence the data generation process, ensuring that the generated data is specifically tailored to enhance the performance of the segmentation model. Our method demonstrated strong generalization performance across 9 diverse medical image segmentation tasks and on 16 datasets, in ultra low-data regimes, spanning various diseases, organs, and imaging modalities. When applied to various segmentation models, it achieved performance improvements of 10-20% (absolute), in both same-domain and out-of-domain scenarios. Notably, it requires 8 to 20 times less training data than existing methods to achieve comparable results. This advancement significantly improves the feasibility and cost-effectiveness of applying deep learning in medical imaging, particularly in scenarios with limited data availability.
+
+
+
+
+ + ☆ How Knowledge Distillation Mitigates the Synthetic Gap in Fair Face + Recognition ECCV 2024 + + +
+ Leveraging the capabilities of Knowledge Distillation (KD) strategies, we devise a strategy to fight the recent retraction of face recognition datasets. Given a pretrained Teacher model trained on a real dataset, we show that carefully utilising synthetic datasets, or a mix of real and synthetic datasets, to distil knowledge from this teacher to smaller students can yield surprising results. To this end, we trained 33 different models with and without KD, on different datasets, with different architectures and losses. Our findings are consistent: using KD leads to performance gains across all ethnicities and decreased bias. In addition, it helps to mitigate the performance gap between real and synthetic datasets. This approach addresses the limitations of synthetic data training, improving both the accuracy and fairness of face recognition models.
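+ For readers unfamiliar with KD, a minimal sketch of classic logit distillation (softened teacher-student KL plus hard-label cross-entropy) is shown below. Face recognition systems often distil embeddings rather than logits, and the losses actually used for the 33 models are the ones described in the paper, not this generic one; temperature and weighting here are illustrative.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
    """Classic logit distillation: softened KL to the teacher plus hard-label CE."""
    soft = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
                    F.softmax(teacher_logits / T, dim=-1),
                    reduction="batchmean") * (T * T)
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard

# Toy usage with random logits for an 8-sample, 100-class batch.
s, t = torch.randn(8, 100), torch.randn(8, 100)
y = torch.randint(0, 100, (8,))
print(distillation_loss(s, t, y).item())
```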
+
+ comment: Accepted at ECCV 2024 Workshops +
+
+
+
+
+ + ☆ Look, Learn and Leverage (L$^3$): Mitigating Visual-Domain Shift and + Discovering Intrinsic Relations via Symbolic Alignment + + +
+ Modern deep learning models have demonstrated outstanding performance on +discovering the underlying mechanisms when both visual appearance and intrinsic +relations (e.g., causal structure) data are sufficient, such as Disentangled +Representation Learning (DRL), Causal Representation Learning (CRL) and Visual +Question Answering (VQA) methods. However, generalization ability of these +models is challenged when the visual domain shifts and the relations data is +absent during finetuning. To address this challenge, we propose a novel +learning framework, Look, Learn and Leverage (L$^3$), which decomposes the +learning process into three distinct phases and systematically utilize the +class-agnostic segmentation masks as the common symbolic space to align visual +domains. Thus, a relations discovery model can be trained on the source domain, +and when the visual domain shifts and the intrinsic relations are absent, the +pretrained relations discovery model can be directly reused and maintain a +satisfactory performance. Extensive performance evaluations are conducted on +three different tasks: DRL, CRL and VQA, and show outstanding results on all +three tasks, which reveals the advantages of L$^3$. + +
+
+ comment: 17 pages, 9 figures, 6 tables +
+
+
+
+
+ + ☆ LSMS: Language-guided Scale-aware MedSegmentor for Medical Image + Referring Segmentation + + +
+ Conventional medical image segmentation methods have been found inadequate in facilitating physicians with the identification of specific lesions for diagnosis and treatment. Given the utility of text as an instructional format, we introduce a novel task termed Medical Image Referring Segmentation (MIRS), which requires segmenting specified lesions in images based on the given language expressions. Due to the varying object scales in medical images, MIRS demands robust vision-language modeling and comprehensive multi-scale interaction for precise localization and segmentation under linguistic guidance. However, existing medical image segmentation methods fall short in meeting these demands, resulting in insufficient segmentation accuracy. In response, we propose an approach named Language-guided Scale-aware MedSegmentor (LSMS), incorporating two appealing designs: (1) a Scale-aware Vision-Language Attention module that leverages diverse convolutional kernels to acquire rich visual knowledge and interact closely with linguistic features, thereby enhancing lesion localization capability; (2) a Full-Scale Decoder that globally models multi-modal features across various scales, capturing complementary information between scales to accurately outline lesion boundaries. Addressing the lack of suitable datasets for MIRS, we constructed a vision-language medical dataset called Reference Hepatic Lesion Segmentation (RefHL-Seg). This dataset comprises 2,283 abdominal CT slices from 231 cases, with corresponding textual annotations and segmentation masks for various liver lesions in images. We validated the performance of LSMS for MIRS and conventional medical image segmentation tasks across various datasets. Our LSMS consistently outperforms on all datasets with lower computational costs. The code and datasets will be released.
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Enhancing Underwater Imaging with 4-D Light Fields: Dataset and Method + + +
+ In this paper, we delve into the realm of 4-D light fields (LFs) to enhance +underwater imaging plagued by light absorption, scattering, and other +challenges. Contrasting with conventional 2-D RGB imaging, 4-D LF imaging +excels in capturing scenes from multiple perspectives, thereby indirectly +embedding geometric information. This intrinsic property is anticipated to +effectively address the challenges associated with underwater imaging. By +leveraging both explicit and implicit depth cues present in 4-D LF images, we +propose a progressive, mutually reinforcing framework for underwater 4-D LF +image enhancement and depth estimation. Specifically, our framework explicitly +utilizes estimated depth information alongside implicit depth-related dynamic +convolutional kernels to modulate output features. The entire framework +decomposes this complex task, iteratively optimizing the enhanced image and +depth information to progressively achieve optimal enhancement results. More +importantly, we construct the first 4-D LF-based underwater image dataset for +quantitative evaluation and supervised training of learning-based methods, +comprising 75 underwater scenes and 3675 high-resolution 2K pairs. To craft +vibrant and varied underwater scenes, we build underwater environments with +various objects and adopt several types of degradation. Through extensive +experimentation, we showcase the potential and superiority of 4-D LF-based +underwater imaging vis-a-vis traditional 2-D RGB-based approaches. Moreover, +our method effectively corrects color bias and achieves state-of-the-art +performance. The dataset and code will be publicly available at +https://github.com/linlos1234/LFUIE. + +
+
+ comment: 14 pages, 14 figures +
+
+
+
+
+ + ☆ Evaluating Reliability in Medical DNNs: A Critical Analysis of Feature + and Confidence-Based OOD Detection MICCAI 2023 + + +
+ Reliable use of deep neural networks (DNNs) for medical image analysis requires methods to identify inputs that differ significantly from the training data, called out-of-distribution (OOD), to prevent erroneous predictions. OOD detection methods can be categorised as either confidence-based (using the model's output layer for OOD detection) or feature-based (not using the output layer). We created two new OOD benchmarks by dividing the D7P (dermatology) and BreastMNIST (ultrasound) datasets into subsets which either contain or don't contain an artefact (rulers or annotations respectively). Models were trained with artefact-free images, and images with the artefacts were used as OOD test sets. For each OOD image, we created a counterfactual by manually removing the artefact via image processing, to assess the artefact's impact on the model's predictions. We show that OOD artefacts can boost a model's softmax confidence in its predictions, due to correlations in training data among other factors. This contradicts the common assumption that OOD artefacts should lead to more uncertain outputs, an assumption on which most confidence-based methods rely. We use this to explain why feature-based methods (e.g. Mahalanobis score) typically have greater OOD detection performance than confidence-based methods (e.g. MCP). However, we also show that feature-based methods typically perform worse at distinguishing between inputs that lead to correct and incorrect predictions (for both OOD and ID data). Following from these insights, we argue that a combination of feature-based and confidence-based methods should be used within DNN pipelines to mitigate their respective weaknesses. The project's code and OOD benchmarks are available at: https://github.com/HarryAnthony/Evaluating_OOD_detection.
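+ A minimal sketch of the two families of scores being contrasted, a confidence-based maximum softmax probability (MCP) and a feature-based Mahalanobis distance to class centroids, is given below with toy statistics. It is meant only to clarify the distinction; the class means, covariance, and feature dimension are made up and do not reproduce the paper's benchmark setup.

```python
import numpy as np

def mcp_score(logits):
    """Confidence-based score: maximum softmax probability (higher = more in-distribution)."""
    z = logits - logits.max(-1, keepdims=True)
    p = np.exp(z) / np.exp(z).sum(-1, keepdims=True)
    return p.max(-1)

def mahalanobis_score(feat, class_means, shared_cov):
    """Feature-based score: negative distance to the closest class centroid."""
    inv = np.linalg.pinv(shared_cov)
    dists = [(feat - m) @ inv @ (feat - m) for m in class_means]
    return -min(dists)

# Toy usage with random statistics (illustrative only).
rng = np.random.default_rng(0)
means, cov = rng.normal(size=(3, 16)), np.eye(16)
x_id = means[0] + 0.1 * rng.normal(size=16)        # near a class centroid
x_ood = 5.0 * rng.normal(size=16)                  # far from all centroids
print(mcp_score(np.array([2.0, 0.5, 0.1])))        # ~0.73
print(mahalanobis_score(x_id, means, cov) > mahalanobis_score(x_ood, means, cov))  # usually True
```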
+
+ comment: Accepted for the Uncertainty for Safe Utilization of Machine Learning + in Medical Imaging (UNSURE 2024) workshop at the MICCAI 2023 +
+
+
+
+
+ + ☆ Investigating Neuron Ablation in Attention Heads: The Case for Peak + Activation Centering + + +
+ The use of transformer-based models is growing rapidly throughout society. +With this growth, it is important to understand how they work, and in +particular, how the attention mechanisms represent concepts. Though there are +many interpretability methods, many look at models through their neuronal +activations, which are poorly understood. We describe different lenses through +which to view neuron activations, and investigate the effectiveness in language +models and vision transformers through various methods of neural ablation: zero +ablation, mean ablation, activation resampling, and a novel approach we term +'peak ablation'. Through experimental analysis, we find that in different +regimes and models, each method can offer the lowest degradation of model +performance compared to other methods, with resampling usually causing the most +significant performance deterioration. We make our code available at +https://github.com/nickypro/investigating-ablation. + +
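+ Zero and mean ablation of a single neuron can be sketched with a standard PyTorch forward hook, as below. The layer choice and dimensions are illustrative, and 'peak ablation' is the paper's own contribution, not reproduced here; computing a dataset mean for mean ablation is left to the caller.

```python
import torch
import torch.nn as nn

def ablate_neuron(module: nn.Module, index: int, value: float = 0.0):
    """Register a forward hook that overwrites one neuron's activation.

    value=0.0 gives zero ablation; passing the neuron's dataset mean gives
    mean ablation.
    """
    def hook(_module, _inputs, output):
        output = output.clone()
        output[..., index] = value
        return output
    return module.register_forward_hook(hook)

# Toy usage: ablate neuron 3 of a small linear layer.
layer = nn.Linear(8, 16)
handle = ablate_neuron(layer, index=3)
out = layer(torch.randn(2, 8))
print(out[:, 3])        # zeros for the ablated neuron
handle.remove()         # restore normal behaviour
```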
+
+ comment: 9 pages, 2 figures, XAI World Conference 2024 Late-Breaking Work +
+
+
+
+
+ + ☆ Structuring a Training Strategy to Robustify Perception Models with + Realistic Image Augmentations + + +
+ Advancing Machine Learning (ML)-based perception models for autonomous +systems necessitates addressing weak spots within the models, particularly in +challenging Operational Design Domains (ODDs). These are environmental +operating conditions of an autonomous vehicle which can contain difficult +conditions, e.g., lens flare at night or objects reflected in a wet street. +This report introduces a novel methodology for training with augmentations to +enhance model robustness and performance in such conditions. The proposed +approach leverages customized physics-based augmentation functions, to generate +realistic training data that simulates diverse ODD scenarios. + We present a comprehensive framework that includes identifying weak spots in +ML models, selecting suitable augmentations, and devising effective training +strategies. The methodology integrates hyperparameter optimization and latent +space optimization to fine-tune augmentation parameters, ensuring they +maximally improve the ML models' performance. Experimental results demonstrate +improvements in model performance, as measured by commonly used metrics such as +mean Average Precision (mAP) and mean Intersection over Union (mIoU) on +open-source object detection and semantic segmentation models and datasets. + Our findings emphasize that optimal training strategies are model- and +data-specific and highlight the benefits of integrating augmentations into the +training pipeline. By incorporating augmentations, we observe enhanced +robustness of ML-based perception models, making them more resilient to edge +cases encountered in real-world ODDs. This work underlines the importance of +customized augmentations and offers an effective solution for improving the +safety and reliability of autonomous driving functions. + +
+
+
+
+
+ + ☆ BOP-D: Revisiting 6D Pose Estimation Benchmark for Better Evaluation + under Visual Ambiguities + + +
+ Currently, 6D pose estimation methods are benchmarked on datasets that consider, for their ground truth annotations, visual ambiguities as only related to global object symmetries. However, as previously observed [26], visual ambiguities can also happen depending on the viewpoint or the presence of occluding objects, when disambiguating parts become hidden. The visual ambiguities are therefore actually different across images. We thus first propose an automatic method to re-annotate those datasets with a 6D pose distribution specific to each image, taking into account the visibility of the object surface in the image to correctly determine the visual ambiguities. Given this improved ground truth, we re-evaluate the state-of-the-art methods and show that this greatly modifies their ranking. Our annotations also allow us to benchmark recent methods able to estimate a pose distribution on real images for the first time. We will make our annotations for the T-LESS dataset and our code publicly available.
+
+
+
+
+ + ☆ DCUDF2: Improving Efficiency and Accuracy in Extracting Zero Level Sets + from Unsigned Distance Fields + + +
+ Unsigned distance fields (UDFs) allow for the representation of models with complex topologies, but extracting accurate zero level sets from these fields poses significant challenges, particularly in preserving topological accuracy and capturing fine geometric details. To overcome these issues, we introduce DCUDF2, an enhancement over DCUDF--the current state-of-the-art method--for extracting zero level sets from UDFs. Our approach utilizes an accuracy-aware loss function, enhanced with self-adaptive weights, to improve geometric quality significantly. We also propose a topology correction strategy that reduces the dependence on hyper-parameters, increasing the robustness of our method. Furthermore, we develop new operations leveraging self-adaptive weights to boost runtime efficiency. Extensive experiments on surface extraction across diverse datasets demonstrate that DCUDF2 outperforms DCUDF and existing methods in both geometric fidelity and topological accuracy. We will make the source code publicly available.
+
+
+
+
+ + ☆ UrBench: A Comprehensive Benchmark for Evaluating Large Multimodal + Models in Multi-View Urban Scenarios + + +
+ Recent evaluations of Large Multimodal Models (LMMs) have explored their +capabilities in various domains, with only few benchmarks specifically focusing +on urban environments. Moreover, existing urban benchmarks have been limited to +evaluating LMMs with basic region-level urban tasks under singular views, +leading to incomplete evaluations of LMMs' abilities in urban environments. To +address these issues, we present UrBench, a comprehensive benchmark designed +for evaluating LMMs in complex multi-view urban scenarios. UrBench contains +11.6K meticulously curated questions at both region-level and role-level that +cover 4 task dimensions: Geo-Localization, Scene Reasoning, Scene +Understanding, and Object Understanding, totaling 14 task types. In +constructing UrBench, we utilize data from existing datasets and additionally +collect data from 11 cities, creating new annotations using a cross-view +detection-matching method. With these images and annotations, we then integrate +LMM-based, rule-based, and human-based methods to construct large-scale +high-quality questions. Our evaluations on 21 LMMs show that current LMMs +struggle in the urban environments in several aspects. Even the best performing +GPT-4o lags behind humans in most tasks, ranging from simple tasks such as +counting to complex tasks such as orientation, localization and object +attribute recognition, with an average performance gap of 17.4%. Our benchmark +also reveals that LMMs exhibit inconsistent behaviors with different urban +views, especially with respect to understanding cross-view relations. UrBench +datasets and benchmark results will be publicly available at +https://opendatalab.github.io/UrBench/. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ VisionTS: Visual Masked Autoencoders Are Free-Lunch Zero-Shot Time + Series Forecasters + + +
+ Foundation models have emerged as a promising approach in time series +forecasting (TSF). Existing approaches either fine-tune large language models +(LLMs) or build large-scale time-series datasets to develop TSF foundation +models. However, these methods face challenges due to the severe cross-domain +gap or in-domain heterogeneity. In this paper, we explore a new road to +building a TSF foundation model from rich and high-quality natural images, +based on the intrinsic similarities between images and time series. To bridge +the gap between the two domains, we reformulate the TSF task as an image +reconstruction task, which is further processed by a visual masked autoencoder +(MAE) self-supervised pre-trained on the ImageNet dataset. Surprisingly, +without further adaptation in the time-series domain, the proposed VisionTS +could achieve superior zero-shot forecasting performance compared to existing +TSF foundation models. With minimal fine-tuning, VisionTS could further improve +the forecasting and achieve state-of-the-art performance in most cases. These +findings suggest that visual models could be a free lunch for TSF and highlight +the potential for future cross-domain research between computer vision and TSF. +Our code is publicly available at https://github.com/Keytoyze/VisionTS. + +
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ☆ Abstracted Gaussian Prototypes for One-Shot Concept Learning + + +
+ We introduce a cluster-based generative image segmentation framework to
+encode higher-level representations of visual concepts based on one-shot
+learning inspired by the Omniglot Challenge. The inferred parameters of each
+component of a Gaussian Mixture Model (GMM) represent a distinct topological
+subpart of a visual concept. Sampling new data from these parameters generates
+augmented subparts to build a more robust prototype for each concept, i.e.,
+the Abstracted Gaussian Prototype (AGP). This framework addresses one-shot
+classification tasks using a cognitively-inspired similarity metric and
+addresses one-shot generative tasks through a novel AGP-VAE pipeline employing
+variational autoencoders (VAEs) to generate new class variants. Results from
+human judges reveal that the generative pipeline produces novel examples and
+classes of visual concepts that are broadly indistinguishable from those made
+by humans. The proposed framework leads to impressive but not state-of-the-art
+classification accuracy; thus, the contribution is two-fold: 1) the system is
+uniquely low in theoretical and computational complexity and operates in a
+completely standalone manner, whereas existing approaches draw heavily on
+pre-training or knowledge engineering; and 2) in contrast with competing
+neural network models, the AGP approach addresses the importance of breadth of
+task capability emphasized in the Omniglot challenge (i.e., successful
+performance on generative tasks). These two points are critical as we advance
+toward an understanding of how learning/reasoning systems can produce viable,
+robust, and flexible concepts based on literally nothing more than a single
+example.
+
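+ A toy sketch of the core idea described above: fit a GMM to the points of a
+single example and sample from it to obtain augmented subparts (our own
+illustration; the component count, sample count, and data shape are
+assumptions, not the paper's settings):
+
+```python
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+def abstracted_gaussian_prototype(coords: np.ndarray, n_subparts: int = 4,
+                                  n_samples: int = 200) -> np.ndarray:
+    """coords: (N, 2) pixel coordinates of the strokes of one example."""
+    gmm = GaussianMixture(n_components=n_subparts, covariance_type="full",
+                          random_state=0).fit(coords)
+    samples, _ = gmm.sample(n_samples)      # augmented subpart points
+    return np.vstack([coords, samples])     # a richer prototype point set
+```
+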
+
+
+
+
+ + ☆ A nonlinear elasticity model in computer vision + + +
+ The purpose of this paper is to analyze a nonlinear elasticity model +previously introduced by the authors for comparing two images, regarded as +bounded open subsets of $\R^n$ together with associated vector-valued intensity +maps. Optimal transformations between the images are sought as minimisers of an +integral functional among orientation-preserving homeomorphisms. The existence +of minimisers is proved under natural coercivity and polyconvexity conditions, +assuming only that the intensity functions are bounded measurable. Variants of +the existence theorem are also proved, first under the constraint that finite +sets of landmark points in the two images are mapped one to the other, and +second when one image is to be compared to an unknown part of another. + The question is studied as to whether for images related by a linear mapping +the unique minimizer is given by that linear mapping. For a natural class of +functional integrands an example is given guaranteeing that this property holds +for pairs of images in which the second is a scaling of the first by a constant +factor. However for the property to hold for arbitrary pairs of linearly +related images it is shown that the integrand has to depend on the gradient of +the transformation as a convex function of its determinant alone. This suggests +a new model in which the integrand depends also on second derivatives of the +transformation, and an example is given for which both existence of minimizers +is assured and the above property holds for all pairs of linearly related +images. + +
+
+
+
+
+ + ☆ CondSeg: Ellipse Estimation of Pupil and Iris via Conditioned + Segmentation + + +
+ Parsing of eye components (i.e. pupil, iris and sclera) is fundamental for
+eye tracking and gaze estimation in AR/VR products. Mainstream approaches
+tackle this problem as a multi-class segmentation task that provides only the
+visible part of the pupil/iris, while other methods regress elliptical
+parameters using human-annotated full pupil/iris parameters. In this paper, we
+consider two priors: the projected full pupil/iris circle can be modelled as
+an ellipse (ellipse prior), and the visibility of the pupil/iris is controlled
+by the openness of the eye region (condition prior). Based on these priors, we
+design a novel method, CondSeg, that estimates elliptical parameters of the
+pupil/iris directly from segmentation labels, without explicitly annotating
+full ellipses, and uses an eye-region mask to control the visibility of the
+estimated pupil/iris ellipses. A conditioned segmentation loss optimizes the
+parameters by transforming the parameterized ellipses into pixel-wise soft
+masks in a differentiable way. Our method is tested on public datasets
+(OpenEDS-2019/-2020), shows competitive results on segmentation metrics, and
+simultaneously provides accurate elliptical parameters for further
+eye-tracking applications.
+
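+ One plausible way to turn ellipse parameters into a differentiable pixel-wise
+soft mask, as the conditioned segmentation loss above requires (a sketch under
+our own assumptions about the parameterization and sharpness, not the paper's
+exact formulation):
+
+```python
+import torch
+
+def ellipse_soft_mask(cx, cy, a, b, theta, H, W, sharpness=20.0):
+    """Differentiable soft occupancy mask of a rotated ellipse on an HxW grid."""
+    cx, cy, a, b, theta = (torch.as_tensor(v, dtype=torch.float32)
+                           for v in (cx, cy, a, b, theta))
+    ys, xs = torch.meshgrid(torch.arange(H, dtype=torch.float32),
+                            torch.arange(W, dtype=torch.float32), indexing="ij")
+    x, y = xs - cx, ys - cy
+    xr = x * torch.cos(theta) + y * torch.sin(theta)   # rotate into ellipse frame
+    yr = -x * torch.sin(theta) + y * torch.cos(theta)
+    d = (xr / a) ** 2 + (yr / b) ** 2                  # < 1 inside, > 1 outside
+    return torch.sigmoid(sharpness * (1.0 - d))        # soft mask in [0, 1]
+
+# Gated by an eye-region (openness) mask, this soft mask can be compared with
+# the visible pupil/iris label via a pixel-wise loss to optimize the parameters.
+```
+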
+
+
+
+
+ + ☆ OG-Mapping: Octree-based Structured 3D Gaussians for Online Dense + Mapping + + +
+ 3D Gaussian splatting (3DGS) has recently demonstrated promising advancements +in RGB-D online dense mapping. Nevertheless, existing methods excessively rely +on per-pixel depth cues to perform map densification, which leads to +significant redundancy and increased sensitivity to depth noise. Additionally, +explicitly storing 3D Gaussian parameters of room-scale scene poses a +significant storage challenge. In this paper, we introduce OG-Mapping, which +leverages the robust scene structural representation capability of sparse +octrees, combined with structured 3D Gaussian representations, to achieve +efficient and robust online dense mapping. Moreover, OG-Mapping employs an +anchor-based progressive map refinement strategy to recover the scene +structures at multiple levels of detail. Instead of maintaining a small number +of active keyframes with a fixed keyframe window as previous approaches do, a +dynamic keyframe window is employed to allow OG-Mapping to better tackle false +local minima and forgetting issues. Experimental results demonstrate that +OG-Mapping delivers more robust and superior realism mapping results than +existing Gaussian-based RGB-D online mapping methods with a compact model, and +no additional post-processing is required. + +
+
+
+
+
+ + ☆ How Could Generative AI Support Compliance with the EU AI Act? A Review + for Safe Automated Driving Perception + + +
+ Deep Neural Networks (DNNs) have become central to the perception functions
+of autonomous vehicles, substantially enhancing their ability to understand
+and interpret the environment. However, these systems exhibit inherent
+limitations such as brittleness, opacity, and unpredictable behavior in
+out-of-distribution scenarios. The European Union (EU) Artificial Intelligence
+(AI) Act, as a pioneering legislative framework, aims to address these
+challenges by establishing stringent norms and standards for AI systems,
+including those used in autonomous driving (AD), which are categorized as
+high-risk AI. In this work, we explore how newly available generative AI
+models can potentially support addressing upcoming regulatory requirements in
+AD perception, particularly with respect to safety. This short review paper
+summarizes the requirements arising from the EU AI Act regarding DNN-based
+perception systems and systematically categorizes existing generative AI
+applications in AD. While generative AI models show promise in addressing some
+of the EU AI Act's requirements, such as transparency and robustness, this
+review examines their potential benefits and discusses how developers could
+leverage these methods to enhance compliance with the Act. The paper also
+highlights areas where further research is needed to ensure reliable and safe
+integration of these technologies.
+
+
+
+
+
+ + ☆ NanoMVG: USV-Centric Low-Power Multi-Task Visual Grounding based on + Prompt-Guided Camera and 4D mmWave Radar + + +
+ Recently, visual grounding and multi-sensor settings have been incorporated
+into the perception systems of terrestrial autonomous driving systems and
+Unmanned Surface Vehicles (USVs), yet the high complexity of modern
+learning-based multi-sensor visual grounding models prevents such models from
+being deployed on USVs in real life. To this end, we design a low-power
+multi-task model named NanoMVG for waterway embodied perception, guiding both
+a camera and a 4D millimeter-wave radar to locate specific object(s) through
+natural language. NanoMVG can perform both box-level and mask-level visual
+grounding tasks simultaneously. Compared to other visual grounding models,
+NanoMVG achieves highly competitive performance on the WaterVG dataset,
+particularly in harsh environments, and boasts ultra-low power consumption for
+long endurance.
+
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Covariance-corrected Whitening Alleviates Network Degeneration on + Imbalanced Classification + + +
+ Class imbalance is a critical issue in image classification that
+significantly affects the performance of deep recognition models. In this
+work, we first identify a network degeneration dilemma that hinders model
+learning by introducing high linear dependence among the features fed into the
+classifier. To overcome this challenge, we propose a novel framework called
+Whitening-Net to mitigate the degenerate solutions, in which ZCA whitening is
+integrated before the linear classifier to normalize and decorrelate the batch
+samples. However, in scenarios with extreme class imbalance, the batch
+covariance statistic exhibits significant fluctuations, impeding the
+convergence of the whitening operation. Therefore, we propose two
+covariance-corrected modules, the Group-based Relatively Balanced Batch
+Sampler (GRBS) and the Batch Embedded Training (BET), to obtain more accurate
+and stable batch covariance, thereby reinforcing the capability of whitening.
+Our modules can be trained end-to-end without incurring substantial
+computational costs. Comprehensive empirical evaluations conducted on
+benchmark datasets, including CIFAR-LT-10/100, ImageNet-LT, and
+iNaturalist-LT, validate the effectiveness of our proposed approaches.
+
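+ A minimal sketch of the ZCA whitening step described above, applied to the
+batch of features entering the linear classifier (our own illustration, not
+the authors' code; the epsilon and the usage line are assumptions):
+
+```python
+import torch
+
+def zca_whiten(features: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
+    """Decorrelate a batch of feature vectors (N, D) with ZCA whitening."""
+    x = features - features.mean(dim=0, keepdim=True)
+    cov = x.t() @ x / (x.shape[0] - 1)            # batch covariance (D, D)
+    eigvals, eigvecs = torch.linalg.eigh(cov)
+    inv_sqrt = eigvecs @ torch.diag((eigvals + eps).rsqrt()) @ eigvecs.t()
+    return x @ inv_sqrt                           # whitened classifier inputs
+
+# hypothetical usage: logits = linear_classifier(zca_whiten(backbone(images)))
+```
+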
+
+ comment: 20 pages, 10 figures, 10 tables. arXiv admin note: text overlap with + arXiv:2112.05958 +
+
+
+
+
+ + ☆ Hybrid Classification-Regression Adaptive Loss for Dense Object + Detection + + +
+ For object detectors, enhancing model performance hinges on the ability to
+simultaneously consider inconsistencies across tasks and focus on
+difficult-to-train samples. Achieving this necessitates incorporating
+information from both the classification and regression tasks. However, prior
+work tends to either emphasize difficult-to-train samples within their
+respective tasks or simply compute classification scores with IoU, often
+leading to suboptimal model performance. In this paper, we propose a Hybrid
+Classification-Regression Adaptive Loss, termed HCRAL. Specifically, we
+introduce the Residual of Classification and IoU (RCI) module for cross-task
+supervision, addressing task inconsistencies, and the Conditioning Factor (CF)
+to focus on difficult-to-train samples within each task. Furthermore, we
+introduce a new strategy named Expanded Adaptive Training Sample Selection
+(EATSS) to provide additional samples that exhibit classification and
+regression inconsistencies. To validate the effectiveness of the proposed
+method, we conduct extensive experiments on COCO test-dev. Experimental
+evaluations demonstrate the superiority of our approach. Additionally, we
+design experiments that separately combine the classification and regression
+losses with regular loss functions in popular one-stage models, demonstrating
+improved performance.
+
+
+
+
+
+ + ☆ EMHI: A Multimodal Egocentric Human Motion Dataset with HMD and + Body-Worn IMUs + + +
+ Egocentric human pose estimation (HPE) using wearable sensors is essential +for VR/AR applications. Most methods rely solely on either egocentric-view +images or sparse Inertial Measurement Unit (IMU) signals, leading to +inaccuracies due to self-occlusion in images or the sparseness and drift of +inertial sensors. Most importantly, the lack of real-world datasets containing +both modalities is a major obstacle to progress in this field. To overcome the +barrier, we propose EMHI, a multimodal \textbf{E}gocentric human +\textbf{M}otion dataset with \textbf{H}ead-Mounted Display (HMD) and body-worn +\textbf{I}MUs, with all data collected under the real VR product suite. +Specifically, EMHI provides synchronized stereo images from downward-sloping +cameras on the headset and IMU data from body-worn sensors, along with pose +annotations in SMPL format. This dataset consists of 885 sequences captured by +58 subjects performing 39 actions, totaling about 28.5 hours of recording. We +evaluate the annotations by comparing them with optical marker-based SMPL +fitting results. To substantiate the reliability of our dataset, we introduce +MEPoser, a new baseline method for multimodal egocentric HPE, which employs a +multimodal fusion encoder, temporal feature encoder, and MLP-based regression +heads. The experiments on EMHI show that MEPoser outperforms existing +single-modal methods and demonstrates the value of our dataset in solving the +problem of egocentric HPE. We believe the release of EMHI and the method could +advance the research of egocentric HPE and expedite the practical +implementation of this technology in VR/AR products. + +
+
+
+
+
+ + ☆ Self-supervised Anomaly Detection Pretraining Enhances Long-tail ECG + Diagnosis + + +
+ Current computer-aided ECG diagnostic systems struggle with the
+underdetection of rare but critical cardiac anomalies due to the imbalanced
+nature of ECG datasets. This study introduces a novel approach using
+self-supervised anomaly detection pretraining to address this limitation. The
+anomaly detection model is specifically designed to detect and localize subtle
+deviations from normal cardiac patterns, capturing the nuanced details
+essential for accurate ECG interpretation. Validated on an extensive dataset
+of over one million ECG records from clinical practice, characterized by a
+long-tail distribution across 116 distinct categories, the anomaly
+detection-pretrained ECG diagnostic model has demonstrated a significant
+improvement in overall accuracy. Notably, our approach yielded a 94.7% AUROC,
+92.2% sensitivity, and 92.5% specificity for rare ECG types, significantly
+outperforming traditional methods and narrowing the performance gap with
+common ECG types. The integration of anomaly detection pretraining into ECG
+analysis represents a substantial contribution to the field, addressing the
+long-standing challenge of long-tail data distributions in clinical
+diagnostics. Furthermore, prospective validation in real-world clinical
+settings revealed that our AI-driven approach enhances diagnostic efficiency,
+precision, and completeness by 32%, 6.7%, and 11.8% respectively, when
+compared to standard practices. This advancement marks a pivotal step forward
+in the integration of AI within clinical cardiology, with particularly
+profound implications for emergency care, where rapid and accurate ECG
+interpretation is crucial. The contributions of this study not only push the
+boundaries of current ECG diagnostic capabilities but also lay the groundwork
+for more reliable and accessible cardiovascular care.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2404.04935 +
+
+
+
+
+ + ☆ Look, Compare, Decide: Alleviating Hallucination in Large + Vision-Language Models via Multi-View Multi-Path Reasoning + + +
+ Recently, Large Vision-Language Models (LVLMs) have demonstrated impressive
+capabilities in multi-modal context comprehension. However, they still suffer
+from hallucination problems, i.e., generating outputs that are inconsistent
+with the image content. To mitigate hallucinations, previous studies mainly
+focus on retraining LVLMs with custom datasets. Although effective, they
+inherently come with additional computational costs. In this paper, we propose
+a training-free framework, \textbf{MVP}, that aims to reduce hallucinations by
+making the most of the innate capabilities of the LVLMs via
+\textbf{M}ulti-\textbf{V}iew Multi-\textbf{P}ath Reasoning. Specifically, we
+first devise a multi-view information-seeking strategy to thoroughly perceive
+the comprehensive information in the image, which enriches the general global
+information captured by the original vision encoder in LVLMs. Furthermore,
+during answer decoding, we observe that the occurrence of hallucinations has a
+strong correlation with the certainty of the answer tokens. Thus, we propose
+multi-path reasoning for each information view to quantify and aggregate the
+certainty scores of each potential answer among multiple decoding paths and
+finally decide the output answer. By fully grasping the information in the
+image and carefully considering the certainty of the potential answers when
+decoding, our MVP can effectively reduce hallucinations in LVLMs. Extensive
+experiments verify that our proposed MVP significantly mitigates the
+hallucination problem across four well-known LVLMs. The source code is
+available at: \url{https://github.com/GasolSun36/MVP}.
+
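+ The certainty aggregation step can be pictured as a weighted vote over the
+answers produced by the different views and decoding paths (a schematic
+sketch; the data layout and additive aggregation are our assumptions):
+
+```python
+from collections import defaultdict
+
+def aggregate_paths(paths):
+    """paths: list of (answer_text, certainty) pairs from all views and paths."""
+    scores = defaultdict(float)
+    for answer, certainty in paths:
+        scores[answer] += certainty        # accumulate certainty per candidate answer
+    return max(scores, key=scores.get)     # final output answer
+
+# e.g. aggregate_paths([("a cat", 0.9), ("a dog", 0.4), ("a cat", 0.7)]) -> "a cat"
+```
+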
+
+ comment: 13 pages, 7 tables, 7 figures +
+
+
+
+
+ + ☆ GMM-IKRS: Gaussian Mixture Models for Interpretable Keypoint Refinement + and Scoring ECCV 2024 + + +
+ The extraction of keypoints in images is at the basis of many computer vision +applications, from localization to 3D reconstruction. Keypoints come with a +score permitting to rank them according to their quality. While learned +keypoints often exhibit better properties than handcrafted ones, their scores +are not easily interpretable, making it virtually impossible to compare the +quality of individual keypoints across methods. We propose a framework that can +refine, and at the same time characterize with an interpretable score, the +keypoints extracted by any method. Our approach leverages a modified robust +Gaussian Mixture Model fit designed to both reject non-robust keypoints and +refine the remaining ones. Our score comprises two components: one relates to +the probability of extracting the same keypoint in an image captured from +another viewpoint, the other relates to the localization accuracy of the +keypoint. These two interpretable components permit a comparison of individual +keypoints extracted across different methods. Through extensive experiments we +demonstrate that, when applied to popular keypoint detectors, our framework +consistently improves the repeatability of keypoints as well as their +performance in homography and two/multiple-view pose recovery tasks. + +
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ RenDetNet: Weakly-supervised Shadow Detection with Shadow Caster + Verification ECCV 2024 + + +
+ Existing shadow detection models struggle to differentiate dark image areas +from shadows. In this paper, we tackle this issue by verifying that all +detected shadows are real, i.e. they have paired shadow casters. We perform +this step in a physically-accurate manner by differentiably re-rendering the +scene and observing the changes stemming from carving out estimated shadow +casters. Thanks to this approach, the RenDetNet proposed in this paper is the +first learning-based shadow detection model whose supervisory signals can be +computed in a self-supervised manner. The developed system compares favourably +against recent models trained on our data. As part of this publication, we +release our code on github. + +
+
+ comment: AIM @ ECCV 2024 / code available at + https://github.com/n-kubiak/RenDetNet +
+
+
+
+
+ + ☆ Temporal and Interactive Modeling for Efficient Human-Human Motion + Generation + + +
+ Human-human motion generation is essential for understanding humans as social +beings. Although several transformer-based methods have been proposed, they +typically model each individual separately and overlook the causal +relationships in temporal motion sequences. Furthermore, the attention +mechanism in transformers exhibits quadratic computational complexity, +significantly reducing their efficiency when processing long sequences. In this +paper, we introduce TIM (Temporal and Interactive Modeling), an efficient and +effective approach that presents the pioneering human-human motion generation +model utilizing RWKV. Specifically, we first propose Causal Interactive +Injection to leverage the temporal properties of motion sequences and avoid +non-causal and cumbersome modeling. Then we present Role-Evolving Mixing to +adjust to the ever-evolving roles throughout the interaction. Finally, to +generate smoother and more rational motion, we design Localized Pattern +Amplification to capture short-term motion patterns. Extensive experiments on +InterHuman demonstrate that our method achieves superior performance. Notably, +TIM has achieved state-of-the-art results using only 32% of InterGen's +trainable parameters. Code will be available soon. Homepage: +https://aigc-explorer.github.io/TIM-page/ + +
+
+ comment: Homepage: https://aigc-explorer.github.io/TIM-page/ +
+
+
+
+
+ + ☆ VQ4DiT: Efficient Post-Training Vector Quantization for Diffusion + Transformers + + +
+ Diffusion Transformer models (DiTs) have transitioned the network
+architecture from traditional UNets to transformers, demonstrating exceptional
+capabilities in image generation. Although DiTs have been widely applied to
+high-definition video generation tasks, their large parameter size hinders
+inference on edge devices. Vector quantization (VQ) can decompose model
+weights into a codebook and assignments, allowing extreme weight quantization
+and significantly reducing memory usage. In this paper, we propose VQ4DiT, a
+fast post-training vector quantization method for DiTs. We found that
+traditional VQ methods calibrate only the codebook without calibrating the
+assignments. This leads to different weight sub-vectors being incorrectly
+given the same assignment, providing inconsistent gradients to the codebook
+and resulting in suboptimal results. To address this challenge, VQ4DiT
+calculates the candidate assignment set for each weight sub-vector based on
+Euclidean distance and reconstructs the sub-vector as a weighted average of
+its candidates. Then, using a zero-data and block-wise calibration method, the
+optimal assignment from the set is efficiently selected while calibrating the
+codebook. VQ4DiT quantizes a DiT XL/2 model on a single NVIDIA A100 GPU in 20
+minutes to 5 hours, depending on the quantization settings. Experiments show
+that VQ4DiT establishes a new state-of-the-art in model size and performance
+trade-offs, quantizing weights to 2-bit precision while retaining acceptable
+image generation quality.
+
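+ A schematic of the assignment-calibration idea described above: pick each
+sub-vector's candidate codewords by Euclidean distance and reconstruct it as a
+weighted average of them (our own sketch; the softmax weighting, candidate
+count, and variable names are assumptions, not the paper's exact procedure):
+
+```python
+import torch
+
+def reconstruct_subvectors(w: torch.Tensor, codebook: torch.Tensor, k: int = 4):
+    """w: (N, d) weight sub-vectors; codebook: (K, d) codewords."""
+    dists = torch.cdist(w, codebook)                  # (N, K) Euclidean distances
+    cand_d, cand_idx = dists.topk(k, largest=False)   # k nearest codewords each
+    weights = torch.softmax(-cand_d, dim=-1)          # closer codewords weigh more
+    cand_vecs = codebook[cand_idx]                    # (N, k, d)
+    recon = (weights.unsqueeze(-1) * cand_vecs).sum(dim=1)
+    return recon, cand_idx                            # candidates kept for calibration
+```
+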
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Multi-centric AI Model for Unruptured Intracranial Aneurysm Detection + and Volumetric Segmentation in 3D TOF-MRI + + +
+ Purpose: To develop an open-source nnU-Net-based AI model for combined +detection and segmentation of unruptured intracranial aneurysms (UICA) in 3D +TOF-MRI, and compare models trained on datasets with aneurysm-like differential +diagnoses. Methods: This retrospective study (2020-2023) included 385 +anonymized 3D TOF-MRI images from 364 patients (mean age 59 years, 60% female) +at multiple centers plus 113 subjects from the ADAM challenge. Images featured +untreated or possible UICAs and differential diagnoses. Four distinct training +datasets were created, and the nnU-Net framework was used for model +development. Performance was assessed on a separate test set using sensitivity +and False Positive (FP)/case rate for detection, and DICE score and NSD +(Normalized Surface Distance) with a 0.5mm threshold for segmentation. +Statistical analysis included chi-square, Mann-Whitney-U, and Kruskal-Wallis +tests, with significance set at p < 0.05. Results: Models achieved overall +sensitivity between 82% and 85% and a FP/case rate of 0.20 to 0.31, with no +significant differences (p = 0.90 and p = 0.16). The primary model showed 85% +sensitivity and 0.23 FP/case rate, outperforming the ADAM-challenge winner +(61%) and a nnU-Net trained on ADAM data (51%) in sensitivity (p < 0.05). It +achieved a mean DICE score of 0.73 and an NSD of 0.84 for correctly detected +UICA. Conclusions: Our open-source, nnU-Net-based AI model (available at +10.5281/zenodo.13386859) demonstrates high sensitivity, low false positive +rates, and consistent segmentation accuracy for UICA detection and segmentation +in 3D TOF-MRI, suggesting its potential to improve clinical diagnosis and for +monitoring of UICA. + +
+
+ comment: 14 pages, 5 figures, 3 tables, 2 supplementary tables +
+
+
+
+
+ + ☆ Sparse Uncertainty-Informed Sampling from Federated Streaming Data + + +
+ We present a numerically robust, computationally efficient approach for +non-I.I.D. data stream sampling in federated client systems, where resources +are limited and labeled data for local model adaptation is sparse and +expensive. The proposed method identifies relevant stream observations to +optimize the underlying client model, given a local labeling budget, and +performs instantaneous labeling decisions without relying on any memory +buffering strategies. Our experiments show enhanced training batch diversity +and an improved numerical robustness of the proposal compared to existing +strategies over large-scale data streams, making our approach an effective and +convenient solution in FL environments. + +
+
+ comment: Preprint, 6 pages, 3 figures, Accepted for ESANN 2024 +
+
+
+
+
+ + ☆ UTrack: Multi-Object Tracking with Uncertain Detections ECCV 2024 + + +
+ The tracking-by-detection paradigm is the mainstream in multi-object +tracking, associating tracks to the predictions of an object detector. Although +exhibiting uncertainty through a confidence score, these predictions do not +capture the entire variability of the inference process. For safety and +security critical applications like autonomous driving, surveillance, etc., +knowing this predictive uncertainty is essential though. Therefore, we +introduce, for the first time, a fast way to obtain the empirical predictive +distribution during object detection and incorporate that knowledge in +multi-object tracking. Our mechanism can easily be integrated into +state-of-the-art trackers, enabling them to fully exploit the uncertainty in +the detections. Additionally, novel association methods are introduced that +leverage the proposed mechanism. We demonstrate the effectiveness of our +contribution on a variety of benchmarks, such as MOT17, MOT20, DanceTrack, and +KITTI. + +
+
+ comment: Accepted for the ECCV 2024 Workshop on Uncertainty Quantification for + Computer Vision +
+
+
+
+
+ + ☆ RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation + and Retrieval-Guidance + + +
+ Diffusion-based models demonstrate impressive generation capabilities. +However, they also have a massive number of parameters, resulting in enormous +model sizes, thus making them unsuitable for deployment on resource-constraint +devices. Block-wise generation can be a promising alternative for designing +compact-sized (parameter-efficient) deep generative models since the model can +generate one block at a time instead of generating the whole image at once. +However, block-wise generation is also considerably challenging because +ensuring coherence across generated blocks can be non-trivial. To this end, we +design a retrieval-augmented generation (RAG) approach and leverage the +corresponding blocks of the images retrieved by the RAG module to condition the +training and generation stages of a block-wise denoising diffusion model. Our +conditioning schemes ensure coherence across the different blocks during +training and, consequently, during generation. While we showcase our approach +using the latent diffusion model (LDM) as the base model, it can be used with +other variants of denoising diffusion models. We validate the solution of the +coherence problem through the proposed approach by reporting substantive +experiments to demonstrate our approach's effectiveness in compact model size +and excellent generation quality. + +
+
+
+
+
+ + ☆ FissionVAE: Federated Non-IID Image Generation with Latent Space and + Decoder Decomposition + + +
+ Federated learning is a machine learning paradigm that enables decentralized
+clients to collaboratively learn a shared model while keeping all the training
+data local. While considerable research has focused on federated image
+generation, particularly Generative Adversarial Networks, Variational
+Autoencoders have received less attention. In this paper, we address the
+challenges of non-IID (not independent and identically distributed) data
+environments featuring multiple groups of images of different types.
+Specifically, heterogeneous data distributions can lead to difficulties in
+maintaining a consistent latent space and can also result in local generators
+with disparate texture features being blended during aggregation. We introduce
+a novel approach, FissionVAE, which decomposes the latent space and constructs
+decoder branches tailored to individual client groups. This method allows for
+customized learning that aligns with the unique data distributions of each
+group. Additionally, we investigate the incorporation of hierarchical VAE
+architectures and demonstrate the use of heterogeneous decoder architectures
+within our model. We also explore strategies for setting the latent prior
+distributions to enhance the decomposition process. To evaluate our approach,
+we assemble two composite datasets: the first combines MNIST and FashionMNIST;
+the second comprises RGB datasets of cartoon and human faces, wild animals,
+marine vessels, and remote sensing images of Earth. Our experiments
+demonstrate that FissionVAE greatly improves generation quality on these
+datasets compared to baseline federated VAE models.
+
+
+
+
+
+ + ☆ Focus-Consistent Multi-Level Aggregation for Compositional Zero-Shot + Learning + + +
+ To transfer knowledge from seen attribute-object compositions to recognize +unseen ones, recent compositional zero-shot learning (CZSL) methods mainly +discuss the optimal classification branches to identify the elements, leading +to the popularity of employing a three-branch architecture. However, these +methods mix up the underlying relationship among the branches, in the aspect of +consistency and diversity. Specifically, consistently providing the +highest-level features for all three branches increases the difficulty in +distinguishing classes that are superficially similar. Furthermore, a single +branch may focus on suboptimal regions when spatial messages are not shared +between the personalized branches. Recognizing these issues and endeavoring to +address them, we propose a novel method called Focus-Consistent Multi-Level +Aggregation (FOMA). Our method incorporates a Multi-Level Feature Aggregation +(MFA) module to generate personalized features for each branch based on the +image content. Additionally, a Focus-Consistent Constraint encourages a +consistent focus on the informative regions, thereby implicitly exchanging +spatial information between all branches. Extensive experiments on three +benchmark datasets (UT-Zappos, C-GQA, and Clothing16K) demonstrate that our +FOMA outperforms SOTA. + +
+
+ comment: Compositional Zero-Shot Learning +
+
+
+
+
+ + ☆ Stochastic Layer-Wise Shuffle: A Good Practice to Improve Vision Mamba + Training + + +
+ Recent Vision Mamba models not only have much lower complexity for processing
+higher-resolution images and longer videos, but also achieve competitive
+performance with Vision Transformers (ViTs). However, they are prone to
+overfitting and have thus only been presented up to the base size (about 80M
+parameters). It is still unclear how vanilla Vision Mamba (Vim) can be
+efficiently scaled up to larger sizes, which is essential for further
+exploitation. In this paper, we propose a stochastic layer-wise shuffle
+regularization, which enables successfully scaling non-hierarchical Vision
+Mamba to a large size (about 300M) in a supervised setting. Specifically, our
+base and large-scale ShuffleMamba models can outperform supervised ViTs of
+similar size by 0.8% and 1.0% classification accuracy on ImageNet-1k,
+respectively, without auxiliary data. When evaluated on the ADE20K semantic
+segmentation and COCO detection tasks, our ShuffleMamba models also show
+significant improvements. Without bells and whistles, the stochastic
+layer-wise shuffle has the following highlights: (1) \textit{Plug and play:}
+it does not change model architectures and is omitted in inference. (2)
+\textit{Simple but effective:} it mitigates overfitting in Vim training and
+only introduces random token permutation operations. (3) \textit{Intuitive:}
+the token sequences in deeper layers are more likely to be shuffled as they
+are expected to be more semantic and less sensitive to patch positions. Code
+and models will be available at https://github.com/huangzizheng01/ShuffleMamba.
+
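+ The regularization itself is easy to reproduce in spirit: during training,
+randomly permute the token sequence of a layer with some probability and skip
+the operation entirely at inference. A hedged sketch (the depth-dependent
+schedule and maximum probability are our assumptions):
+
+```python
+import torch
+
+def layer_wise_shuffle(tokens: torch.Tensor, layer_idx: int, num_layers: int,
+                       max_prob: float = 0.5, training: bool = True) -> torch.Tensor:
+    """tokens: (B, N, C); deeper layers are shuffled with higher probability."""
+    if not training:
+        return tokens                            # omitted at inference (plug and play)
+    prob = max_prob * (layer_idx + 1) / num_layers
+    if torch.rand(()).item() > prob:
+        return tokens
+    perm = torch.randperm(tokens.shape[1], device=tokens.device)
+    return tokens[:, perm, :]                    # random token permutation
+```
+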
+
+
+
+
+ + ☆ Approximately Invertible Neural Network for Learned Image Compression + + +
+ Learned image compression has attracted considerable interest in recent
+years. It typically comprises an analysis transform, a synthesis transform,
+quantization and an entropy coding model. The analysis transform and synthesis
+transform are used to encode an image into a latent feature and to decode the
+quantized feature to reconstruct the image, and can be regarded as coupled
+transforms. However, the analysis transform and synthesis transform are
+designed independently in existing methods, making them unreliable for
+high-quality image compression. Inspired by the invertible neural networks
+used in generative modeling, invertible modules are used to construct the
+coupled analysis and synthesis transforms. Considering that the noise
+introduced by feature quantization invalidates the invertible process, this
+paper proposes an Approximately Invertible Neural Network (A-INN) framework
+for learned image compression. It formulates the rate-distortion optimization
+in lossy image compression when using an INN with quantization, which differs
+from using INNs for generative modeling. Generally speaking, A-INN can serve
+as the theoretical foundation for any INN-based lossy compression method.
+Based on this formulation, A-INN with a progressive denoising module (PDM) is
+developed to effectively reduce the quantization noise in decoding. Moreover,
+a Cascaded Feature Recovery Module (CFRM) is designed to learn
+high-dimensional feature recovery from low-dimensional ones to further reduce
+the noise in feature channel compression. In addition, a Frequency-enhanced
+Decomposition and Synthesis Module (FDSM) is developed by explicitly enhancing
+the high-frequency components in an image to address the loss of
+high-frequency information inherent in neural network based image compression.
+Extensive experiments demonstrate that the proposed A-INN outperforms existing
+learned image compression methods.
+
+
+
+
+
+ + ☆ Generalizing Deepfake Video Detection with Plug-and-Play: Video-Level + Blending and Spatiotemporal Adapter Tuning + + +
+ Three key challenges hinder the development of current deepfake video +detection: (1) Temporal features can be complex and diverse: how can we +identify general temporal artifacts to enhance model generalization? (2) +Spatiotemporal models often lean heavily on one type of artifact and ignore the +other: how can we ensure balanced learning from both? (3) Videos are naturally +resource-intensive: how can we tackle efficiency without compromising accuracy? + This paper attempts to tackle the three challenges jointly. First, inspired +by the notable generality of using image-level blending data for image forgery +detection, we investigate whether and how video-level blending can be effective +in video. We then perform a thorough analysis and identify a previously +underexplored temporal forgery artifact: Facial Feature Drift (FFD), which +commonly exists across different forgeries. To reproduce FFD, we then propose a +novel Video-level Blending data (VB), where VB is implemented by blending the +original image and its warped version frame-by-frame, serving as a hard +negative sample to mine more general artifacts. Second, we carefully design a +lightweight Spatiotemporal Adapter (StA) to equip a pretrained image model +(both ViTs and CNNs) with the ability to capture both spatial and temporal +features jointly and efficiently. StA is designed with two-stream 3D-Conv with +varying kernel sizes, allowing it to process spatial and temporal features +separately. Extensive experiments validate the effectiveness of the proposed +methods; and show our approach can generalize well to previously unseen forgery +videos, even the just-released (in 2024) SoTAs. We release our code and +pretrained weights at \url{https://github.com/YZY-stack/StA4Deepfake}. + +
+
+
+
+
+ + ☆ Instant Adversarial Purification with Adversarial Consistency + Distillation + + +
+ Neural networks, despite their remarkable performance in widespread
+applications, including image classification, are also known to be vulnerable
+to subtle adversarial noise. Although some diffusion-based purification
+methods have been proposed, for example, DiffPure, those methods are
+time-consuming. In this paper, we propose One Step Control Purification
+(OSCP), a diffusion-based purification model that can purify an adversarial
+image in one Neural Function Evaluation (NFE) of the diffusion model. We use a
+Latent Consistency Model (LCM) and ControlNet for our one-step purification.
+OSCP is computationally friendly and time-efficient compared to other
+diffusion-based purification methods; we achieve a defense success rate of
+74.19% on ImageNet, requiring only 0.1s per purification. Moreover, there is a
+fundamental incongruence between consistency distillation and adversarial
+perturbation. To address this ontological dissonance, we propose Gaussian
+Adversarial Noise Distillation (GAND), a novel consistency distillation
+framework that facilitates a more nuanced reconciliation of the latent space
+dynamics, effectively bridging the natural and adversarial manifolds. Our
+experiments show that GAND does not need full fine-tuning (FFT);
+parameter-efficient fine-tuning (PEFT), e.g., LoRA, is sufficient.
+
+
+
+
+
+ + ☆ Vote&Mix: Plug-and-Play Token Reduction for Efficient Vision Transformer + + +
+ Despite the remarkable success of Vision Transformers (ViTs) in various +visual tasks, they are often hindered by substantial computational cost. In +this work, we introduce Vote\&Mix (\textbf{VoMix}), a plug-and-play and +parameter-free token reduction method, which can be readily applied to +off-the-shelf ViT models \textit{without any training}. VoMix tackles the +computational redundancy of ViTs by identifying tokens with high homogeneity +through a layer-wise token similarity voting mechanism. Subsequently, the +selected tokens are mixed into the retained set, thereby preserving visual +information. Experiments demonstrate VoMix significantly improves the +speed-accuracy tradeoff of ViTs on both images and videos. Without any +training, VoMix achieves a 2$\times$ increase in throughput of existing ViT-H +on ImageNet-1K and a 2.4$\times$ increase in throughput of existing ViT-L on +Kinetics-400 video dataset, with a mere 0.3\% drop in top-1 accuracy. + +
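+ A rough, training-free sketch of the voting-and-mixing idea for one layer:
+tokens that are highly similar to others are voted out and averaged into their
+most similar retained token (our own illustration; the drop ratio, cosine
+similarity, and equal-weight averaging are assumptions, not VoMix's exact
+mechanism):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def vote_and_mix(x: torch.Tensor, drop_ratio: float = 0.3) -> torch.Tensor:
+    """x: (N, C) tokens of one image in one ViT layer; returns (N', C), N' < N."""
+    xn = F.normalize(x, dim=-1)
+    sim = xn @ xn.t()                                # (N, N) cosine similarities
+    sim.fill_diagonal_(-1.0)                         # ignore self-similarity
+    homogeneity = sim.max(dim=-1).values             # vote: redundancy of each token
+    n_drop = int(x.shape[0] * drop_ratio)
+    drop_idx = homogeneity.topk(n_drop).indices
+    keep_mask = torch.ones(x.shape[0], dtype=torch.bool)
+    keep_mask[drop_idx] = False
+    kept, dropped = x[keep_mask], x[~keep_mask]
+    # mix each dropped token into its most similar kept token (simple averaging)
+    assign = (F.normalize(dropped, dim=-1) @ F.normalize(kept, dim=-1).t()).argmax(dim=-1)
+    counts = torch.ones(kept.shape[0]).index_add_(0, assign, torch.ones(n_drop))
+    return kept.index_add(0, assign, dropped) / counts.unsqueeze(-1)
+```
+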
+
+
+
+
+ + ☆ Efficient Image Restoration through Low-Rank Adaptation and Stable + Diffusion XL + + +
+ In this study, we propose an enhanced image restoration model, SUPIR, based +on the integration of two low-rank adaptive (LoRA) modules with the Stable +Diffusion XL (SDXL) framework. Our method leverages the advantages of LoRA to +fine-tune SDXL models, thereby significantly improving image restoration +quality and efficiency. We collect 2600 high-quality real-world images, each +with detailed descriptive text, for training the model. The proposed method is +evaluated on standard benchmarks and achieves excellent performance, +demonstrated by higher peak signal-to-noise ratio (PSNR), lower learned +perceptual image patch similarity (LPIPS), and higher structural similarity +index measurement (SSIM) scores. These results underscore the effectiveness of +combining LoRA with SDXL for advanced image restoration tasks, highlighting the +potential of our approach in generating high-fidelity restored images. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ A Survey of the Self Supervised Learning Mechanisms for Vision + Transformers + + +
+ Deep supervised learning models require a high volume of labeled data to
+attain sufficiently good results. However, gathering and annotating such
+large-scale data is costly and laborious. Recently, the application of
+self-supervised learning (SSL) in vision tasks has gained significant
+attention. The intuition behind SSL is to exploit the synchronous
+relationships within the data as a form of self-supervision, which can be
+versatile. In the current big data era, most of the data is unlabeled, and the
+success of SSL thus relies on finding ways to exploit this vast amount of
+unlabeled data. It is therefore preferable for deep learning algorithms to
+reduce reliance on human supervision and instead focus on self-supervision
+based on the inherent relationships within the data. With the advent of ViTs,
+which have achieved remarkable results in computer vision, it is crucial to
+explore and understand the various SSL mechanisms employed for training these
+models, specifically in scenarios where little labeled data is available. In
+this survey, we develop a comprehensive taxonomy that systematically
+classifies SSL techniques according to their representations and the
+pre-training tasks applied. Additionally, we discuss the motivations behind
+SSL, review popular pre-training tasks, and highlight the challenges and
+advancements in this field. Furthermore, we present a comparative analysis of
+different SSL methods, evaluate their strengths and limitations, and identify
+potential avenues for future research.
+
+
+ comment: 34 Pages, 5 Figures, 7 Tables +
+
+
+
+
+ + ☆ LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality + Assessment Model + + +
+ Recent advancements in the field of No-Reference Image Quality Assessment +(NR-IQA) using deep learning techniques demonstrate high performance across +multiple open-source datasets. However, such models are typically very large +and complex making them not so suitable for real-world deployment, especially +on resource- and battery-constrained mobile devices. To address this +limitation, we propose a compact, lightweight NR-IQA model that achieves +state-of-the-art (SOTA) performance on ECCV AIM UHD-IQA challenge validation +and test datasets while being also nearly 5.7 times faster than the fastest +SOTA model. Our model features a dual-branch architecture, with each branch +separately trained on synthetically and authentically distorted images which +enhances the model's generalizability across different distortion types. To +improve robustness under diverse real-world visual conditions, we additionally +incorporate multiple color spaces during the training process. We also +demonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks +(KANs) for final quality regression as compared to the conventional Multi-Layer +Perceptrons (MLPs). Our evaluation considering various open-source datasets +highlights the practical, high-accuracy, and robust performance of our proposed +lightweight model. Code: https://github.com/nasimjamshidi/LAR-IQA. + +
+
+
+
+
+ + ☆ BTMuda: A Bi-level Multi-source unsupervised domain adaptation framework + for breast cancer diagnosis + + +
+ Deep learning has revolutionized the early detection of breast cancer,
+resulting in a significant decrease in mortality rates. However, difficulties
+in obtaining annotations and huge variations in distribution between training
+sets and real scenes have limited its clinical applications. To address these
+limitations, unsupervised domain adaptation (UDA) methods have been used to
+transfer knowledge from one labeled source domain to the unlabeled target
+domain, yet these approaches suffer from severe domain shift issues and often
+ignore the potential benefits of leveraging multiple relevant sources in
+practical applications. To overcome these issues, in this work, we construct a
+Three-Branch Mixed extractor and propose a Bi-level Multi-source unsupervised
+domain adaptation method called BTMuda for breast cancer diagnosis. Our method
+addresses domain shift by dividing it into two levels: intra-domain and
+inter-domain. To reduce the intra-domain shift, we jointly train a CNN and a
+Transformer as two paths of a domain-mixed feature extractor to obtain robust
+representations rich in both low-level local and high-level global
+information. As for the inter-domain shift, we carefully redesign the
+Transformer into a three-branch architecture with cross-attention and
+distillation, which learns domain-invariant representations from multiple
+domains. Besides, we introduce two alignment modules - one for feature
+alignment and one for classifier alignment - to improve the alignment process.
+Extensive experiments conducted on three public mammographic datasets
+demonstrate that our BTMuda outperforms state-of-the-art methods.
+
+
+
+
+
+ + ☆ Can We Leave Deepfake Data Behind in Training Deepfake Detector? + + +
+ The generalization ability of deepfake detectors is vital for their +applications in real-world scenarios. One effective solution to enhance this +ability is to train the models with manually-blended data, which we termed +"blendfake", encouraging models to learn generic forgery artifacts like +blending boundary. Interestingly, current SoTA methods utilize blendfake +without incorporating any deepfake data in their training process. This is +likely because previous empirical observations suggest that vanilla hybrid +training (VHT), which combines deepfake and blendfake data, results in inferior +performance to methods using only blendfake data (so-called "1+1<2"). +Therefore, a critical question arises: Can we leave deepfake behind and rely +solely on blendfake data to train an effective deepfake detector? Intuitively, +as deepfakes also contain additional informative forgery clues (e.g., deep +generative artifacts), excluding all deepfake data in training deepfake +detectors seems counter-intuitive. In this paper, we rethink the role of +blendfake in detecting deepfakes and formulate the process from "real to +blendfake to deepfake" to be a progressive transition. Specifically, blendfake +and deepfake can be explicitly delineated as the oriented pivot anchors between +"real-to-fake" transitions. The accumulation of forgery information should be +oriented and progressively increasing during this transition process. To this +end, we propose an Oriented Progressive Regularizor (OPR) to establish the +constraints that compel the distribution of anchors to be discretely arranged. +Furthermore, we introduce feature bridging to facilitate the smooth transition +between adjacent anchors. Extensive experiments confirm that our design allows +leveraging forgery information from both blendfake and deepfake effectively and +comprehensively. + +
+
+
+
+
+ + ☆ Text-to-Image Generation Via Energy-Based CLIP + + +
+ Joint Energy Models (JEMs), while drawing significant research attention, +have not been successfully scaled to real-world, high-resolution datasets. We +present EB-CLIP, a novel approach extending JEMs to the multimodal +vision-language domain using CLIP, integrating both generative and +discriminative objectives. For the generative objective, we introduce an +image-text joint-energy function based on Cosine similarity in the CLIP space, +training CLIP to assign low energy to real image-caption pairs and high energy +otherwise. For the discriminative objective, we employ contrastive adversarial +loss, extending the adversarial training objective to the multimodal domain. +EB-CLIP not only generates realistic images from text but also achieves +competitive results on the compositionality benchmark, outperforming leading +methods with fewer parameters. Additionally, we demonstrate the superior +guidance capability of EB-CLIP by enhancing CLIP-based generative frameworks +and converting unconditional diffusion models to text-based ones. Lastly, we +show that EB-CLIP can serve as a more robust evaluation metric for +text-to-image generative tasks than CLIP. + +
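+ The image-text joint energy described above is, at its core, a function of
+cosine similarity in CLIP space: matching pairs should receive low energy. A
+hedged sketch of that idea (the temperature and the schematic training
+objective in the comment are our assumptions, not the paper's exact losses):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def clip_joint_energy(image_emb: torch.Tensor, text_emb: torch.Tensor,
+                      temperature: float = 0.1) -> torch.Tensor:
+    """Low energy for matching image-caption pairs, high energy otherwise."""
+    cos = F.cosine_similarity(image_emb, text_emb, dim=-1)
+    return -cos / temperature                     # E(image, text) = -sim / T
+
+# schematic generative objective: push down the energy of real pairs and push
+# up the energy of pairs with model-generated images, e.g.
+# loss = clip_joint_energy(real_img_emb, cap_emb).mean() \
+#        - clip_joint_energy(sampled_img_emb, cap_emb).mean()
+```
+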
+
+
+
+
+ + ☆ CP-VoteNet: Contrastive Prototypical VoteNet for Few-Shot Point Cloud + Object Detection + + +
+ Few-shot point cloud 3D object detection (FS3D) aims to identify and localise
+objects of novel classes from point clouds, using knowledge learnt from
+annotated base classes and novel classes with very few annotations. Thus far,
+this challenging task has been approached using prototype learning, but the
+performance remains far from satisfactory. We find that in existing methods,
+the prototypes are only loosely constrained and lack fine-grained awareness of
+the semantic and geometrical correlations embedded within the point cloud
+space. To mitigate these issues, we propose to leverage the inherent
+contrastive relationship within the semantic and geometrical subspaces to
+learn more refined and generalisable prototypical representations. To this
+end, we first introduce contrastive semantics mining, which enables the
+network to extract discriminative categorical features by constructing
+positive and negative pairs within training batches. Meanwhile, since point
+features representing local patterns can be clustered into geometric
+components, we further propose to impose contrastive relationships at the
+primitive level. Through refined primitive geometric structures, the
+transferability of feature encoding from base to novel classes is
+significantly enhanced. The above designs and insights lead to our novel
+Contrastive Prototypical VoteNet (CP-VoteNet). Extensive experiments on two
+FS3D benchmarks, FS-ScanNet and FS-SUNRGBD, demonstrate that CP-VoteNet
+surpasses current state-of-the-art methods by considerable margins across
+different FS3D settings. Further ablation studies corroborate the rationale
+and effectiveness of our designs.
+
+
+ comment: Accepted by PRCV 2024 +
+
+
+
+
+ + ☆ ConDense: Consistent 2D/3D Pre-training for Dense and Sparse Features + from Multi-View Images ECCV 2024 + + +
+ To advance the state of the art in the creation of 3D foundation models, this +paper introduces the ConDense framework for 3D pre-training utilizing existing +pre-trained 2D networks and large-scale multi-view datasets. We propose a novel +2D-3D joint training scheme to extract co-embedded 2D and 3D features in an +end-to-end pipeline, where 2D-3D feature consistency is enforced through a +volume rendering NeRF-like ray marching process. Using dense per pixel features +we are able to 1) directly distill the learned priors from 2D models to 3D +models and create useful 3D backbones, 2) extract more consistent and less +noisy 2D features, 3) formulate a consistent embedding space where 2D, 3D, and +other modalities of data (e.g., natural language prompts) can be jointly +queried. Furthermore, besides dense features, ConDense can be trained to +extract sparse features (e.g., key points), also with 2D-3D consistency -- +condensing 3D NeRF representations into compact sets of decorated key points. +We demonstrate that our pre-trained model provides good initialization for +various 3D tasks including 3D classification and segmentation, outperforming +other 3D pre-training methods by a significant margin. It also enables, by +exploiting our sparse features, additional useful downstream tasks, such as +matching 2D images to 3D scenes, detecting duplicate 3D scenes, and querying a +repository of 3D scenes through natural language -- all quite efficiently and +without any per-scene fine-tuning. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Disease Classification and Impact of Pretrained Deep Convolution Neural + Networks on Diverse Medical Imaging Datasets across Imaging Modalities + + +
+ Imaging techniques such as chest X-rays, whole-slide images, and optical
+coherence tomography serve as the initial screening and detection tools for a
+wide variety of pulmonary, histopathological, and ophthalmic conditions,
+respectively. This paper investigates the intricacies of using pretrained deep
+convolutional neural networks with transfer learning across diverse medical
+imaging datasets with varying modalities for binary and multiclass
+classification. We conducted a comprehensive performance analysis with ten
+network architectures and model families, each with pretraining and random
+initialization. Our findings show that using pretrained models as fixed
+feature extractors yields poor performance irrespective of the dataset; in
+contrast, histopathology microscopy whole-slide images achieve better
+performance. It is also found that deeper and more complex architectures do
+not necessarily result in the best performance. This observation implies that
+improvements on ImageNet do not transfer in parallel to medical imaging tasks.
+Within a medical domain, the performance of network architectures varies
+within model families as datasets shift. This indicates that the performance
+of models within a specific modality may not be conclusive for another
+modality within the same domain. This study provides a deeper understanding of
+the applications of deep learning techniques in medical imaging and highlights
+the impact of pretrained networks across different medical imaging datasets
+under five different experimental settings.
+
+
+ comment: 15 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ Retrieval-Augmented Natural Language Reasoning for Explainable Visual + Question Answering ICIP + + +
+ The Visual Question Answering with Natural Language Explanation (VQA-NLE)
+task is challenging due to its high demand for reasoning-based inference.
+Recent VQA-NLE studies focus on enhancing model networks to amplify the
+model's reasoning capability, but this approach is resource-consuming and
+unstable. In this work, we introduce a new VQA-NLE model, ReRe
+(Retrieval-augmented natural language Reasoning), which leverages retrieval
+information from memory to aid in generating accurate answers and persuasive
+explanations without relying on complex networks and extra datasets. ReRe is
+an encoder-decoder architecture model, using a pre-trained CLIP vision encoder
+and a pre-trained GPT-2 language model as a decoder. Cross-attention layers
+are added in the GPT-2 for processing retrieval features. ReRe outperforms
+previous methods in VQA accuracy and explanation score, and produces more
+persuasive and reliable natural language explanations.
+
+
+ comment: ICIP Workshop 2024 +
+
+
+
+
+ + ☆ Efficient Camera Exposure Control for Visual Odometry via Deep + Reinforcement Learning + + +
+ The stability of visual odometry (VO) systems is undermined by degraded image +quality, especially in environments with significant illumination changes. This +study employs a deep reinforcement learning (DRL) framework to train agents for +exposure control, aiming to enhance imaging performance in challenging +conditions. A lightweight image simulator is developed to facilitate the +training process, enabling the diversification of image exposure and sequence +trajectory. This setup enables completely offline training, eliminating the +need for direct interaction with camera hardware and the real environments. +Different levels of reward functions are crafted to enhance the VO systems, +equipping the DRL agents with varying intelligence. Extensive experiments have +shown that our exposure control agents achieve superior efficiency-with an +average inference duration of 1.58 ms per frame on a CPU-and respond more +quickly than traditional feedback control schemes. By choosing an appropriate +reward function, agents acquire an intelligent understanding of motion trends +and anticipate future illumination changes. This predictive capability allows +VO systems to deliver more stable and precise odometry results. The codes and +datasets are available at https://github.com/ShuyangUni/drl_exposure_ctrl. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ AdaptVision: Dynamic Input Scaling in MLLMs for Versatile Scene + Understanding + + +
+ Over the past few years, the advancement of Multimodal Large Language
+Models (MLLMs) has captured wide interest among researchers, leading to
+numerous innovations to enhance MLLMs' comprehension. In this paper, we
+present AdaptVision, a multimodal large language model specifically designed
+to dynamically process input images at varying resolutions. We hypothesize
+that the requisite number of visual tokens for the model is contingent upon
+both the resolution and content of the input image. Generally, natural
+images with a lower information density can be effectively interpreted by
+the model using fewer visual tokens at reduced resolutions. In contrast,
+images containing textual content, such as documents with rich text,
+necessitate a higher number of visual tokens for accurate text
+interpretation due to their higher information density. Building on this
+insight, we devise a dynamic image partitioning module that adjusts the
+number of visual tokens according to the size and aspect ratio of images.
+This method mitigates distortion effects that arise from resizing images to
+a uniform resolution and dynamically optimizes the visual tokens fed to the
+LLM. Our model is capable of processing images with resolutions up to
+$1008\times 1008$. Extensive experiments across various datasets demonstrate
+that our method achieves impressive performance in handling vision-language
+tasks in both natural and text-related scenes. The source code and dataset
+are now publicly available at \url{https://github.com/harrytea/AdaptVision}.
+
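+ To make the partitioning idea concrete, here is a toy sketch of choosing a
+tiling grid whose size (and hence visual token count) scales with the input
+resolution and aspect ratio, capped by the 1008x1008 budget mentioned above.
+The 336-pixel tile size and the cap handling are assumptions, not
+AdaptVision's actual module.
+
+from math import ceil
+
+def dynamic_partition(width: int, height: int, tile: int = 336, max_side: int = 1008):
+    cols = min(ceil(width / tile), max_side // tile)
+    rows = min(ceil(height / tile), max_side // tile)
+    return rows, cols, rows * cols  # more crops -> more visual tokens for the LLM
+
+print(dynamic_partition(640, 480))    # small natural image -> few tiles
+print(dynamic_partition(1008, 1008))  # dense document page -> full grid
+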
+
+
+
+
+ + ☆ 2DGH: 2D Gaussian-Hermite Splatting for High-quality Rendering and + Better Geometry Reconstruction + + +
+ 2D Gaussian Splatting has recently emerged as a significant method in 3D +reconstruction, enabling novel view synthesis and geometry reconstruction +simultaneously. While the well-known Gaussian kernel is broadly used, its lack +of anisotropy and deformation ability leads to dim and vague edges at object +silhouettes, limiting the reconstruction quality of current Gaussian splatting +methods. To enhance the representation power, we draw inspiration from quantum +physics and propose to use the Gaussian-Hermite kernel as the new primitive in +Gaussian splatting. The new kernel takes a unified mathematical form and +extends the Gaussian function, which serves as the zero-rank term in the +updated formulation. Our experiments demonstrate the extraordinary performance +of Gaussian-Hermite kernel in both geometry reconstruction and novel-view +synthesis tasks. The proposed kernel outperforms traditional Gaussian Splatting +kernels, showcasing its potential for high-quality 3D reconstruction and +rendering. + +
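+ A small numerical illustration of the kernel family discussed above: the
+order-0 Gaussian-Hermite function is the plain Gaussian, while higher orders
+add the extra lobes that give the primitive more deformation ability. This
+only visualizes 1D profiles with NumPy and is not the paper's splatting
+code; the scale and orders are arbitrary.
+
+import numpy as np
+from numpy.polynomial.hermite import hermval
+
+def gaussian_hermite(x, order: int, sigma: float = 1.0):
+    coeffs = np.zeros(order + 1)
+    coeffs[order] = 1.0  # select the physicists' Hermite polynomial H_order
+    return hermval(x / sigma, coeffs) * np.exp(-(x / sigma) ** 2 / 2)
+
+x = np.linspace(-4, 4, 9)
+print(gaussian_hermite(x, order=0))  # order 0 recovers the standard Gaussian profile
+print(gaussian_hermite(x, order=2))  # higher orders yield richer, deformable profiles
+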
+
+
+
+
+ + ☆ Cross Fusion RGB-T Tracking with Bi-directional Adapter + + +
+ Many state-of-the-art RGB-T trackers have achieved remarkable results
+through modality fusion. However, these trackers often either overlook
+temporal information or fail to fully utilize it, resulting in an
+ineffective balance between multi-modal and temporal information. To address
+this issue, we propose a novel Cross Fusion RGB-T Tracking architecture
+(CFBT) that ensures the full participation of multiple modalities in
+tracking while dynamically fusing temporal information. The effectiveness of
+CFBT relies on three newly designed cross spatio-temporal information fusion
+modules: Cross Spatio-Temporal Augmentation Fusion (CSTAF), Cross
+Spatio-Temporal Complementarity Fusion (CSTCF), and Dual-Stream
+Spatio-Temporal Adapter (DSTA). CSTAF employs a cross-attention mechanism to
+comprehensively enhance the feature representation of the template. CSTCF
+utilizes complementary information between different branches to enhance
+target features and suppress background features. DSTA adopts the adapter
+concept to adaptively fuse complementary information from multiple branches
+within the transformer layer, using the RGB modality as a medium. These
+fusions of multiple perspectives introduce less than 0.3\% of the total
+model parameters, yet they enable an effective balance between multi-modal
+and temporal information. Extensive experiments on three popular RGB-T
+tracking benchmarks demonstrate that our method achieves new
+state-of-the-art performance.
+
+
+
+
+
+ + ☆ Synthetic Lunar Terrain: A Multimodal Open Dataset for Training and + Evaluating Neuromorphic Vision Algorithms + + +
+ Synthetic Lunar Terrain (SLT) is an open dataset collected from an analogue
+test site for lunar missions, featuring synthetic craters in a high-contrast
+lighting setup. It includes several side-by-side captures from event-based
+and conventional RGB cameras, supplemented with a high-resolution 3D laser
+scan for depth estimation. The event stream recorded from the neuromorphic
+vision sensor of the event-based camera is of particular interest, as this
+emerging technology provides several unique advantages, such as high data
+rates, low energy consumption and resilience towards scenes of high dynamic
+range. SLT provides a solid foundation to analyse the limits of RGB cameras
+and the potential advantages or synergies of utilizing neuromorphic vision,
+with the goal of enabling and improving lunar-specific applications such as
+rover navigation and landing in cratered environments.
+
+
+ comment: 7 pages, 5 figures, to be published at "International Symposium on
+  Artificial Intelligence, Robotics and Automation in Space, i-SAIRAS, 2024"
+
+
+
+
+ + ☆ Contrastive Learning with Synthetic Positives + + +
+ Contrastive learning with the nearest neighbor has proved to be one of the
+most efficient self-supervised learning (SSL) techniques by utilizing the
+similarity of multiple instances within the same class. However, its
+efficacy is constrained as the nearest neighbor algorithm primarily
+identifies ``easy'' positive pairs, where the representations are already
+closely located in the embedding space. In this paper, we introduce a novel
+approach called Contrastive Learning with Synthetic Positives (CLSP) that
+utilizes synthetic images, generated by an unconditional diffusion model, as
+additional positives to help the model learn from diverse positives. Through
+feature interpolation in the diffusion model sampling process, we generate
+images with distinct backgrounds yet similar semantic content to the anchor
+image. These images are considered ``hard'' positives for the anchor image,
+and when included as supplementary positives in the contrastive loss, they
+contribute to a performance improvement of over 2\% and 1\% in linear
+evaluation compared to the previous NNCLR and All4One methods across
+multiple benchmark datasets such as CIFAR10, achieving state-of-the-art
+performance. On transfer learning benchmarks, CLSP outperforms existing SSL
+frameworks on 6 out of 8 downstream datasets. We believe CLSP establishes a
+valuable baseline for future SSL studies incorporating synthetic data in the
+training process.
+
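+ The sketch below shows one way the extra positive could enter an
+NT-Xent-style objective: each anchor is pulled toward both its augmented
+view and a diffusion-generated synthetic view. The tensor names, the
+temperature, and the equal weighting of the two positives are assumptions;
+this is not the CLSP training code.
+
+import torch
+import torch.nn.functional as F
+
+def loss_with_synthetic_positive(anchor, augmented, synthetic, temperature=0.1):
+    # anchor, augmented, synthetic: (N, D) L2-normalized embeddings
+    n = anchor.size(0)
+    z = torch.cat([anchor, augmented, synthetic], dim=0)
+    sim = anchor @ z.t() / temperature                     # (N, 3N) similarities
+    sim[torch.arange(n), torch.arange(n)] = float('-inf')  # drop self-similarity
+    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+    idx = torch.arange(n)
+    pos = 0.5 * (log_prob[idx, idx + n] + log_prob[idx, idx + 2 * n])
+    return -pos.mean()
+
+a, b, s = [F.normalize(torch.randn(8, 128), dim=1) for _ in range(3)]
+print(loss_with_synthetic_positive(a, b, s))
+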
+
+ comment: 8 pages, conference +
+
+
+
+
+ + ☆ Causal Representation-Based Domain Generalization on Gaze Estimation + + +
+ The availability of extensive datasets containing gaze information for each
+subject has significantly enhanced gaze estimation accuracy. However, the
+discrepancy between domains severely affects the performance of a model
+trained explicitly for a particular domain. In this paper, we propose the
+Causal Representation-Based Domain Generalization on Gaze Estimation (CauGE)
+framework, designed on the general principle of causal mechanisms, which is
+consistent with the domain difference. We employ adversarial training and an
+additional penalizing term to extract domain-invariant features. After
+extracting features, we position an attention layer to make the features
+sufficient for inferring the actual gaze. By leveraging these modules, CauGE
+ensures that the neural networks learn from representations that meet the
+causal mechanisms' general principles. As a result, CauGE generalizes across
+domains by extracting domain-invariant features, and spurious correlations
+cannot influence the model. Our method achieves state-of-the-art performance
+on the domain generalization gaze estimation benchmark.
+
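+ Adversarial training for domain-invariant features is often implemented
+with a gradient reversal layer, as sketched below. The layer sizes, domain
+count, and lambda value are assumptions, and this is a generic illustration
+of the idea rather than the CauGE architecture.
+
+import torch
+from torch import nn
+from torch.autograd import Function
+
+class GradReverse(Function):
+    @staticmethod
+    def forward(ctx, x, lam):
+        ctx.lam = lam
+        return x.view_as(x)
+    @staticmethod
+    def backward(ctx, grad_output):
+        return -ctx.lam * grad_output, None  # reversed gradient pushes the encoder to hide domain cues
+
+class DomainAdversarialHead(nn.Module):
+    def __init__(self, feat_dim=256, num_domains=3, lam=1.0):
+        super().__init__()
+        self.lam = lam
+        self.classifier = nn.Sequential(nn.Linear(feat_dim, 128), nn.ReLU(),
+                                        nn.Linear(128, num_domains))
+    def forward(self, features):
+        return self.classifier(GradReverse.apply(features, self.lam))
+
+print(DomainAdversarialHead()(torch.randn(4, 256)).shape)  # (4, 3) domain logits
+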
+
+
+
+
+ + ☆ HiTSR: A Hierarchical Transformer for Reference-based Super-Resolution + + +
+ In this paper, we propose HiTSR, a hierarchical transformer model for +reference-based image super-resolution, which enhances low-resolution input +images by learning matching correspondences from high-resolution reference +images. Diverging from existing multi-network, multi-stage approaches, we +streamline the architecture and training pipeline by incorporating the double +attention block from GAN literature. Processing two visual streams +independently, we fuse self-attention and cross-attention blocks through a +gating attention strategy. The model integrates a squeeze-and-excitation module +to capture global context from the input images, facilitating long-range +spatial interactions within window-based attention blocks. Long skip +connections between shallow and deep layers further enhance information flow. +Our model demonstrates superior performance across three datasets including +SUN80, Urban100, and Manga109. Specifically, on the SUN80 dataset, our model +achieves PSNR/SSIM values of 30.24/0.821. These results underscore the +effectiveness of attention mechanisms in reference-based image +super-resolution. The transformer-based model attains state-of-the-art results +without the need for purpose-built subnetworks, knowledge distillation, or +multi-stage training, emphasizing the potency of attention in meeting +reference-based image super-resolution requirements. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2307.08837 +
+
+
+
+
+ + ☆ Transient Fault Tolerant Semantic Segmentation for Autonomous Driving ECCV 2024 + + +
+ Deep learning models are crucial for autonomous vehicle perception, but their +reliability is challenged by algorithmic limitations and hardware faults. We +address the latter by examining fault-tolerance in semantic segmentation +models. Using established hardware fault models, we evaluate existing hardening +techniques both in terms of accuracy and uncertainty and introduce ReLUMax, a +novel simple activation function designed to enhance resilience against +transient faults. ReLUMax integrates seamlessly into existing architectures +without time overhead. Our experiments demonstrate that ReLUMax effectively +improves robustness, preserving performance and boosting prediction confidence, +thus contributing to the development of reliable autonomous driving systems. + +
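+ One plausible reading of a hardened activation of this kind is a ReLU whose
+output is clipped at a profiled per-layer upper bound, so that a transient
+bit flip cannot produce an arbitrarily large activation. The sketch below is
+our interpretation with an assumed bound, not the paper's ReLUMax
+implementation.
+
+import torch
+from torch import nn
+
+class ClampedReLU(nn.Module):
+    def __init__(self, max_value: float = 6.0):  # the bound would be profiled per layer
+        super().__init__()
+        self.max_value = max_value
+    def forward(self, x):
+        return torch.clamp(x, min=0.0, max=self.max_value)  # suppress fault-inflated activations
+
+x = torch.tensor([-1.0, 2.0, 1e9])  # 1e9 mimics a bit-flip-corrupted activation
+print(ClampedReLU()(x))             # tensor([0., 2., 6.])
+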
+
+ comment: Accepted ECCV 2024 UnCV Workshop - + https://github.com/iurada/neutron-segmentation +
+
+
+
+
+ + ♻ ☆ Frankenstein: Generating Semantic-Compositional 3D Scenes in One + Tri-Plane SIGGRAPH + + +
+ We present Frankenstein, a diffusion-based framework that can generate
+semantic-compositional 3D scenes in a single pass. Unlike existing methods
+that output a single, unified 3D shape, Frankenstein simultaneously
+generates multiple separated shapes, each corresponding to a semantically
+meaningful part. The 3D scene information is encoded in a single tri-plane
+tensor, from which multiple Signed Distance Function (SDF) fields can be
+decoded to represent the compositional shapes. During training, an
+auto-encoder compresses tri-planes into a latent space, and then the
+denoising diffusion process is employed to approximate the distribution of
+the compositional scenes. Frankenstein demonstrates promising results in
+generating room interiors as well as human avatars with automatically
+separated parts. The generated scenes facilitate many downstream
+applications, such as part-wise re-texturing, object rearrangement in the
+room or avatar cloth re-targeting. Our project page is available at:
+https://wolfball.github.io/frankenstein/.
+
+
+ comment: SIGGRAPH Asia 2024 Conference Paper +
+
+
+
+
+ + ♻ ☆ RT-GS2: Real-Time Generalizable Semantic Segmentation for 3D Gaussian + Representations of Radiance Fields BMVC 2024 + + +
+ Gaussian Splatting has revolutionized the world of novel view synthesis by +achieving high rendering performance in real-time. Recently, studies have +focused on enriching these 3D representations with semantic information for +downstream tasks. In this paper, we introduce RT-GS2, the first generalizable +semantic segmentation method employing Gaussian Splatting. While existing +Gaussian Splatting-based approaches rely on scene-specific training, RT-GS2 +demonstrates the ability to generalize to unseen scenes. Our method adopts a +new approach by first extracting view-independent 3D Gaussian features in a +self-supervised manner, followed by a novel View-Dependent / View-Independent +(VDVI) feature fusion to enhance semantic consistency over different views. +Extensive experimentation on three different datasets showcases RT-GS2's +superiority over the state-of-the-art methods in semantic segmentation quality, +exemplified by a 8.01% increase in mIoU on the Replica dataset. Moreover, our +method achieves real-time performance of 27.03 FPS, marking an astonishing 901 +times speedup compared to existing approaches. This work represents a +significant advancement in the field by introducing, to the best of our +knowledge, the first real-time generalizable semantic segmentation method for +3D Gaussian representations of radiance fields. + +
+
+ comment: Accepted paper at BMVC 2024 +
+
+
+
+
+ + ♻ ☆ A Permuted Autoregressive Approach to Word-Level Recognition for Urdu + Digital Text + + +
+ This research paper introduces a novel word-level Optical Character +Recognition (OCR) model specifically designed for digital Urdu text, leveraging +transformer-based architectures and attention mechanisms to address the +distinct challenges of Urdu script recognition, including its diverse text +styles, fonts, and variations. The model employs a permuted autoregressive +sequence (PARSeq) architecture, which enhances its performance by enabling +context-aware inference and iterative refinement through the training of +multiple token permutations. This method allows the model to adeptly manage +character reordering and overlapping characters, commonly encountered in Urdu +script. Trained on a dataset comprising approximately 160,000 Urdu text images, +the model demonstrates a high level of accuracy in capturing the intricacies of +Urdu script, achieving a CER of 0.178. Despite ongoing challenges in handling +certain text variations, the model exhibits superior accuracy and effectiveness +in practical applications. Future work will focus on refining the model through +advanced data augmentation techniques and the integration of context-aware +language models to further enhance its performance and robustness in Urdu text +recognition. + +
+
+
+
+
+ + ♻ ☆ DeformGS: Scene Flow in Highly Deformable Scenes for Deformable Object + Manipulation + + +
+ Teaching robots to fold, drape, or reposition deformable objects such as +cloth will unlock a variety of automation applications. While remarkable +progress has been made for rigid object manipulation, manipulating deformable +objects poses unique challenges, including frequent occlusions, +infinite-dimensional state spaces and complex dynamics. Just as object pose +estimation and tracking have aided robots for rigid manipulation, dense 3D +tracking (scene flow) of highly deformable objects will enable new applications +in robotics while aiding existing approaches, such as imitation learning or +creating digital twins with real2sim transfer. We propose DeformGS, an approach +to recover scene flow in highly deformable scenes, using simultaneous video +captures of a dynamic scene from multiple cameras. DeformGS builds on recent +advances in Gaussian splatting, a method that learns the properties of a large +number of Gaussians for state-of-the-art and fast novel-view synthesis. +DeformGS learns a deformation function to project a set of Gaussians with +canonical properties into world space. The deformation function uses a +neural-voxel encoding and a multilayer perceptron (MLP) to infer Gaussian +position, rotation, and a shadow scalar. We enforce physics-inspired +regularization terms based on conservation of momentum and isometry, which +leads to trajectories with smaller trajectory errors. We also leverage existing +foundation models SAM and XMEM to produce noisy masks, and learn a per-Gaussian +mask for better physics-inspired regularization. DeformGS achieves high-quality +3D tracking on highly deformable scenes with shadows and occlusions. In +experiments, DeformGS improves 3D tracking by an average of 55.8% compared to +the state-of-the-art. With sufficient texture, DeformGS achieves a median +tracking error of 3.3 mm on a cloth of 1.5 x 1.5 m in area. Website: +https://deformgs.github.io + +
+
+
+
+
+ + ♻ ☆ OpticalRS-4M: Scaling Efficient Masked Autoencoder Learning on Large + Remote Sensing Dataset + + +
+ Masked Image Modeling (MIM) has become an essential method for building +foundational visual models in remote sensing (RS). However, the limitations in +size and diversity of existing RS datasets restrict the ability of MIM methods +to learn generalizable representations. Additionally, conventional MIM +techniques, which require reconstructing all tokens, introduce unnecessary +computational overhead. To address these issues, we present a new pre-training +pipeline for RS models, featuring the creation of a large-scale RS dataset and +an efficient MIM approach. We curated a high-quality dataset named OpticalRS-4M +by collecting publicly available RS datasets and processing them through +exclusion, slicing, and deduplication. OpticalRS-4M comprises 4 million optical +images covering various RS tasks, such as object detection and pixel +segmentation. To enhance efficiency, we propose SelectiveMAE, a pre-training +method that dynamically encodes and reconstructs semantically rich patch +tokens, thereby reducing the inefficiencies of traditional MIM models caused by +redundant background pixels in RS images. Extensive experiments demonstrate +that OpticalRS-4M significantly improves classification, detection, and +segmentation performance, while SelectiveMAE increases training efficiency over +2 times. This highlights the effectiveness and scalability of our pipeline in +developing RS foundational models. + +
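+ A toy sketch of the selective-reconstruction intuition: score each patch
+token with a cheap "richness" measure and keep only the top fraction,
+skipping redundant background patches. The variance score and keep ratio are
+assumptions; SelectiveMAE's actual selection criterion may differ.
+
+import torch
+
+def select_rich_patches(patches: torch.Tensor, keep_ratio: float = 0.4):
+    # patches: (N, L, D) flattened image patch embeddings
+    richness = patches.var(dim=-1)                     # crude per-patch information score
+    k = max(1, int(keep_ratio * patches.size(1)))
+    idx = richness.topk(k, dim=1).indices              # indices of the most informative tokens
+    return torch.gather(patches, 1, idx.unsqueeze(-1).expand(-1, -1, patches.size(-1)))
+
+print(select_rich_patches(torch.randn(2, 196, 768)).shape)  # (2, 78, 768)
+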
+
+
+
+
+ + ♻ ☆ Docling Technical Report + + +
+ This technical report introduces Docling, an easy-to-use, self-contained,
+MIT-licensed open-source package for PDF document conversion. It is powered
+by state-of-the-art specialized AI models for layout analysis (DocLayNet)
+and table structure recognition (TableFormer), and runs efficiently on
+commodity hardware within a small resource budget. The code interface allows
+for easy extensibility and the addition of new features and models.
+
+
+
+
+
+ + ♻ ☆ Foundational Models for Pathology and Endoscopy Images: Application for + Gastric Inflammation + + +
+ The integration of artificial intelligence (AI) in medical diagnostics
+represents a significant advancement in managing upper gastrointestinal (GI)
+cancer, a major cause of global cancer mortality. Specifically for gastric
+cancer (GC), chronic inflammation causes changes in the mucosa such as
+atrophy, intestinal metaplasia (IM), dysplasia and ultimately cancer. Early
+detection through regular endoscopic surveillance is essential for better
+outcomes. Foundation models (FM), which are machine or deep learning models
+trained on diverse data and applicable to broad use cases, offer a promising
+solution to enhance the accuracy of endoscopy and its subsequent pathology
+image analysis. This review explores the recent advancements, applications,
+and challenges associated with FM in endoscopy and pathology imaging. We
+start by elucidating the core principles and architectures underlying these
+models, including their training methodologies and the pivotal role of
+large-scale data in developing their predictive capabilities. Moreover, this
+work discusses emerging trends and future research directions, emphasizing
+the integration of multimodal data, the development of more robust and
+equitable models, and the potential for real-time diagnostic support. This
+review aims to provide a roadmap for researchers and practitioners in
+navigating the complexities of incorporating FM into clinical practice for
+the prevention and management of GC cases, thereby improving patient
+outcomes.
+
+
+
+
+
+ + ♻ ☆ DreamPhysics: Learning Physical Properties of Dynamic 3D Gaussians with + Video Diffusion Priors + + +
+ Dynamic 3D interaction has been attracting a lot of attention recently. +However, creating such 4D content remains challenging. One solution is to +animate 3D scenes with physics-based simulation, which requires manually +assigning precise physical properties to the object or the simulated results +would become unnatural. Another solution is to learn the deformation of 3D +objects with the distillation of video generative models, which, however, tends +to produce 3D videos with small and discontinuous motions due to the +inappropriate extraction and application of physical prior. In this work, +combining the strengths and complementing shortcomings of the above two +solutions, we propose to learn the physical properties of a material field with +video diffusion priors, and then utilize a physics-based Material-Point-Method +(MPM) simulator to generate 4D content with realistic motions. In particular, +we propose motion distillation sampling to emphasize video motion information +during distillation. Moreover, to facilitate the optimization, we further +propose a KAN-based material field with frame boosting. Experimental results +demonstrate that our method enjoys more realistic motion than +state-of-the-arts. Codes are released at: +https://github.com/tyhuang0428/DreamPhysics. + +
+
+ comment: Codes are released at: https://github.com/tyhuang0428/DreamPhysics +
+
+
+
+
+ + ♻ ☆ L4DR: LiDAR-4DRadar Fusion for Weather-Robust 3D Object Detection + + +
+ LiDAR-based vision systems are integral for 3D object detection, which is +crucial for autonomous navigation. However, they suffer from performance +degradation in adverse weather conditions due to the quality deterioration of +LiDAR point clouds. Fusing LiDAR with the weather-robust 4D radar sensor is +expected to solve this problem. However, the fusion of LiDAR and 4D radar is +challenging because they differ significantly in terms of data quality and the +degree of degradation in adverse weather. To address these issues, we introduce +L4DR, a weather-robust 3D object detection method that effectively achieves +LiDAR and 4D Radar fusion. Our L4DR includes Multi-Modal Encoding (MME) and +Foreground-Aware Denoising (FAD) technique to reconcile sensor gaps, which is +the first exploration of the complementarity of early fusion between LiDAR and +4D radar. Additionally, we design an Inter-Modal and Intra-Modal ({IM}2 ) +parallel feature extraction backbone coupled with a Multi-Scale Gated Fusion +(MSGF) module to counteract the varying degrees of sensor degradation under +adverse weather conditions. Experimental evaluation on a VoD dataset with +simulated fog proves that L4DR is more adaptable to changing weather +conditions. It delivers a significant performance increase under different fog +levels, improving the 3D mAP by up to 20.0% over the traditional LiDAR-only +approach. Moreover, the results on the K-Radar dataset validate the consistent +performance improvement of L4DR in real-world adverse weather conditions. + +
+
+
+
+
+ + ♻ ☆ GeoMeter: Probing Depth and Height Perception of Large Visual-Language + Models + + +
+ Geometric understanding is crucial for navigating and interacting with our +environment. While large Vision Language Models (VLMs) demonstrate impressive +capabilities, deploying them in real-world scenarios necessitates a comparable +geometric understanding in visual perception. In this work, we focus on the +geometric comprehension of these models; specifically targeting the depths and +heights of objects within a scene. Our observations reveal that, although VLMs +excel in basic geometric properties perception such as shape and size, they +encounter significant challenges in reasoning about the depth and height of +objects. To address this, we introduce GeoMeter, a suite of benchmark datasets +encompassing Synthetic 2D, Synthetic 3D, and Real-World scenarios to rigorously +evaluate these aspects. We benchmark 17 state-of-the-art VLMs using these +datasets and find that they consistently struggle with both depth and height +perception. Our key insights include detailed analyses of the shortcomings in +depth and height reasoning capabilities of VLMs and the inherent bias present +in these models. This study aims to pave the way for the development of VLMs +with enhanced geometric understanding, crucial for real-world applications. + +
+
+
+
+
+ + ♻ ☆ Revisiting 360 Depth Estimation with PanoGabor: A New Fusion Perspective + + +
+ Depth estimation from a monocular 360 image is important to the perception
+of the entire 3D environment. However, the inherent distortion and large
+field of view (FoV) in 360 images pose great challenges for this task. To
+this end, existing mainstream solutions typically introduce additional
+perspective-based 360 representations (\textit{e.g.}, Cubemap) to achieve
+effective feature extraction. Nevertheless, regardless of the introduced
+representations, they eventually need to be unified into the
+equirectangular projection (ERP) format for the subsequent depth estimation,
+which inevitably reintroduces the troublesome distortions. In this work, we
+propose an oriented distortion-aware Gabor Fusion framework (PGFuse) to
+address the above challenges. First, we introduce Gabor filters that analyze
+texture in the frequency domain, thereby extending the receptive fields and
+enhancing depth cues. To address the reintroduced distortions, we design a
+linear latitude-aware distortion representation method to generate
+customized, distortion-aware Gabor filters (PanoGabor filters). Furthermore,
+we design a channel-wise and spatial-wise unidirectional fusion module
+(CS-UFM) that integrates the proposed PanoGabor filters to unify other
+representations into the ERP format, delivering effective and
+distortion-free features. Considering the orientation sensitivity of the
+Gabor transform, we introduce a spherical gradient constraint to stabilize
+this sensitivity. Experimental results on three popular indoor 360
+benchmarks demonstrate the superiority of the proposed PGFuse over existing
+state-of-the-art solutions. Code will be made available upon acceptance.
+
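+ For readers unfamiliar with Gabor filtering, the sketch below builds a
+small Gabor bank with OpenCV and applies it to an equirectangular-sized
+array. The kernel size, wavelengths, and orientations are arbitrary
+assumptions, and the paper's PanoGabor filters additionally adapt these
+parameters to latitude-dependent ERP distortion.
+
+import cv2
+import numpy as np
+
+def gabor_bank(ksize=31, sigma=4.0, wavelengths=(8, 16), orientations=4):
+    kernels = []
+    for lam in wavelengths:
+        for i in range(orientations):
+            theta = i * np.pi / orientations
+            k = cv2.getGaborKernel((ksize, ksize), sigma, theta, lam, 0.5)
+            kernels.append(k / np.abs(k).sum())  # normalize filter energy
+    return kernels
+
+erp_like = np.random.rand(128, 256)  # stand-in for an ERP feature map
+responses = [cv2.filter2D(erp_like, -1, k) for k in gabor_bank()]
+print(len(responses), responses[0].shape)  # 8 oriented/frequency responses
+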
+
+
+
+
+ + ♻ ☆ Object-Centric Diffusion for Efficient Video Editing ECCV24 + + +
+ Diffusion-based video editing has reached impressive quality and can
+transform the global style, local structure, or attributes of given video
+inputs, following textual edit prompts. However, such solutions typically
+incur heavy memory and computational costs to generate temporally-coherent
+frames, either in the form of diffusion inversion and/or cross-frame
+attention. In this paper, we conduct an analysis of such inefficiencies, and
+suggest simple yet effective modifications that allow significant speed-ups
+whilst maintaining quality. Moreover, we introduce Object-Centric Diffusion,
+to fix generation artifacts and further reduce latency by allocating more
+computations towards foreground edited regions, arguably more important for
+perceptual quality. We achieve this by two novel proposals: i)
+Object-Centric Sampling, decoupling the diffusion steps spent on salient or
+background regions and spending most on the former, and ii) Object-Centric
+Token Merging, which reduces the cost of cross-frame attention by fusing
+redundant tokens in unimportant background regions. Both techniques are
+readily applicable to a given video editing model without retraining, and
+can drastically reduce its memory and computational cost. We evaluate our
+proposals on inversion-based and control-signal-based editing pipelines, and
+show a latency reduction up to 10x for a comparable synthesis quality.
+Project page: qualcomm-ai-research.github.io/object-centric-diffusion.
+
+
+ comment: ECCV24 +
+
+
+
+
+ + ♻ ☆ CaFNet: A Confidence-Driven Framework for Radar Camera Depth Estimation IROS 2024 + + +
+ Depth estimation is critical in autonomous driving for interpreting 3D
+scenes accurately. Recently, radar-camera depth estimation has attracted
+considerable interest due to the robustness and low cost of radar. Thus,
+this paper introduces a two-stage, end-to-end trainable Confidence-aware
+Fusion Net (CaFNet) for dense depth estimation, combining RGB imagery with
+sparse and noisy radar point cloud data. The first stage addresses
+radar-specific challenges, such as ambiguous elevation and noisy
+measurements, by predicting a radar confidence map and a preliminary coarse
+depth map. A novel approach is presented for generating the ground truth for
+the confidence map, which involves associating each radar point with its
+corresponding object to identify potential projection surfaces. These maps,
+together with the initial radar input, are processed by a second encoder.
+For the final depth estimation, we introduce a confidence-aware gated fusion
+mechanism to integrate radar and image features effectively, thereby
+enhancing the reliability of the depth map by filtering out radar noise. Our
+methodology, evaluated on the nuScenes dataset, demonstrates superior
+performance, improving upon the current leading model by 3.2% in Mean
+Absolute Error (MAE) and 2.7% in Root Mean Square Error (RMSE). Code:
+https://github.com/harborsarah/CaFNet
+
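+ Our reading of the gated-fusion step is sketched below: a predicted radar
+confidence map first down-weights unreliable radar features, and a learned
+gate then decides how much of them to mix into the image branch. The channel
+sizes and the exact gating form are assumptions, not the CaFNet code.
+
+import torch
+from torch import nn
+
+class ConfidenceGatedFusion(nn.Module):
+    def __init__(self, channels: int = 64):
+        super().__init__()
+        self.gate = nn.Sequential(nn.Conv2d(2 * channels, channels, 3, padding=1), nn.Sigmoid())
+        self.fuse = nn.Conv2d(2 * channels, channels, 3, padding=1)
+    def forward(self, img_feat, radar_feat, radar_conf):
+        radar_feat = radar_feat * radar_conf                 # suppress low-confidence radar responses
+        g = self.gate(torch.cat([img_feat, radar_feat], 1))  # per-pixel fusion weight
+        return self.fuse(torch.cat([img_feat, g * radar_feat], 1))
+
+img, radar = torch.randn(1, 64, 56, 56), torch.randn(1, 64, 56, 56)
+conf = torch.rand(1, 1, 56, 56)
+print(ConfidenceGatedFusion()(img, radar, conf).shape)  # (1, 64, 56, 56)
+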
+
+ comment: Accepted by IROS 2024 +
+
+
+
+
+ + ♻ ☆ Addressing the challenges of loop detection in agricultural environments + + +
+ While visual SLAM systems are well studied and achieve impressive results in +indoor and urban settings, natural, outdoor and open-field environments are +much less explored and still present relevant research challenges. Visual +navigation and local mapping have shown a relatively good performance in +open-field environments. However, globally consistent mapping and long-term +localization still depend on the robustness of loop detection and closure, for +which the literature is scarce. In this work we propose a novel method to pave +the way towards robust loop detection in open fields, particularly in +agricultural settings, based on local feature search and stereo geometric +refinement, with a final stage of relative pose estimation. Our method +consistently achieves good loop detections, with a median error of 15cm. We aim +to characterize open fields as a novel environment for loop detection, +understanding the limitations and problems that arise when dealing with them. + +
+
+
+
+
+ + ♻ ☆ Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active + Image Classification ECML + + +
+ Deep active learning (AL) seeks to minimize the annotation costs for training +deep neural networks. BAIT, a recently proposed AL strategy based on the Fisher +Information, has demonstrated impressive performance across various datasets. +However, BAIT's high computational and memory requirements hinder its +applicability on large-scale classification tasks, resulting in current +research neglecting BAIT in their evaluation. This paper introduces two methods +to enhance BAIT's computational efficiency and scalability. Notably, we +significantly reduce its time complexity by approximating the Fisher +Information. In particular, we adapt the original formulation by i) taking the +expectation over the most probable classes, and ii) constructing a binary +classification task, leading to an alternative likelihood for gradient +computations. Consequently, this allows the efficient use of BAIT on +large-scale datasets, including ImageNet. Our unified and comprehensive +evaluation across a variety of datasets demonstrates that our approximations +achieve strong performance with considerably reduced time complexity. +Furthermore, we provide an extensive open-source toolbox that implements recent +state-of-the-art AL strategies, available at +https://github.com/dhuseljic/dal-toolbox. + +
+
+ comment: Accepted at ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ GlyphDraw2: Automatic Generation of Complex Glyph Posters with Diffusion + Models and Large Language Models + + +
+ Posters play a crucial role in marketing and advertising by enhancing
+visual communication and brand visibility, making significant contributions
+to industrial design. With the latest advancements in controllable T2I
+diffusion models, increasing research has focused on rendering text within
+synthesized images. Despite improvements in text rendering accuracy, the
+field of automatic poster generation remains underexplored. In this paper,
+we propose an automatic poster generation framework with text rendering
+capabilities leveraging LLMs, utilizing a triple-cross attention mechanism
+based on alignment learning. This framework aims to create precise poster
+text within a detailed contextual background. Additionally, the framework
+supports controllable fonts, adjustable image resolution, and the rendering
+of posters with descriptions and text in both English and Chinese.
+Furthermore, we introduce a high-resolution font dataset and a poster
+dataset with resolutions exceeding 1024 pixels. Our approach leverages the
+SDXL architecture. Extensive experiments validate our method's capability in
+generating poster images with complex and contextually rich backgrounds.
+Code is available at https://github.com/OPPO-Mente-Lab/GlyphDraw2.
+
+
+
+
+
+ + ♻ ☆ Large coordinate kernel attention network for lightweight image + super-resolution + + +
+ The multi-scale receptive field and large kernel attention (LKA) module
+have been shown to significantly improve performance in the lightweight
+image super-resolution task. However, existing lightweight super-resolution
+(SR) methods seldom pay attention to designing an efficient building block
+with a multi-scale receptive field for local modeling, and their LKA modules
+face a quadratic increase in computational and memory footprints as the
+convolutional kernel size increases. To address the first issue, we propose
+multi-scale blueprint separable convolutions (MBSConv) as a highly efficient
+building block with a multi-scale receptive field; it focuses on learning
+multi-scale information, which is a vital component of discriminative
+representation. As for the second issue, we revisit the key properties of
+LKA and find that the adjacent direct interaction of local information and
+long-distance dependencies is crucial for remarkable performance. Taking
+this into account, and in order to mitigate the complexity of LKA, we
+propose a large coordinate kernel attention (LCKA) module which decomposes
+the 2D convolutional kernels of the depth-wise convolutional layers in LKA
+into horizontal and vertical 1D kernels. LCKA enables the adjacent direct
+interaction of local information and long-distance dependencies not only in
+the horizontal direction but also in the vertical. Besides, LCKA allows for
+the direct use of extremely large kernels in the depth-wise convolutional
+layers to capture more contextual information, which helps to significantly
+improve the reconstruction performance, while incurring lower computational
+complexity and memory footprints. Integrating MBSConv and LCKA, we propose a
+large coordinate kernel attention network (LCAN).
+
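+ The decomposition at the heart of this idea can be illustrated as follows:
+a large k x k depth-wise convolution is replaced by a horizontal 1 x k and a
+vertical k x 1 depth-wise convolution, cutting the per-channel parameter
+count from k*k to roughly 2k. The channel count and kernel size below are
+illustrative, not LCAN's configuration.
+
+import torch
+from torch import nn
+
+class DecomposedLargeKernel(nn.Module):
+    def __init__(self, channels: int = 64, k: int = 31):
+        super().__init__()
+        self.horizontal = nn.Conv2d(channels, channels, (1, k), padding=(0, k // 2), groups=channels)
+        self.vertical = nn.Conv2d(channels, channels, (k, 1), padding=(k // 2, 0), groups=channels)
+    def forward(self, x):
+        return self.vertical(self.horizontal(x))  # ~2k weights per channel instead of k*k
+
+x = torch.randn(1, 64, 56, 56)
+print(DecomposedLargeKernel()(x).shape)  # torch.Size([1, 64, 56, 56])
+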
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Improving Online Source-free Domain Adaptation for Object Detection by + Unsupervised Data Acquisition ECCV + + +
+ Effective object detection in autonomous vehicles is challenged by deployment +in diverse and unfamiliar environments. Online Source-Free Domain Adaptation +(O-SFDA) offers model adaptation using a stream of unlabeled data from a target +domain in an online manner. However, not all captured frames contain +information beneficial for adaptation, especially in the presence of redundant +data and class imbalance issues. This paper introduces a novel approach to +enhance O-SFDA for adaptive object detection through unsupervised data +acquisition. Our methodology prioritizes the most informative unlabeled frames +for inclusion in the online training process. Empirical evaluation on a +real-world dataset reveals that our method outperforms existing +state-of-the-art O-SFDA techniques, demonstrating the viability of unsupervised +data acquisition for improving the adaptive object detector. + +
+
+ comment: Accepted by ECCV workshop ROAM 2024; 12 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ CathAction: A Benchmark for Endovascular Intervention Understanding + + +
+ Real-time visual feedback from catheterization analysis is crucial for
+enhancing surgical safety and efficiency during endovascular interventions.
+However, existing datasets are often limited to specific tasks, small in
+scale, and lack the comprehensive annotations necessary for broader
+endovascular intervention understanding. To tackle these limitations, we
+introduce CathAction, a large-scale dataset for catheterization
+understanding. Our CathAction dataset encompasses approximately 500,000
+annotated frames for catheterization action understanding and collision
+detection, and 25,000 ground truth masks for catheter and guidewire
+segmentation. For each task, we benchmark recent related works in the field.
+We further discuss the challenges of endovascular interventions compared to
+traditional computer vision tasks and point out open research questions. We
+hope that CathAction will facilitate the development of endovascular
+intervention understanding methods that can be applied to real-world
+applications. The dataset is available at
+https://airvlab.github.io/cathaction/.
+
+
+ comment: 10 pages. Webpage: https://airvlab.github.io/cathaction/ +
+
+
+
+
+ + ♻ ☆ Human-Free Automated Prompting for Vision-Language Anomaly Detection: + Prompt Optimization with Meta-guiding Prompt Scheme + + +
+ Pre-trained vision-language models (VLMs) are highly adaptable to various +downstream tasks through few-shot learning, making prompt-based anomaly +detection a promising approach. Traditional methods depend on human-crafted +prompts that require prior knowledge of specific anomaly types. Our goal is to +develop a human-free prompt-based anomaly detection framework that optimally +learns prompts through data-driven methods, eliminating the need for human +intervention. The primary challenge in this approach is the lack of anomalous +samples during the training phase. Additionally, the Vision Transformer +(ViT)-based image encoder in VLMs is not ideal for pixel-wise anomaly +segmentation due to a locality feature mismatch between the original image and +the output feature map. To tackle the first challenge, we have developed the +Object-Attention Anomaly Generation Module (OAGM) to synthesize anomaly samples +for training. Furthermore, our Meta-Guiding Prompt-Tuning Scheme (MPTS) +iteratively adjusts the gradient-based optimization direction of learnable +prompts to avoid overfitting to the synthesized anomalies. For the second +challenge, we propose Locality-Aware Attention, which ensures that each local +patch feature attends only to nearby patch features, preserving the locality +features corresponding to their original locations. This framework allows for +the optimal prompt embeddings by searching in the continuous latent space via +backpropagation, free from human semantic constraints. Additionally, the +modified locality-aware attention improves the precision of pixel-wise anomaly +segmentation. + +
+
+
+
+
+ + ♻ ☆ Many-Worlds Inverse Rendering + + +
+ Discontinuous visibility changes remain a major bottleneck when optimizing
+surfaces within a physically-based inverse renderer. Many previous works
+have proposed sophisticated algorithms and data structures to sample
+visibility silhouettes more efficiently.
+ Our work presents another solution: instead of differentiating a tentative
+surface locally, we differentiate a volumetric perturbation of a surface. We
+refer to this as a many-worlds representation because it models a
+non-interacting superposition of conflicting explanations (worlds) of the
+input dataset. Each world is optically isolated from others, leading to a
+new transport law that distinguishes our method from prior work based on
+exponential random media.
+ The resulting Monte Carlo algorithm is simpler and more efficient than
+prior methods. We demonstrate that our method promotes rapid convergence,
+both in terms of the total iteration count and the cost per iteration.
+
+
+
+
+
+ + ♻ ☆ RoadRunner -- Learning Traversability Estimation for Autonomous Off-road + Driving + + +
+ Autonomous navigation at high speeds in off-road environments necessitates +robots to comprehensively understand their surroundings using onboard sensing +only. The extreme conditions posed by the off-road setting can cause degraded +camera image quality due to poor lighting and motion blur, as well as limited +sparse geometric information available from LiDAR sensing when driving at high +speeds. In this work, we present RoadRunner, a novel framework capable of +predicting terrain traversability and an elevation map directly from camera and +LiDAR sensor inputs. RoadRunner enables reliable autonomous navigation, by +fusing sensory information, handling of uncertainty, and generation of +contextually informed predictions about the geometry and traversability of the +terrain while operating at low latency. In contrast to existing methods relying +on classifying handcrafted semantic classes and using heuristics to predict +traversability costs, our method is trained end-to-end in a self-supervised +fashion. The RoadRunner network architecture builds upon popular sensor fusion +network architectures from the autonomous driving domain, which embed LiDAR and +camera information into a common Bird's Eye View perspective. Training is +enabled by utilizing an existing traversability estimation stack to generate +training data in hindsight in a scalable manner from real-world off-road +driving datasets. Furthermore, RoadRunner improves the system latency by a +factor of roughly 4, from 500 ms to 140 ms, while improving the accuracy for +traversability costs and elevation map predictions. We demonstrate the +effectiveness of RoadRunner in enabling safe and reliable off-road navigation +at high speeds in multiple real-world driving scenarios through unstructured +desert environments. + +
+
+ comment: accepted for IEEE Transactions on Field Robotics (T-FR) +
+
+
+
+
+ + ♻ ☆ Evidential Deep Partial Multi-View Classification With Discount Fusion + + +
+ Incomplete multi-view data classification poses significant challenges due to +the common issue of missing views in real-world scenarios. Despite +advancements, existing methods often fail to provide reliable predictions, +largely due to the uncertainty of missing views and the inconsistent quality of +imputed data. To tackle these problems, we propose a novel framework called +Evidential Deep Partial Multi-View Classification (EDP-MVC). Initially, we use +K-means imputation to address missing views, creating a complete set of +multi-view data. However, the potential conflicts and uncertainties within this +imputed data can affect the reliability of downstream inferences. To manage +this, we introduce a Conflict-Aware Evidential Fusion Network (CAEFN), which +dynamically adjusts based on the reliability of the evidence, ensuring +trustworthy discount fusion and producing reliable inference outcomes. +Comprehensive experiments on various benchmark datasets reveal EDP-MVC not only +matches but often surpasses the performance of state-of-the-art methods. + +
+
+ comment: Ongoing work. 13 pages, 3 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Eyes Can Deceive: Benchmarking Counterfactual Reasoning Abilities of + Multi-modal Large Language Models + + +
+ Counterfactual reasoning, as a crucial manifestation of human intelligence, +refers to making presuppositions based on established facts and extrapolating +potential outcomes. Existing multimodal large language models (MLLMs) have +exhibited impressive cognitive and reasoning capabilities, which have been +examined across a wide range of Visual Question Answering (VQA) benchmarks. +Nevertheless, how will existing MLLMs perform when faced with counterfactual +questions? To answer this question, we first curate a novel +\textbf{C}ounter\textbf{F}actual \textbf{M}ulti\textbf{M}odal reasoning +benchmark, abbreviated as \textbf{CFMM}, to systematically assess the +counterfactual reasoning capabilities of MLLMs. Our CFMM comprises six +challenging tasks, each including hundreds of carefully human-labeled and +GPT-generated counterfactual questions, to evaluate MLLM's counterfactual +reasoning capabilities across diverse aspects. Through experiments, +interestingly, we find that existing MLLMs prefer to believe what they see, but +ignore the counterfactual presuppositions presented in the question, thereby +leading to inaccurate responses. Furthermore, we evaluate a wide range of +prevalent MLLMs on our proposed CFMM. The significant gap between their +performance on our CFMM and that on several VQA benchmarks indicates that there +is still considerable room for improvement in existing MLLMs toward approaching +human-level intelligence. On the other hand, through boosting MLLMs +performances on our CFMM in the future, potential avenues toward developing +MLLMs with advanced intelligence can be explored. + +
+
+
+
+
+ + ♻ ☆ Early Explorations of Lightweight Models for Wound Segmentation on + Mobile Devices + + +
+ The aging population poses numerous challenges to healthcare, including the +increase in chronic wounds in the elderly. The current approach to wound +assessment by therapists based on photographic documentation is subjective, +highlighting the need for computer-aided wound recognition from smartphone +photos. This offers objective and convenient therapy monitoring, while being +accessible to patients from their home at any time. However, despite research +in mobile image segmentation, there is a lack of focus on mobile wound +segmentation. To address this gap, we conduct initial research on three +lightweight architectures to investigate their suitability for smartphone-based +wound segmentation. Using public datasets and UNet as a baseline, our results +are promising, with both ENet and TopFormer, as well as the larger UNeXt +variant, showing comparable performance to UNet. Furthermore, we deploy the +models into a smartphone app for visual assessment of live segmentation, where +results demonstrate the effectiveness of TopFormer in distinguishing wounds +from wound-coloured objects. While our study highlights the potential of +transformer models for mobile wound segmentation, future work should aim to +further improve the mask contours. + +
+
+ comment: Extended version of our paper that was published in the "47th German + Conference on Artificial Intelligence (KI 2024)" +
+
+
+
+
+ + ♻ ☆ Deep Convolutional Framelet Denoising for Panoramic by Mixed Wavelet + Integration + + +
+ Enhancing quality and removing noise during preprocessing is one of the
+most critical steps in image processing. X-ray images are created by photons
+colliding with atoms and by variation in scattered noise absorption. This
+noise degrades the diagnostic quality of the radiograph and, at times,
+necessitates repeat imaging, thereby increasing the patient's effective
+dose. Lowering image noise has therefore consistently been one of the most
+critical challenges in this area. Techniques such as BM3D, low-pass filters,
+and autoencoders have been applied to this task. Owing to their structural
+design and high rate of repetition, neural networks employing diverse
+architectures have, over the past decade, achieved noise reduction with
+satisfactory outcomes, surpassing traditional BM3D and low-pass filters. The
+combination of the Hankel matrix with neural networks is one of these
+configurations. The Hankel matrix aims to identify a local circle by
+separating individual values into local and non-local components, utilizing
+a non-local matrix. A non-local matrix can be created using wavelets or the
+DCT. This paper suggests integrating the wavelet transform with the
+Daubechies (D4) wavelet, due to its higher energy concentration, and employs
+the U-Net neural network architecture, which incorporates the wavelet
+exclusively at each stage. The outcomes were evaluated using the PSNR and
+SSIM criteria and were verified with various wavelets. According to studies
+on other datasets, the effectiveness of the one-wavelet network increased
+from 0.5% to 1.2%.
+
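+ As a minimal illustration of the wavelet side of this pipeline, the snippet
+below computes a 2D Daubechies decomposition with PyWavelets and stacks the
+sub-bands as channels a U-Net stage could consume. The 'db4' wavelet name,
+the random stand-in image, and the stacking are assumptions; the denoising
+network itself is omitted.
+
+import numpy as np
+import pywt
+
+image = np.random.rand(256, 256)            # stand-in for a panoramic X-ray tile
+cA, (cH, cV, cD) = pywt.dwt2(image, 'db4')  # approximation + detail sub-bands
+stacked = np.stack([cA, cH, cV, cD])        # channels for a wavelet-aware U-Net stage
+print(stacked.shape)
+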
+
+
+
+
+ + ♻ ☆ TSAR-MVS: Textureless-aware Segmentation and Correlative Refinement + Guided Multi-View Stereo + + +
+ The reconstruction of textureless areas has long been a challenging problem
+in MVS due to the lack of reliable pixel correspondences between images. In
+this paper, we propose the Textureless-aware Segmentation And Correlative
+Refinement guided Multi-View Stereo (TSAR-MVS), a novel method that
+effectively tackles challenges posed by textureless areas in 3D
+reconstruction through filtering, refinement and segmentation. First, we
+implement joint hypothesis filtering, a technique that merges a confidence
+estimator with a disparity discontinuity detector to eliminate incorrect
+depth estimations. Second, to spread the pixels with confident depth, we
+introduce an iterative correlation refinement strategy that leverages RANSAC
+to generate 3D planes based on superpixels, succeeded by a weighted median
+filter for broadening the influence of accurately determined pixels.
+Finally, we present a textureless-aware segmentation method that leverages
+edge detection and line detection to accurately identify large textureless
+regions for further depth completion. Experiments on the ETH3D, Tanks &
+Temples and Strecha datasets demonstrate the superior performance and strong
+generalization capability of our proposed method.
+
+
+
+
+
+ + ♻ ☆ MSP-MVS: Multi-granularity Segmentation Prior Guided Multi-View Stereo + + +
+ Reconstructing textureless areas in MVS poses challenges due to the absence
+of reliable pixel correspondences within a fixed patch. Although certain
+methods employ patch deformation to expand the receptive field, their
+patches mistakenly skip depth edges to calculate areas with depth
+discontinuity, thereby causing ambiguity. Consequently, we introduce
+Multi-granularity Segmentation Prior Multi-View Stereo (MSP-MVS).
+Specifically, we first propose a multi-granularity segmentation prior that
+integrates multi-granularity depth edges to restrict patch deformation
+within homogeneous areas. Moreover, we present anchor equidistribution,
+which provides deformed patches with more uniformly distributed anchors to
+ensure adequate coverage of their own homogeneous areas. Furthermore, we
+introduce iterative local search optimization to represent larger patches
+with sparse representative candidates, significantly boosting the expressive
+capacity of each patch. The state-of-the-art results on the ETH3D and Tanks
+& Temples benchmarks demonstrate the effectiveness and robust generalization
+ability of our proposed method.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2308.09990 +
+
+
+
+
+ + ♻ ☆ Tailoring Adversarial Attacks on Deep Neural Networks for Targeted Class + Manipulation Using DeepFool Algorithm + + +
+ The susceptibility of deep neural networks (DNNs) to adversarial attacks +undermines their reliability across numerous applications, underscoring the +necessity for an in-depth exploration of these vulnerabilities and the +formulation of robust defense strategies. The DeepFool algorithm by +Moosavi-Dezfooli et al. (2016) represents a pivotal step in identifying minimal +perturbations required to induce misclassification of input images. +Nonetheless, its generic methodology falls short in scenarios necessitating +targeted interventions. Additionally, previous research studies have +predominantly concentrated on the success rate of attacks without adequately +addressing the consequential distortion of images, the maintenance of image +quality, or the confidence threshold required for misclassification. To bridge +these gaps, we introduce the Enhanced Targeted DeepFool (ET DeepFool) +algorithm, an evolution of DeepFool that not only facilitates the specification +of desired misclassification targets but also incorporates a configurable +minimum confidence score. Our empirical investigations demonstrate the +superiority of this refined approach in maintaining the integrity of images and +minimizing perturbations across a variety of DNN architectures. Unlike previous +iterations, such as the Targeted DeepFool by Gajjar et al. (2022), our method +grants unparalleled control over the perturbation process, enabling precise +manipulation of model responses. Preliminary outcomes reveal that certain +models, including AlexNet and the advanced Vision Transformer, display +commendable robustness to such manipulations. This discovery of varying levels +of model robustness, as unveiled through our confidence level adjustments, +could have far-reaching implications for the field of image recognition. Our +code will be made public upon acceptance of the paper. + +
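+ A simplified, hedged sketch of a targeted perturbation loop with a
+minimum-confidence stopping rule, in the spirit of the approach described
+above: the gradient-sign update and the hyperparameters are simplifications
+and not the authors' exact algorithm.
+
+import torch
+import torch.nn.functional as F
+
+def targeted_perturb(model, x, target: int, min_conf: float = 0.9,
+                     max_iter: int = 50, step: float = 0.02):
+    x_adv = x.clone().detach().requires_grad_(True)
+    for _ in range(max_iter):
+        probs = F.softmax(model(x_adv), dim=1)
+        if probs[0, target] >= min_conf:          # stop once the confidence threshold is met
+            break
+        loss = -torch.log(probs[0, target])       # push probability mass toward the target class
+        grad, = torch.autograd.grad(loss, x_adv)
+        x_adv = (x_adv - step * grad.sign()).detach().requires_grad_(True)
+    return x_adv.detach()
+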
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Zero-Shot Multi-Object Scene Completion ECCV 2024 + + +
+ We present a 3D scene completion method that recovers the complete geometry +of multiple unseen objects in complex scenes from a single RGB-D image. Despite +notable advancements in single-object 3D shape completion, high-quality +reconstructions in highly cluttered real-world multi-object scenes remains a +challenge. To address this issue, we propose OctMAE, an architecture that +leverages an Octree U-Net and a latent 3D MAE to achieve high-quality and near +real-time multi-object scene completion through both local and global geometric +reasoning. Because a naive 3D MAE can be computationally intractable and memory +intensive even in the latent space, we introduce a novel occlusion masking +strategy and adopt 3D rotary embeddings, which significantly improves the +runtime and scene completion quality. To generalize to a wide range of objects +in diverse scenes, we create a large-scale photorealistic dataset, featuring a +diverse set of 12K 3D object models from the Objaverse dataset which are +rendered in multi-object scenes with physics-based positioning. Our method +outperforms the current state-of-the-art on both synthetic and real-world +datasets and demonstrates a strong zero-shot capability. + +
+
+ comment: Published at ECCV 2024, Webpage: https://sh8.io/#/oct_mae +
+
+
+
+
+ + ♻ ☆ Does CLIP Bind Concepts? Probing Compositionality in Large Image Models + + +
+ Large-scale neural network models combining text and images have made +incredible progress in recent years. However, it remains an open question to +what extent such models encode compositional representations of the concepts +over which they operate, such as correctly identifying "red cube" by reasoning +over the constituents "red" and "cube". In this work, we focus on the ability +of a large pretrained vision and language model (CLIP) to encode compositional +concepts and to bind variables in a structure-sensitive way (e.g., +differentiating "cube behind sphere" from "sphere behind cube"). To inspect the +performance of CLIP, we compare several architectures from research on +compositional distributional semantics models (CDSMs), a line of research that +attempts to implement traditional compositional linguistic structures within +embedding spaces. We benchmark them on three synthetic datasets - +single-object, two-object, and relational - designed to test concept binding. +We find that CLIP can compose concepts in a single-object setting, but in +situations where concept binding is needed, performance drops dramatically. At +the same time, CDSMs also perform poorly, with best performance at chance +level. + +
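+ The binding probe can be reproduced in miniature with an off-the-shelf CLIP
+checkpoint: compare the similarity of an image to two captions that differ
+only in which object fills which role. The model name is a standard public
+checkpoint, and the blank placeholder image is an assumption; in practice a
+rendered two-object test scene would be loaded instead.
+
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+image = Image.new("RGB", (224, 224))  # placeholder; use a rendered "cube behind sphere" scene
+texts = ["a cube behind a sphere", "a sphere behind a cube"]
+inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
+with torch.no_grad():
+    logits = model(**inputs).logits_per_image     # shape (1, 2)
+print(logits.softmax(dim=-1))  # near-uniform scores indicate weak variable binding
+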
+
+ comment: Lewis and Nayak contributed equally +
+
+
+
+
+ + ♻ ☆ 3D Weakly Supervised Semantic Segmentation with 2D Vision-Language + Guidance + + +
+ In this paper, we propose 3DSS-VLG, a weakly supervised approach for 3D
+Semantic Segmentation with 2D Vision-Language Guidance, in which a 3D model
+predicts a dense embedding for each point that is co-embedded with both the
+aligned image and text spaces of the 2D vision-language model. Specifically,
+our method exploits the strong generalization ability of 2D vision-language
+models and introduces an Embeddings Soft-Guidance Stage that uses them to
+implicitly align 3D embeddings with text embeddings. Moreover, we introduce an
+Embeddings Specialization Stage to purify the feature representation with the
+help of a given scene-level label, yielding features that are better supervised
+by the corresponding text embedding. The 3D model thus receives informative
+supervision from both the image and text embeddings, leading to competitive
+segmentation performance. To the best of our knowledge, this is the first work
+to investigate 3D weakly supervised semantic segmentation using the textual
+semantic information of text category labels. Extensive quantitative and
+qualitative experiments show that 3DSS-VLG not only achieves state-of-the-art
+performance on both the S3DIS and ScanNet datasets, but also maintains strong
+generalization capability.
+
+
+
+
+ + ♻ ☆ Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models + + +
+ Diffusion models have revolutionized customized text-to-image generation,
+allowing for efficient synthesis of photos from personal data with textual
+descriptions. However, these advancements bring forth risks including privacy
+breaches and unauthorized replication of artworks. Previous research has
+primarily centered on using prompt-specific methods to generate adversarial
+examples to protect personal images, yet the effectiveness of existing methods
+is hindered by constrained adaptability to different prompts. In this paper, we
+introduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for
+customized diffusion models. PAP first models the prompt distribution using a
+Laplace Approximation, and then produces prompt-agnostic perturbations by
+maximizing a disturbance expectation based on the modeled distribution. This
+approach effectively handles prompt-agnostic attacks, leading to improved
+defense stability. Extensive experiments on face privacy and artistic style
+protection demonstrate the superior generalization of our method in comparison
+to existing techniques.
+
+ comment: The experiments are insufficient and need to be completed +
+
+
+
+
+ + ♻ ☆ Hydra-MDP: End-to-end Multimodal Planning with Multi-target + Hydra-Distillation CVPR + 2024 + + +
+ We propose Hydra-MDP, a novel paradigm employing multiple teachers in a
+teacher-student model. This approach uses knowledge distillation from both
+human and rule-based teachers to train the student model, which features a
+multi-head decoder to learn diverse trajectory candidates tailored to various
+evaluation metrics. With the knowledge of rule-based teachers, Hydra-MDP learns
+how the environment influences planning in an end-to-end manner instead of
+resorting to non-differentiable post-processing. This method achieves
+$1^{st}$ place in the Navsim challenge, demonstrating significant improvements
+in generalization across diverse driving environments and conditions. More
+details are available at https://github.com/NVlabs/Hydra-MDP.
+
+ comment: The 1st place solution of End-to-end Driving at Scale at the CVPR + 2024 Autonomous Grand Challenge +
+
+
+
+
+ + ♻ ☆ VRSO: Visual-Centric Reconstruction for Static Object Annotation IROS + + +
+ As a part of the perception results of intelligent driving systems, static +object detection (SOD) in 3D space provides crucial cues for driving +environment understanding. With the rapid deployment of deep neural networks +for SOD tasks, the demand for high-quality training samples soars. The +traditional, also reliable, way is manual labelling over the dense LiDAR point +clouds and reference images. Though most public driving datasets adopt this +strategy to provide SOD ground truth (GT), it is still expensive and +time-consuming in practice. This paper introduces VRSO, a visual-centric +approach for static object annotation. Experiments on the Waymo Open Dataset +show that the mean reprojection error from VRSO annotation is only 2.6 pixels, +around four times lower than the Waymo Open Dataset labels (10.6 pixels). VRSO +is distinguished in low cost, high efficiency, and high quality: (1) It +recovers static objects in 3D space with only camera images as input, and (2) +manual annotation is barely involved since GT for SOD tasks is generated based +on an automatic reconstruction and annotation pipeline. + +
+
+ comment: Accepted at 2024 IEEE International Conference on Intelligent Robots + and Systems (IROS) +
+
+
+
+
+ + ♻ ☆ An Asynchronous Linear Filter Architecture for Hybrid Event-Frame + Cameras + + +
+ Event cameras are ideally suited to capture High Dynamic Range (HDR) visual +information without blur but provide poor imaging capability for static or +slowly varying scenes. Conversely, conventional image sensors measure absolute +intensity of slowly changing scenes effectively but do poorly on HDR or quickly +changing scenes. In this paper, we present an asynchronous linear filter +architecture, fusing event and frame camera data, for HDR video reconstruction +and spatial convolution that exploits the advantages of both sensor modalities. +The key idea is the introduction of a state that directly encodes the +integrated or convolved image information and that is updated asynchronously as +each event or each frame arrives from the camera. The state can be read-off +as-often-as and whenever required to feed into subsequent vision modules for +real-time robotic systems. Our experimental results are evaluated on both +publicly available datasets with challenging lighting conditions and fast +motions, along with a new dataset with HDR reference that we provide. The +proposed AKF pipeline outperforms other state-of-the-art methods in both +absolute intensity error (69.4% reduction) and image similarity indexes +(average 35.5% improvement). We also demonstrate the integration of image +convolution with linear spatial kernels Gaussian, Sobel, and Laplacian as an +application of our architecture. + +
+
+ comment: 17 pages, 10 figures. Date of Publication: 04 September 2023 +
+
+
+
+
+ + ♻ ☆ LLaVA-SG: Leveraging Scene Graphs as Visual Semantic Expression in + Vision-Language Models + + +
+ Recent advances in large vision-language models (VLMs) typically employ +vision encoders based on the Vision Transformer (ViT) architecture. The +division of the images into patches by ViT results in a fragmented perception, +thereby hindering the visual understanding capabilities of VLMs. In this paper, +we propose an innovative enhancement to address this limitation by introducing +a Scene Graph Expression (SGE) module in VLMs. This module extracts and +structurally expresses the complex semantic information within images, thereby +improving the foundational perception and understanding abilities of VLMs. +Extensive experiments demonstrate that integrating our SGE module significantly +enhances the VLM's performance in vision-language tasks, indicating its +effectiveness in preserving intricate semantic details and facilitating better +visual understanding. + +
+
+
+
+
+ + ♻ ☆ Dissecting Out-of-Distribution Detection and Open-Set Recognition: A + Critical Analysis of Methods and Benchmarks + + +
+ Detecting test-time distribution shift has emerged as a key capability for +safely deployed machine learning models, with the question being tackled under +various guises in recent years. In this paper, we aim to provide a consolidated +view of the two largest sub-fields within the community: out-of-distribution +(OOD) detection and open-set recognition (OSR). In particular, we aim to +provide rigorous empirical analysis of different methods across settings and +provide actionable takeaways for practitioners and researchers. Concretely, we +make the following contributions: (i) We perform rigorous cross-evaluation +between state-of-the-art methods in the OOD detection and OSR settings and +identify a strong correlation between the performances of methods for them; +(ii) We propose a new, large-scale benchmark setting which we suggest better +disentangles the problem tackled by OOD detection and OSR, re-evaluating +state-of-the-art OOD detection and OSR methods in this setting; (iii) We +surprisingly find that the best performing method on standard benchmarks +(Outlier Exposure) struggles when tested at scale, while scoring rules which +are sensitive to the deep feature magnitude consistently show promise; and (iv) +We conduct empirical analysis to explain these phenomena and highlight +directions for future research. Code: +https://github.com/Visual-AI/Dissect-OOD-OSR + +
+
+ comment: Accepted to IJCV, preprint version; v2: add supplementary +
+
+
+
+
+ + ♻ ☆ MiniGPT-Reverse-Designing: Predicting Image Adjustments Utilizing + MiniGPT-4 + + +
+ Vision-Language Models (VLMs) have recently seen significant advancements
+through integration with Large Language Models (LLMs). VLMs, which process
+image and text modalities simultaneously, have demonstrated the ability to
+learn and understand the interaction between images and texts across various
+multi-modal tasks. Reverse designing, which can be defined as a complex
+vision-language task, aims to predict the edits and their parameters, given a
+source image, an edited version, and an optional high-level textual edit
+description. This task requires VLMs to comprehend the interplay between the
+source image, the edited version, and the optional textual context
+simultaneously, going beyond traditional vision-language tasks. In this paper,
+we extend and fine-tune MiniGPT-4 for the reverse designing task. Our
+experiments demonstrate the extensibility of off-the-shelf VLMs, specifically
+MiniGPT-4, for more complex tasks such as reverse designing. Code is available
+at https://github.com/VahidAz/MiniGPT-Reverse-Designing.
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised 3D Visual Grounding based on Visual Linguistic + Alignment + + +
+ Learning to ground natural language queries to target objects or regions in +3D point clouds is quite essential for 3D scene understanding. Nevertheless, +existing 3D visual grounding approaches require a substantial number of +bounding box annotations for text queries, which is time-consuming and +labor-intensive to obtain. In this paper, we propose 3D-VLA, a weakly +supervised approach for 3D visual grounding based on Visual Linguistic +Alignment. Our 3D-VLA exploits the superior ability of current large-scale +vision-language models (VLMs) on aligning the semantics between texts and 2D +images, as well as the naturally existing correspondences between 2D images and +3D point clouds, and thus implicitly constructs correspondences between texts +and 3D point clouds with no need for fine-grained box annotations in the +training procedure. During the inference stage, the learned text-3D +correspondence will help us ground the text queries to the 3D target objects +even without 2D images. To the best of our knowledge, this is the first work to +investigate 3D visual grounding in a weakly supervised manner by involving +large scale vision-language models, and extensive experiments on ReferIt3D and +ScanRefer datasets demonstrate that our 3D-VLA achieves comparable and even +superior results over the fully supervised methods. + +
+
+
+
+
+ + ♻ ☆ DeepSpeak Dataset v1.0 + + +
+ We describe a large-scale dataset--DeepSpeak--of real and deepfake footage of +people talking and gesturing in front of their webcams. The real videos in this +first version of the dataset consist of 17 hours of footage from 220 diverse +individuals. Constituting more than 26 hours of footage, the fake videos +consist of a range of different state-of-the-art face-swap and lip-sync +deepfakes with natural and AI-generated voices. We expect to release future +versions of this dataset with different and updated deepfake technologies. This +dataset is made freely available for research and non-commercial uses; requests +for commercial use will be considered. + +
+
+
+
+
+ + ♻ ☆ Visual Multi-Object Tracking with Re-Identification and Occlusion + Handling using Labeled Random Finite Sets + + +
+ This paper proposes an online visual multi-object tracking (MOT) algorithm +that resolves object appearance-reappearance and occlusion. Our solution is +based on the labeled random finite set (LRFS) filtering approach, which in +principle, addresses disappearance, appearance, reappearance, and occlusion via +a single Bayesian recursion. However, in practice, existing numerical +approximations cause reappearing objects to be initialized as new tracks, +especially after long periods of being undetected. In occlusion handling, the +filter's efficacy is dictated by trade-offs between the sophistication of the +occlusion model and computational demand. Our contribution is a novel modeling +method that exploits object features to address reappearing objects whilst +maintaining a linear complexity in the number of detections. Moreover, to +improve the filter's occlusion handling, we propose a fuzzy detection model +that takes into consideration the overlapping areas between tracks and their +sizes. We also develop a fast version of the filter to further reduce the +computational time. The source code is publicly available at +https://github.com/linh-gist/mv-glmb-ab. + +
+
+
+
+
+ + ♻ ☆ Criticality Leveraged Adversarial Training (CLAT) for Boosted + Performance via Parameter Efficiency + + +
+ Adversarial training enhances neural network robustness but suffers from a
+tendency to overfit and from increased generalization errors on clean data.
+This work introduces CLAT, an innovative approach that mitigates adversarial
+overfitting by introducing parameter efficiency into the adversarial training
+process, improving both clean accuracy and adversarial robustness. Instead of
+tuning the entire model, CLAT identifies and fine-tunes robustness-critical
+layers - those predominantly learning non-robust features - while freezing the
+remaining model to enhance robustness. It employs dynamic critical layer
+selection to adapt to changes in layer criticality throughout the fine-tuning
+process. Empirically, CLAT can be applied on top of existing adversarial
+training methods; it significantly reduces the number of trainable parameters
+by approximately 95% and achieves more than a 2% improvement in adversarial
+robustness compared to baseline methods.
+
+ comment: 9 pages + appendix/ additional experiments +
+
+
+
+
+ + ♻ ☆ Prompt Generation Networks for Input-Space Adaptation of Frozen Vision + Transformers BMVC2024 + + +
+ With the introduction of the transformer architecture in computer vision, +increasing model scale has been demonstrated as a clear path to achieving +performance and robustness gains. However, with model parameter counts reaching +the billions, classical finetuning approaches are becoming increasingly +limiting and even unfeasible when models become hosted as inference APIs, as in +NLP. Visual input-prompt learning, an adaptation technique in which additional +inputs in visual (RGB) space are learned, has emerged as a potential solution +for adapting frozen and cloud-hosted models, requiring neither access to the +forward pass, nor post-processing. Yet so far, these constraints have +deteriorated adaptation performances significantly. To this end, we propose the +Prompt Generation Network (PGN) that generates a different prompt for every +data point, which is then used to adapt a frozen pretrained vision model to a +target task. We show that the PGN effectively adapts pretrained models to +various new datasets: It surpasses previous methods by a large margin on 12/12 +datasets and even outperforms full-finetuning on 5/12, while requiring 100x +fewer parameters. Lastly, we introduce the "prompt inversion" trick, with which +PGNs can be efficiently trained in a latent space but deployed in RGB input +space for inference. + +
+
+ comment: Accepted by BMVC2024. Codebase: https://github.com/jochemloedeman/PGN +
+
+
+
+
+ + ♻ ☆ PoCo: Point Context Cluster for RGBD Indoor Place Recognition + + +
+ We present a novel end-to-end algorithm (PoCo) for the indoor RGB-D place +recognition task, aimed at identifying the most likely match for a given query +frame within a reference database. The task presents inherent challenges +attributed to the constrained field of view and limited range of perception +sensors. We propose a new network architecture, which generalizes the recent +Context of Clusters (CoCs) to extract global descriptors directly from the +noisy point clouds through end-to-end learning. Moreover, we develop the +architecture by integrating both color and geometric modalities into the point +features to enhance the global descriptor representation. We conducted +evaluations on public datasets ScanNet-PR and ARKit with 807 and 5047 +scenarios, respectively. PoCo achieves SOTA performance: on ScanNet-PR, we +achieve R@1 of 64.63%, a 5.7% improvement from the best-published result CGis +(61.12%); on Arkit, we achieve R@1 of 45.12%, a 13.3% improvement from the +best-published result CGis (39.82%). In addition, PoCo shows higher efficiency +than CGis in inference time (1.75X-faster), and we demonstrate the +effectiveness of PoCo in recognizing places within a real-world laboratory +environment. + +
+
+
+
+
+ + ♻ ☆ Mix-Domain Contrastive Learning for Unpaired H&E-to-IHC Stain + Translation + + +
+ H&E-to-IHC stain translation techniques offer a promising solution for +precise cancer diagnosis, especially in low-resource regions where there is a +shortage of health professionals and limited access to expensive equipment. +Considering the pixel-level misalignment of H&E-IHC image pairs, current +research explores the pathological consistency between patches from the same +positions of the image pair. However, most of them overemphasize the +correspondence between domains or patches, overlooking the side information +provided by the non-corresponding objects. In this paper, we propose a +Mix-Domain Contrastive Learning (MDCL) method to leverage the supervision +information in unpaired H&E-to-IHC stain translation. Specifically, the +proposed MDCL method aggregates the inter-domain and intra-domain pathology +information by estimating the correlation between the anchor patch and all the +patches from the matching images, encouraging the network to learn additional +contrastive knowledge from mixed domains. With the mix-domain pathology +information aggregation, MDCL enhances the pathological consistency between the +corresponding patches and the component discrepancy of the patches from the +different positions of the generated IHC image. Extensive experiments on two +H&E-to-IHC stain translation datasets, namely MIST and BCI, demonstrate that +the proposed method achieves state-of-the-art performance across multiple +metrics. + +
+
+
+
+
+ + ♻ ☆ FRACTAL: An Ultra-Large-Scale Aerial Lidar Dataset for 3D Semantic + Segmentation of Diverse Landscapes + + +
+ Mapping agencies are increasingly adopting Aerial Lidar Scanning (ALS) as a +new tool to map buildings and other above-ground structures. Processing ALS +data at scale requires efficient point classification methods that perform well +over highly diverse territories. Large annotated Lidar datasets are needed to +evaluate these classification methods, however, current Lidar benchmarks have +restricted scope and often cover a single urban area. To bridge this data gap, +we introduce the FRench ALS Clouds from TArgeted Landscapes (FRACTAL) dataset: +an ultra-large-scale aerial Lidar dataset made of 100,000 dense point clouds +with high quality labels for 7 semantic classes and spanning 250 km$^2$. +FRACTAL achieves high spatial and semantic diversity by explicitly sampling +rare classes and challenging landscapes from five different regions of France. +We describe the data collection, annotation, and curation process of the +dataset. We provide baseline semantic segmentation results using a state of the +art 3D point cloud classification model. FRACTAL aims to support the +development of 3D deep learning approaches for large-scale land monitoring. + +
+
+ comment: 9 (body) + 2 (bibliography) + 8 (appendices) pages | Dataset is + available at https://huggingface.co/datasets/IGNF/FRACTAL | Trained model is + available at https://huggingface.co/IGNF/FRACTAL-LidarHD_7cl_randlanet | Deep + learning code repository is on Gihtub at https://github.com/IGNF/myria3d | + Data engineering code repository is on Github at + https://github.com/IGNF/pacasam +
+
+
+
+
+ + ♻ ☆ Matryoshka Diffusion Models ICLR2024 + + +
+ Diffusion models are the de facto approach for generating high-quality images +and videos, but learning high-dimensional models remains a formidable task due +to computational and optimization challenges. Existing methods often resort to +training cascaded models in pixel space or using a downsampled latent space of +a separately trained auto-encoder. In this paper, we introduce Matryoshka +Diffusion Models(MDM), an end-to-end framework for high-resolution image and +video synthesis. We propose a diffusion process that denoises inputs at +multiple resolutions jointly and uses a NestedUNet architecture where features +and parameters for small-scale inputs are nested within those of large scales. +In addition, MDM enables a progressive training schedule from lower to higher +resolutions, which leads to significant improvements in optimization for +high-resolution generation. We demonstrate the effectiveness of our approach on +various benchmarks, including class-conditioned image generation, +high-resolution text-to-image, and text-to-video applications. Remarkably, we +can train a single pixel-space model at resolutions of up to 1024x1024 pixels, +demonstrating strong zero-shot generalization using the CC12M dataset, which +contains only 12 million images. Our code is released at +https://github.com/apple/ml-mdm + +
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ♻ ☆ Motion Avatar: Generate Human and Animal Avatars with Arbitrary Motion BMVC 2024 + + +
+ In recent years, there has been significant interest in creating 3D avatars +and motions, driven by their diverse applications in areas like film-making, +video games, AR/VR, and human-robot interaction. However, current efforts +primarily concentrate on either generating the 3D avatar mesh alone or +producing motion sequences, with integrating these two aspects proving to be a +persistent challenge. Additionally, while avatar and motion generation +predominantly target humans, extending these techniques to animals remains a +significant challenge due to inadequate training data and methods. To bridge +these gaps, our paper presents three key contributions. Firstly, we proposed a +novel agent-based approach named Motion Avatar, which allows for the automatic +generation of high-quality customizable human and animal avatars with motions +through text queries. The method significantly advanced the progress in dynamic +3D character generation. Secondly, we introduced a LLM planner that coordinates +both motion and avatar generation, which transforms a discriminative planning +into a customizable Q&A fashion. Lastly, we presented an animal motion dataset +named Zoo-300K, comprising approximately 300,000 text-motion pairs across 65 +animal categories and its building pipeline ZooGen, which serves as a valuable +resource for the community. See project website +https://steve-zeyu-zhang.github.io/MotionAvatar/ + +
+
+ comment: Accepted to BMVC 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ rerankers: A Lightweight Python Library to Unify Ranking Methods + + +
+ This paper presents rerankers, a Python library which provides an easy-to-use
+interface to the most commonly used re-ranking approaches. Re-ranking is an
+integral component of many retrieval pipelines; however, there exist numerous
+approaches to it, relying on different implementation methods.
+\texttt{rerankers} unifies these methods into a single user-friendly interface,
+allowing practitioners and researchers alike to explore different methods while
+only changing a single line of Python code. Moreover, rerankers ensures that
+its implementations are done with the fewest dependencies possible, and re-uses
+the original implementation whenever possible, guaranteeing that our simplified
+interface results in no performance degradation compared to more complex ones.
+The full source code and list of supported models are updated regularly and
+available at https://github.com/answerdotai/rerankers.
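To make the "single line of Python code" claim concrete, a minimal usage sketch is shown below. It is based on the project's README; the model name passed to the constructor and the exact keyword arguments are assumptions, so consult the repository for the currently supported options.

```python
# Illustrative usage of the rerankers library (pip install rerankers).
# Model names and defaults are assumptions; see
# https://github.com/answerdotai/rerankers for supported options.
from rerankers import Reranker

ranker = Reranker("cross-encoder")  # pick a re-ranking approach by name

docs = [
    "Paris is the capital of France.",
    "The Eiffel Tower is in Paris.",
    "Berlin is the capital of Germany.",
]
results = ranker.rank(query="capital of France", docs=docs)
print(results)  # documents re-ordered by relevance to the query

# Switching approaches is meant to be a one-line change, e.g.:
# ranker = Reranker("colbert")
```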
+
+
+
+
+ + ☆ Not All Videos Become Outdated: Short-Video Recommendation by Learning + to Deconfound Release Interval Bias + + +
+ Short-video recommender systems often exhibit a biased preference toward
+recently released videos. However, not all videos become outdated; certain
+classic videos can still attract users' attention. Such bias along the temporal
+dimension can be further aggravated by the matching model between users and
+videos, because the model learns from preexisting interactions. From real data,
+we observe that different videos have varying sensitivities to recency in
+attracting users' attention. Our analysis, based on a causal graph modeling
+short-video recommendation, suggests that the release interval serves as a
+confounder, establishing a backdoor path between users and videos. To address
+this confounding effect, we propose a model-agnostic causal architecture called
+Learning to Deconfound the Release Interval Bias (LDRI). LDRI enables joint
+learning of the matching model and the video recency sensitivity perceptron. In
+the inference stage, we apply a backdoor adjustment, effectively blocking the
+backdoor path by intervening on each video. Extensive experiments on two
+benchmarks demonstrate that LDRI consistently outperforms backbone models and
+exhibits superior performance over state-of-the-art models. Additional
+comprehensive analyses confirm the deconfounding capability of LDRI.
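For readers unfamiliar with backdoor adjustment, a generic form of the intervention the abstract refers to is written out below; the symbols u (user), v (video), and r (release interval) are illustrative, and the exact conditioning set used by LDRI may differ.

```latex
% Generic backdoor adjustment with the release interval r acting as a
% confounder: intervene on the video v and marginalize over r.
P\bigl(y \mid u, \mathrm{do}(v)\bigr) \;=\; \sum_{r} P\bigl(y \mid u, v, r\bigr)\, P(r)
```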
+
+
+
+
+ + ☆ Metadata practices for simulation workflows + + +
+ Computer simulations are an essential pillar of knowledge generation in +science. Understanding, reproducing, and exploring the results of simulations +relies on tracking and organizing metadata describing numerical experiments. +However, the models used to understand real-world systems, and the +computational machinery required to simulate them, are typically complex, and +produce large amounts of heterogeneous metadata. Here, we present general +practices for acquiring and handling metadata that are agnostic to software and +hardware, and highly flexible for the user. These consist of two steps: 1) +recording and storing raw metadata, and 2) selecting and structuring metadata. +As a proof of concept, we develop the Archivist, a Python tool to help with the +second step, and use it to apply our practices to distinct high-performance +computing use cases from neuroscience and hydrology. Our practices and the +Archivist can readily be applied to existing workflows without the need for +substantial restructuring. They support sustainable numerical workflows, +facilitating reproducibility and data reuse in generic simulation-based +research. + +
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ Efficient Multi-task Prompt Tuning for Recommendation + + +
+ With the expansion of business scenarios, real recommender systems are facing +challenges in dealing with the constantly emerging new tasks in multi-task +learning frameworks. In this paper, we attempt to improve the generalization +ability of multi-task recommendations when dealing with new tasks. We find that +joint training will enhance the performance of the new task but always +negatively impact existing tasks in most multi-task learning methods. Besides, +such a re-training mechanism with new tasks increases the training costs, +limiting the generalization ability of multi-task recommendation models. Based +on this consideration, we aim to design a suitable sharing mechanism among +different tasks while maintaining joint optimization efficiency in new task +learning. A novel two-stage prompt-tuning MTL framework (MPT-Rec) is proposed +to address task irrelevance and training efficiency problems in multi-task +recommender systems. Specifically, we disentangle the task-specific and +task-sharing information in the multi-task pre-training stage, then use +task-aware prompts to transfer knowledge from other tasks to the new task +effectively. By freezing parameters in the pre-training tasks, MPT-Rec solves +the negative impacts that may be brought by the new task and greatly reduces +the training costs. Extensive experiments on three real-world datasets show the +effectiveness of our proposed multi-task learning framework. MPT-Rec achieves +the best performance compared to the SOTA multi-task learning method. Besides, +it maintains comparable model performance but vastly improves the training +efficiency (i.e., with up to 10% parameters in the full training way) in the +new task learning. + +
+
+
+
+
+ + ☆ Identifying and Clustering Counter Relationships of Team Compositions in + PvP Games for Efficient Balance Analysis + + +
+ How can balance be quantified in game settings? This question is crucial for +game designers, especially in player-versus-player (PvP) games, where analyzing +the strength relations among predefined team compositions-such as hero +combinations in multiplayer online battle arena (MOBA) games or decks in card +games-is essential for enhancing gameplay and achieving balance. We have +developed two advanced measures that extend beyond the simplistic win rate to +quantify balance in zero-sum competitive scenarios. These measures are derived +from win value estimations, which employ strength rating approximations via the +Bradley-Terry model and counter relationship approximations via vector +quantization, significantly reducing the computational complexity associated +with traditional win value estimations. Throughout the learning process of +these models, we identify useful categories of compositions and pinpoint their +counter relationships, aligning with the experiences of human players without +requiring specific game knowledge. Our methodology hinges on a simple technique +to enhance codebook utilization in discrete representation with a deterministic +vector quantization process for an extremely small state space. Our framework +has been validated in popular online games, including Age of Empires II, +Hearthstone, Brawl Stars, and League of Legends. The accuracy of the observed +strength relations in these games is comparable to traditional pairwise win +value predictions, while also offering a more manageable complexity for +analysis. Ultimately, our findings contribute to a deeper understanding of PvP +game dynamics and present a methodology that significantly improves game +balance evaluation and design. + +
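The strength-rating component mentioned above rests on the Bradley-Terry model, which can be fitted from pairwise win counts in a few lines of NumPy. The sketch below covers only that generic component (the vector-quantized counter relationships are not shown), and the win matrix is a toy example rather than data from any of the cited games.

```python
import numpy as np

def bradley_terry(wins, iters=200):
    """Fit Bradley-Terry strengths from `wins`, where wins[i, j] is how many
    times composition i beat composition j, via the classic
    minorization-maximization update."""
    n = wins.shape[0]
    p = np.ones(n)
    games = wins + wins.T  # total games between each pair
    for _ in range(iters):
        for i in range(n):
            denom = np.sum(games[i] / (p[i] + p))  # diagonal contributes 0
            if denom > 0:
                p[i] = wins[i].sum() / denom
        p /= p.sum()  # fix the scale; only strength ratios are identified
    return p

# Toy example: composition 0 tends to beat 1, and 1 tends to beat 2.
wins = np.array([[0, 8, 5],
                 [2, 0, 7],
                 [5, 3, 0]])
print(bradley_terry(wins))
```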
+
+ comment: TMLR 09/2024 https://openreview.net/forum?id=2D36otXvBE +
+
+
+
+
+ + ☆ Understanding the User: An Intent-Based Ranking Dataset + + +
+ As information retrieval systems continue to evolve, accurate evaluation and +benchmarking of these systems become pivotal. Web search datasets, such as MS +MARCO, primarily provide short keyword queries without accompanying intent or +descriptions, posing a challenge in comprehending the underlying information +need. This paper proposes an approach to augmenting such datasets to annotate +informative query descriptions, with a focus on two prominent benchmark +datasets: TREC-DL-21 and TREC-DL-22. Our methodology involves utilizing +state-of-the-art LLMs to analyze and comprehend the implicit intent within +individual queries from benchmark datasets. By extracting key semantic +elements, we construct detailed and contextually rich descriptions for these +queries. To validate the generated query descriptions, we employ crowdsourcing +as a reliable means of obtaining diverse human perspectives on the accuracy and +informativeness of the descriptions. This information can be used as an +evaluation set for tasks such as ranking, query rewriting, or others. + +
+
+
+
+
+ + ☆ Evaluation of Table Representations to Answer Questions from Tables in + Documents : A Case Study using 3GPP Specifications + + +
+ With the ubiquitous use of document corpora for question answering, one
+important aspect which is especially relevant for technical documents is the
+ability to extract information from tables which are interspersed with text.
+The major challenge here is that, unlike free-flowing text or an isolated set
+of tables, the representation of a table in terms of what constitutes a
+relevant chunk is not obvious. We conduct a series of experiments examining
+various representations of tabular data interspersed with text to understand
+the relative benefits of different representations. We choose a corpus of
+$3^{rd}$ Generation Partnership Project (3GPP) documents since they are heavily
+interspersed with tables. We create an expert-curated dataset of
+question-answer pairs to evaluate our approach. We conclude that row-level
+representations, with the corresponding table header information included in
+every cell, improve retrieval performance by leveraging the structural
+information present in the tabular data.
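A minimal sketch of the row-level representation the conclusion describes: each row becomes one retrieval chunk with its column header repeated per cell. The table contents and field names below are invented for illustration, not taken from an actual 3GPP specification.

```python
# Turn a table into row-level retrieval chunks in which every cell is paired
# with its column header, so structure survives chunking. Example data is made up.
table = {
    "header": ["Parameter", "Value", "Unit"],
    "rows": [
        ["maxHARQ-Tx", "4", "transmissions"],
        ["t-Reordering", "35", "ms"],
    ],
}

def row_chunks(table):
    chunks = []
    for row in table["rows"]:
        cells = [f"{h}: {c}" for h, c in zip(table["header"], row)]
        chunks.append("; ".join(cells))
    return chunks

for chunk in row_chunks(table):
    print(chunk)
# e.g. "Parameter: maxHARQ-Tx; Value: 4; Unit: transmissions"
```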
+
+ comment: 10 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Facilitating phenotyping from clinical texts: the medkit library + + +
+ Phenotyping consists of applying algorithms to identify individuals
+associated with a specific, potentially complex, trait or condition, typically
+out of a collection of Electronic Health Records (EHRs). Because much of the
+clinical information in EHRs lies in free text, phenotyping from text plays an
+important role in studies that rely on the secondary use of EHRs. However, the
+heterogeneity and highly specialized nature of both the content and form of
+clinical texts make this task particularly tedious and a source of time and
+cost constraints in observational studies. To facilitate the development,
+evaluation and reproducibility of phenotyping pipelines, we developed an
+open-source Python library named medkit. It enables composing data processing
+pipelines made of easy-to-reuse software bricks, named medkit operations. In
+addition to the core of the library, we share the operations and pipelines we
+have already developed and invite the phenotyping community to reuse and enrich
+them. medkit is available at https://github.com/medkit-lib/medkit
+
+
+
+
+ + ♻ ☆ Beyond One-Size-Fits-All: Multi-Domain, Multi-Task Framework for + Embedding Model Selection + + +
+ This position paper proposes a systematic approach towards developing a +framework to help select the most effective embedding models for natural +language processing (NLP) tasks, addressing the challenge posed by the +proliferation of both proprietary and open-source encoder models. + +
+
+ comment: It was an initial idea - we plan to work on a detailed version +
+
+
+
+
+ + ♻ ☆ SynDL: A Large-Scale Synthetic Test Collection for Passage Retrieval + + +
+ Large-scale test collections play a crucial role in Information Retrieval +(IR) research. However, according to the Cranfield paradigm and the research +into publicly available datasets, the existing information retrieval research +studies are commonly developed on small-scale datasets that rely on human +assessors for relevance judgments - a time-intensive and expensive process. +Recent studies have shown the strong capability of Large Language Models (LLMs) +in producing reliable relevance judgments with human accuracy but at a greatly +reduced cost. In this paper, to address the missing large-scale ad-hoc document +retrieval dataset, we extend the TREC Deep Learning Track (DL) test collection +via additional language model synthetic labels to enable researchers to test +and evaluate their search systems at a large scale. Specifically, such a test +collection includes more than 1,900 test queries from the previous years of +tracks. We compare system evaluation with past human labels from past years and +find that our synthetically created large-scale test collection can lead to +highly correlated system rankings. + +
+
+ comment: 9 pages, resource paper +
+
+
+
+
+ + ♻ ☆ The Importance of Cognitive Biases in the Recommendation Ecosystem + + +
+ Cognitive biases have been studied in psychology, sociology, and behavioral +economics for decades. Traditionally, they have been considered a negative +human trait that leads to inferior decision-making, reinforcement of +stereotypes, or can be exploited to manipulate consumers, respectively. We +argue that cognitive biases also manifest in different parts of the +recommendation ecosystem and at different stages of the recommendation process. +More importantly, we contest this traditional detrimental perspective on +cognitive biases and claim that certain cognitive biases can be beneficial when +accounted for by recommender systems. Concretely, we provide empirical evidence +that biases such as feature-positive effect, Ikea effect, and cultural +homophily can be observed in various components of the recommendation pipeline, +including input data (such as ratings or side information), recommendation +algorithm or model (and consequently recommended items), and user interactions +with the system. In three small experiments covering recruitment and +entertainment domains, we study the pervasiveness of the aforementioned biases. +We ultimately advocate for a prejudice-free consideration of cognitive biases +to improve user and item models as well as recommendation algorithms. + +
+
+
+
+
+ + ♻ ☆ Perceptual Similarity for Measuring Decision-Making Style and Policy + Diversity in Games + + +
+ Defining and measuring decision-making styles, also known as playstyles, is +crucial in gaming, where these styles reflect a broad spectrum of individuality +and diversity. However, finding a universally applicable measure for these +styles poses a challenge. Building on Playstyle Distance, the first +unsupervised metric to measure playstyle similarity based on game screens and +raw actions, we introduce three enhancements to increase accuracy: multiscale +analysis with varied state granularity, a perceptual kernel rooted in +psychology, and the utilization of the intersection-over-union method for +efficient evaluation. These innovations not only advance measurement precision +but also offer insights into human cognition of similarity. Across two racing +games and seven Atari games, our techniques significantly improve the precision +of zero-shot playstyle classification, achieving an accuracy exceeding 90 +percent with fewer than 512 observation-action pairs, which is less than half +an episode of these games. Furthermore, our experiments with 2048 and Go +demonstrate the potential of discrete playstyle measures in puzzle and board +games. We also develop an algorithm for assessing decision-making diversity +using these measures. Our findings improve the measurement of end-to-end game +analysis and the evolution of artificial intelligence for diverse playstyles. + +
+
+ comment: TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49 +
+
+
+
+
+ + ♻ ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in
+information retrieval. ColBERT's late interaction scoring approximates the
+joint query-document attention seen in cross-encoders while maintaining
+inference efficiency closer to traditional dense retrieval models, thanks to
+its bi-encoder architecture and recent optimizations in indexing and search. In
+this paper, we introduce a novel architecture and a training framework to
+support long context windows and multilingual retrieval. Leveraging Matryoshka
+Representation Loss, we further demonstrate that reducing the embedding
+dimensionality from 128 to 64 has an insignificant impact on the model's
+retrieval performance and cuts storage requirements by up to 50%. Our new
+model, Jina-ColBERT-v2, demonstrates strong performance across a range of
+English and multilingual retrieval tasks.
+
+
+
+
+
+
+
+ + Machine Learning 136 + +
+
+
+ + ☆ SelectTTS: Synthesizing Anyone's Voice via Discrete Unit-Based Frame + Selection + + +
+ Synthesizing the voices of unseen speakers is a persisting challenge in +multi-speaker text-to-speech (TTS). Most multi-speaker TTS models rely on +modeling speaker characteristics through speaker conditioning during training. +Modeling unseen speaker attributes through this approach has necessitated an +increase in model complexity, which makes it challenging to reproduce results +and improve upon them. We design a simple alternative to this. We propose +SelectTTS, a novel method to select the appropriate frames from the target +speaker and decode using frame-level self-supervised learning (SSL) features. +We show that this approach can effectively capture speaker characteristics for +unseen speakers, and achieves comparable results to other multi-speaker TTS +frameworks in both objective and subjective metrics. With SelectTTS, we show +that frame selection from the target speaker's speech is a direct way to +achieve generalization in unseen speakers with low model complexity. We achieve +better speaker similarity performance than SOTA baselines XTTS-v2 and VALL-E +with over an 8x reduction in model parameters and a 270x reduction in training +data + +
+
+ comment: Submitted to IEEE Signal Processing Letters +
+
+
+
+
+ + ☆ Fairness-Aware Estimation of Graphical Models + + +
+ This paper examines the issue of fairness in the estimation of graphical +models (GMs), particularly Gaussian, Covariance, and Ising models. These models +play a vital role in understanding complex relationships in high-dimensional +data. However, standard GMs can result in biased outcomes, especially when the +underlying data involves sensitive characteristics or protected groups. To +address this, we introduce a comprehensive framework designed to reduce bias in +the estimation of GMs related to protected attributes. Our approach involves +the integration of the pairwise graph disparity error and a tailored loss +function into a nonsmooth multi-objective optimization problem, striving to +achieve fairness across different sensitive groups while maintaining the +effectiveness of the GMs. Experimental evaluations on synthetic and real-world +datasets demonstrate that our framework effectively mitigates bias without +undermining GMs' performance. + +
+
+ comment: 32 Pages, 9 Figures +
+
+
+
+
+ + ☆ Continual learning with the neural tangent ensemble + + +
+ A natural strategy for continual learning is to weigh a Bayesian ensemble of +fixed functions. This suggests that if a (single) neural network could be +interpreted as an ensemble, one could design effective algorithms that learn +without forgetting. To realize this possibility, we observe that a neural +network classifier with N parameters can be interpreted as a weighted ensemble +of N classifiers, and that in the lazy regime limit these classifiers are fixed +throughout learning. We term these classifiers the neural tangent experts and +show they output valid probability distributions over the labels. We then +derive the likelihood and posterior probability of each expert given past data. +Surprisingly, we learn that the posterior updates for these experts are +equivalent to a scaled and projected form of stochastic gradient descent (SGD) +over the network weights. Away from the lazy regime, networks can be seen as +ensembles of adaptive experts which improve over time. These results offer a +new interpretation of neural networks as Bayesian ensembles of experts, +providing a principled framework for understanding and mitigating catastrophic +forgetting in continual learning settings. + +
+
+
+
+
+ + ☆ Bayesian Optimization for Non-Convex Two-Stage Stochastic Optimization + Problems + + +
+ Bayesian optimization is a sample-efficient method for solving expensive, +black-box optimization problems. Stochastic programming concerns optimization +under uncertainty where, typically, average performance is the quantity of +interest. In the first stage of a two-stage problem, here-and-now decisions +must be made in the face of this uncertainty, while in the second stage, +wait-and-see decisions are made after the uncertainty has been resolved. Many +methods in stochastic programming assume that the objective is cheap to +evaluate and linear or convex. In this work, we apply Bayesian optimization to +solve non-convex, two-stage stochastic programs which are expensive to +evaluate. We formulate a knowledge-gradient-based acquisition function to +jointly optimize the first- and second-stage variables, establish a guarantee +of asymptotic consistency and provide a computationally efficient +approximation. We demonstrate comparable empirical results to an alternative we +formulate which alternates its focus between the two variable types, and +superior empirical results over the standard, naive, two-step benchmark. We +show that differences in the dimension and length scales between the variable +types can lead to inefficiencies of the two-step algorithm, while the joint and +alternating acquisition functions perform well in all problems tested. +Experiments are conducted on both synthetic and real-world examples. + +
+
+
+
+
+ + ☆ LASSO-MOGAT: A Multi-Omics Graph Attention Framework for Cancer + Classification + + +
+ The application of machine learning methods to analyze changes in gene +expression patterns has recently emerged as a powerful approach in cancer +research, enhancing our understanding of the molecular mechanisms underpinning +cancer development and progression. Combining gene expression data with other +types of omics data has been reported by numerous works to improve cancer +classification outcomes. Despite these advances, effectively integrating +high-dimensional multi-omics data and capturing the complex relationships +across different biological layers remains challenging. This paper introduces +LASSO-MOGAT (LASSO-Multi-Omics Gated ATtention), a novel graph-based deep +learning framework that integrates messenger RNA, microRNA, and DNA methylation +data to classify 31 cancer types. Utilizing differential expression analysis +with LIMMA and LASSO regression for feature selection, and leveraging Graph +Attention Networks (GATs) to incorporate protein-protein interaction (PPI) +networks, LASSO-MOGAT effectively captures intricate relationships within +multi-omics data. Experimental validation using five-fold cross-validation +demonstrates the method's precision, reliability, and capacity for providing +comprehensive insights into cancer molecular mechanisms. The computation of +attention coefficients for the edges in the graph by the proposed +graph-attention architecture based on protein-protein interactions proved +beneficial for identifying synergies in multi-omics data for cancer +classification. + +
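As a rough illustration of the LASSO feature-selection step described above (the LIMMA differential-expression filter and the graph attention stage are not reproduced), a generic scikit-learn sketch on synthetic expression-like data might look as follows.

```python
# Minimal LASSO feature-selection sketch on synthetic "expression" data.
# The selected columns would then feed a downstream graph attention network.
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 500))                 # samples x genes (synthetic)
y = (X[:, :5].sum(axis=1) > 0).astype(float)    # labels driven by 5 genes

X_scaled = StandardScaler().fit_transform(X)
lasso = Lasso(alpha=0.05).fit(X_scaled, y)

selected = np.flatnonzero(lasso.coef_ != 0)
print(f"{selected.size} features kept out of {X.shape[1]}")
```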
+
+
+
+
+ + ☆ MoRe Fine-Tuning with 10x Fewer Parameters + + +
+ Parameter-efficient fine-tuning (PEFT) techniques have unlocked the potential +to cheaply and easily specialize large pretrained models. However, the most +prominent approaches, like low-rank adapters (LoRA), depend on heuristics or +rules-of-thumb for their architectural choices -- potentially limiting their +performance for new models and architectures. This limitation suggests that +techniques from neural architecture search could be used to obtain optimal +adapter architectures, but these are often expensive and difficult to +implement. We address this challenge with Monarch Rectangular Fine-tuning +(MoRe), a simple framework to search over adapter architectures that relies on +the Monarch matrix class. Theoretically, we show that MoRe is more expressive +than LoRA. Empirically, our approach is more parameter-efficient and performant +than state-of-the-art PEFTs on a range of tasks and models, with as few as 5\% +of LoRA's parameters. + +
+
+
+
+
+ + ☆ Traffic expertise meets residual RL: Knowledge-informed model-based + residual reinforcement learning for CAV trajectory control + + +
+ Model-based reinforcement learning (RL) is anticipated to exhibit higher +sample efficiency compared to model-free RL by utilizing a virtual environment +model. However, it is challenging to obtain sufficiently accurate +representations of the environmental dynamics due to uncertainties in complex +systems and environments. An inaccurate environment model may degrade the +sample efficiency and performance of model-based RL. Furthermore, while +model-based RL can improve sample efficiency, it often still requires +substantial training time to learn from scratch, potentially limiting its +advantages over model-free approaches. To address these challenges, this paper +introduces a knowledge-informed model-based residual reinforcement learning +framework aimed at enhancing learning efficiency by infusing established expert +knowledge into the learning process and avoiding the issue of beginning from +zero. Our approach integrates traffic expert knowledge into a virtual +environment model, employing the Intelligent Driver Model (IDM) for basic +dynamics and neural networks for residual dynamics, thus ensuring adaptability +to complex scenarios. We propose a novel strategy that combines traditional +control methods with residual RL, facilitating efficient learning and policy +optimization without the need to learn from scratch. The proposed approach is +applied to CAV trajectory control tasks for the dissipation of stop-and-go +waves in mixed traffic flow. Experimental results demonstrate that our proposed +approach enables the CAV agent to achieve superior performance in trajectory +control compared to the baseline agents in terms of sample efficiency, traffic +flow smoothness and traffic mobility. The source code and supplementary +materials are available at https://github.com/zihaosheng/traffic-expertise-RL/. + +
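The Intelligent Driver Model used for the basic dynamics has a standard closed form; the sketch below implements that textbook formulation with typical default parameters, which are assumptions rather than the values used in the paper.

```python
import math

def idm_acceleration(v, gap, dv, v0=30.0, T=1.5, a_max=1.0, b=2.0, s0=2.0, delta=4):
    """Standard Intelligent Driver Model (IDM) acceleration.
    v: ego speed (m/s), gap: bumper-to-bumper gap to the leader (m),
    dv: approach rate v_ego - v_leader (m/s). Parameter values are typical
    textbook defaults, not those used in the paper."""
    s_star = s0 + max(0.0, v * T + v * dv / (2 * math.sqrt(a_max * b)))
    return a_max * (1 - (v / v0) ** delta - (s_star / gap) ** 2)

# Example: following a slightly slower leader at a 20 m gap.
print(idm_acceleration(v=25.0, gap=20.0, dv=2.0))
```

In the described framework, such a hand-specified model would supply the known part of the dynamics, with a neural network learning only the residual.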
+
+
+
+
+ + ☆ Exploring the Impact of Environmental Pollutants on Multiple Sclerosis + Progression + + +
+ Multiple Sclerosis (MS) is a chronic autoimmune and inflammatory neurological +disorder characterised by episodes of symptom exacerbation, known as relapses. +In this study, we investigate the role of environmental factors in relapse +occurrence among MS patients, using data from the H2020 BRAINTEASER project. We +employed predictive models, including Random Forest (RF) and Logistic +Regression (LR), with varying sets of input features to predict the occurrence +of relapses based on clinical and pollutant data collected over a week. The RF +yielded the best result, with an AUC-ROC score of 0.713. Environmental +variables, such as precipitation, NO2, PM2.5, humidity, and temperature, were +found to be relevant to the prediction. + +
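A schematic of the evaluation protocol (a Random Forest on weekly clinical and pollutant features, scored with AUC-ROC) is sketched below on synthetic data; the feature semantics and the reported 0.713 score are not reproduced here.

```python
# Random Forest on weekly feature vectors, scored with AUC-ROC.
# Data is synthetic; real features would include NO2, PM2.5, humidity,
# temperature and precipitation aggregates alongside clinical variables.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 12))                      # weekly feature vectors
y = rng.binomial(1, 1 / (1 + np.exp(-X[:, 0])))      # synthetic relapse labels

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=300, random_state=0).fit(X_tr, y_tr)
print("AUC-ROC:", roc_auc_score(y_te, clf.predict_proba(X_te)[:, 1]))
```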
+
+
+
+
+ + ☆ Leveraging Graph Neural Networks to Forecast Electricity Consumption ECML + + +
+ Accurate electricity demand forecasting is essential for several reasons, +especially as the integration of renewable energy sources and the transition to +a decentralized network paradigm introduce greater complexity and uncertainty. +The proposed methodology leverages graph-based representations to effectively +capture the spatial distribution and relational intricacies inherent in this +decentralized network structure. This research work offers a novel approach +that extends beyond the conventional Generalized Additive Model framework by +considering models like Graph Convolutional Networks or Graph SAGE. These +graph-based models enable the incorporation of various levels of +interconnectedness and information sharing among nodes, where each node +corresponds to the combined load (i.e. consumption) of a subset of consumers +(e.g. the regions of a country). More specifically, we introduce a range of +methods for inferring graphs tailored to consumption forecasting, along with a +framework for evaluating the developed models in terms of both performance and +explainability. We conduct experiments on electricity forecasting, in both a +synthetic and a real framework considering the French mainland regions, and the +performance and merits of our approach are discussed. + +
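As a concrete anchor for the graph-based forecasters mentioned above, a minimal two-layer GraphSAGE regressor over region nodes might look like the sketch below (PyTorch Geometric is assumed to be installed; the node features, toy graph, and layer sizes are placeholders, not the paper's configuration).

```python
import torch
from torch_geometric.nn import SAGEConv

class RegionSAGE(torch.nn.Module):
    """Two-layer GraphSAGE that maps per-region features to a load forecast."""
    def __init__(self, in_dim, hidden=32):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden)
        self.conv2 = SAGEConv(hidden, hidden)
        self.head = torch.nn.Linear(hidden, 1)  # one forecast per region node

    def forward(self, x, edge_index):
        h = torch.relu(self.conv1(x, edge_index))
        h = torch.relu(self.conv2(h, edge_index))
        return self.head(h).squeeze(-1)

x = torch.randn(12, 8)                                    # 12 regions, 8 features
edge_index = torch.tensor([[0, 1, 2, 3], [1, 0, 3, 2]])   # toy region adjacency
model = RegionSAGE(in_dim=8)
print(model(x, edge_index).shape)                         # torch.Size([12])
```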
+
+ comment: 17 pages, ECML PKDD 2024 Workshop paper +
+
+
+
+
+ + ☆ Hold Me Tight: Stable Encoder-Decoder Design for Speech Enhancement INTERSPEECH 2024 + + +
+ Convolutional layers with 1-D filters are often used as frontend to encode +audio signals. Unlike fixed time-frequency representations, they can adapt to +the local characteristics of input data. However, 1-D filters on raw audio are +hard to train and often suffer from instabilities. In this paper, we address +these problems with hybrid solutions, i.e., combining theory-driven and +data-driven approaches. First, we preprocess the audio signals via a auditory +filterbank, guaranteeing good frequency localization for the learned encoder. +Second, we use results from frame theory to define an unsupervised learning +objective that encourages energy conservation and perfect reconstruction. +Third, we adapt mixed compressed spectral norms as learning objectives to the +encoder coefficients. Using these solutions in a low-complexity +encoder-mask-decoder model significantly improves the perceptual evaluation of +speech quality (PESQ) in speech enhancement. + +
+
+ comment: Accepted at INTERSPEECH 2024 +
+
+
+
+
+ + ☆ C-RADAR: A Centralized Deep Learning System for Intrusion Detection in + Software Defined Networks + + +
+ The popularity of Software Defined Networks (SDNs) has grown in recent years,
+mainly because of their ability to simplify network management and improve
+network flexibility. However, this also makes them vulnerable to various types
+of cyber attacks. SDNs work on a centralized control plane, which makes them
+more prone to network attacks. Research has demonstrated that deep learning
+(DL) methods can be successful in identifying intrusions in conventional
+networks, but their application in SDNs is still an open research area. In this
+research, we propose the use of DL techniques for intrusion detection in SDNs.
+We measure the effectiveness of our method by experimenting on a dataset of
+network traffic and comparing it to existing techniques. Our results show that
+the DL-based approach outperforms traditional methods in terms of detection
+accuracy and computational efficiency. The deep learning architecture used in
+this research is a Long Short-Term Memory network with self-attention
+(LSTM-Attn), which achieves an F1-score of 0.9721. Furthermore, this technique
+can be trained to detect new attack patterns and improve the overall security
+of SDNs.
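A schematic stand-in for an LSTM-with-self-attention flow classifier of the kind described is sketched below; the layer sizes, mean pooling, and two-class head are illustrative assumptions, not the paper's configuration.

```python
# Schematic LSTM + self-attention classifier over sequences of flow features.
import torch
import torch.nn as nn

class LSTMAttnIDS(nn.Module):
    def __init__(self, n_features, hidden=64, n_classes=2):
        super().__init__()
        self.lstm = nn.LSTM(n_features, hidden, batch_first=True)
        self.attn = nn.MultiheadAttention(hidden, num_heads=4, batch_first=True)
        self.fc = nn.Linear(hidden, n_classes)

    def forward(self, x):                  # x: (batch, time, features)
        h, _ = self.lstm(x)
        a, _ = self.attn(h, h, h)          # self-attention over time steps
        return self.fc(a.mean(dim=1))      # pool over time, then classify

model = LSTMAttnIDS(n_features=20)
print(model(torch.randn(8, 50, 20)).shape)  # torch.Size([8, 2])
```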
+
+
+
+
+ + ☆ Bidirectional Decoding: Improving Action Chunking via Closed-Loop + Resampling + + +
+ Predicting and executing a sequence of actions without intermediate +replanning, known as action chunking, is increasingly used in robot learning +from human demonstrations. However, its effects on learned policies remain +puzzling: some studies highlight its importance for achieving strong +performance, while others observe detrimental effects. In this paper, we first +dissect the role of action chunking by analyzing the divergence between the +learner and the demonstrator. We find that longer action chunks enable a policy +to better capture temporal dependencies by taking into account more past states +and actions within the chunk. However, this advantage comes at the cost of +exacerbating errors in stochastic environments due to fewer observations of +recent states. To address this, we propose Bidirectional Decoding (BID), a +test-time inference algorithm that bridges action chunking with closed-loop +operations. BID samples multiple predictions at each time step and searches for +the optimal one based on two criteria: (i) backward coherence, which favors +samples aligned with previous decisions, (ii) forward contrast, which favors +samples close to outputs of a stronger policy and distant from those of a +weaker policy. By coupling decisions within and across action chunks, BID +enhances temporal consistency over extended sequences while enabling adaptive +replanning in stochastic environments. Experimental results show that BID +substantially outperforms conventional closed-loop operations of two +state-of-the-art generative policies across seven simulation benchmarks and two +real-world tasks. + +
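A much-simplified sketch of the sample-and-rank idea behind BID follows: draw several candidate action chunks, score each by continuity with the previously committed chunk and by contrast against a weaker policy, then execute the best. The scoring functions, weights, and toy policies below are placeholders, not the paper's exact criteria.

```python
import numpy as np

def bid_select(strong_policy, weak_policy, prev_chunk, obs, n_samples=16,
               w_coherence=1.0, w_contrast=1.0):
    """Pick one of several sampled action chunks using two heuristic scores."""
    candidates = [strong_policy(obs) for _ in range(n_samples)]
    weak_ref = weak_policy(obs)
    scores = []
    for c in candidates:
        backward = -np.linalg.norm(c - prev_chunk)  # favor continuity with past
        forward = np.linalg.norm(c - weak_ref)      # move away from weak policy
        scores.append(w_coherence * backward + w_contrast * forward)
    return candidates[int(np.argmax(scores))]

# Toy usage with random "policies" producing 8-step, 2-D action chunks.
rng = np.random.default_rng(0)
strong = lambda obs: rng.normal(size=(8, 2))
weak = lambda obs: rng.normal(loc=0.5, size=(8, 2))
prev = np.zeros((8, 2))
print(bid_select(strong, weak, prev, obs=None).shape)  # (8, 2)
```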
+
+ comment: Project website: https://bid-robot.github.io/ +
+
+
+
+
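+
+ The selection rule described above can be summarized schematically. The NumPy
+ sketch below is a loose rendering of the two criteria (backward coherence and
+ forward contrast); the distance measures, weights, and policy interfaces are
+ placeholders, not the authors' implementation.
+
+```python
+import numpy as np
+
+def bid_select(strong_samples, weak_samples, prev_chunk, w_back=1.0, w_fwd=1.0):
+    """Pick one action chunk from strong_samples, shaped (N, horizon, action_dim)."""
+    # Backward coherence: prefer chunks close to the previously committed chunk.
+    back = np.linalg.norm(strong_samples - prev_chunk, axis=(1, 2))
+    # Forward contrast: prefer chunks near other strong-policy samples and far
+    # from weak-policy samples (nearest-neighbour distance to each set).
+    pair = np.linalg.norm(strong_samples[:, None] - strong_samples[None], axis=(2, 3))
+    d_strong = np.min(pair + np.eye(len(strong_samples)) * 1e9, axis=1)
+    d_weak = np.min(
+        np.linalg.norm(strong_samples[:, None] - weak_samples[None], axis=(2, 3)), axis=1)
+    score = w_back * back + w_fwd * (d_strong - d_weak)
+    return strong_samples[np.argmin(score)]
+```
+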
+ + ☆ Forget to Flourish: Leveraging Machine-Unlearning on Pretrained Language + Models for Privacy Leakage + + +
+ Fine-tuning large language models on private data for downstream applications +poses significant privacy risks in potentially exposing sensitive information. +Several popular community platforms now offer convenient distribution of a +large variety of pre-trained models, allowing anyone to publish without +rigorous verification. This scenario creates a privacy threat, as pre-trained +models can be intentionally crafted to compromise the privacy of fine-tuning +datasets. In this study, we introduce a novel poisoning technique that uses +model-unlearning as an attack tool. This approach manipulates a pre-trained +language model to increase the leakage of private data during the fine-tuning +process. Our method enhances both membership inference and data extraction +attacks while preserving model utility. Experimental results across different +models, datasets, and fine-tuning setups demonstrate that our attacks +significantly surpass baseline performance. This work serves as a cautionary +note for users who download pre-trained models from unverified sources, +highlighting the potential risks involved. + +
+
+
+
+
+ + ☆ Evaluating Reliability in Medical DNNs: A Critical Analysis of Feature + and Confidence-Based OOD Detection MICCAI 2023 + + +
+ Reliable use of deep neural networks (DNNs) for medical image analysis
+requires methods to identify inputs that differ significantly from the training
+data, called out-of-distribution (OOD), to prevent erroneous predictions. OOD
+detection methods can be categorised as either confidence-based (using the
+model's output layer for OOD detection) or feature-based (not using the output
+layer). We created two new OOD benchmarks by dividing the D7P (dermatology) and
+BreastMNIST (ultrasound) datasets into subsets which either contain or don't
+contain an artefact (rulers or annotations respectively). Models were trained
+with artefact-free images, and images with the artefacts were used as OOD test
+sets. For each OOD image, we created a counterfactual by manually removing the
+artefact via image processing, to assess the artefact's impact on the model's
+predictions. We show that OOD artefacts can boost a model's softmax confidence
+in its predictions, due to correlations in training data among other factors.
+This contradicts the common assumption that OOD artefacts should lead to more
+uncertain outputs, an assumption on which most confidence-based methods rely.
+We use this to explain why feature-based methods (e.g. Mahalanobis score)
+typically have greater OOD detection performance than confidence-based methods
+(e.g. MCP). However, we also show that feature-based methods typically perform
+worse at distinguishing between inputs that lead to correct and incorrect
+predictions (for both OOD and ID data). Following from these insights, we argue
+that a combination of feature-based and confidence-based methods should be used
+within DNN pipelines to mitigate their respective weaknesses. This project's
+code and OOD benchmarks are available at:
+https://github.com/HarryAnthony/Evaluating_OOD_detection.
+
+
+
+ comment: Accepted for the Uncertainty for Safe Utilization of Machine Learning
+ in Medical Imaging (UNSURE 2024) workshop at MICCAI 2024
+
+
+
+
+
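+
+ For context on the two score families compared above, the confidence-based MCP
+ and a feature-based Mahalanobis score can be written compactly. This is a
+ generic sketch (class-conditional Gaussians with a shared covariance), not the
+ exact configuration evaluated in the paper.
+
+```python
+import numpy as np
+
+def mcp_score(logits):
+    """Maximum class probability: higher = more in-distribution."""
+    z = logits - logits.max(axis=1, keepdims=True)
+    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
+    return p.max(axis=1)
+
+def mahalanobis_score(feats, train_feats, train_labels):
+    """Negative minimum Mahalanobis distance to class means: higher = more ID."""
+    classes = np.unique(train_labels)
+    mus = np.stack([train_feats[train_labels == c].mean(axis=0) for c in classes])
+    cov = np.cov(train_feats, rowvar=False) + 1e-6 * np.eye(train_feats.shape[1])
+    prec = np.linalg.inv(cov)
+    d = np.stack([np.einsum('ni,ij,nj->n', feats - mu, prec, feats - mu) for mu in mus])
+    return -d.min(axis=0)
+```
+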
+ + ☆ Estimation of Cardiac and Non-cardiac Diagnosis from Electrocardiogram + Features + + +
+ Introduction: Ensuring timely and accurate diagnosis of medical conditions is +paramount for effective patient care. Electrocardiogram (ECG) signals are +fundamental for evaluating a patient's cardiac health and are readily +available. Despite this, little attention has been given to the remarkable +potential of ECG data in detecting non-cardiac conditions. + Methods: In our study, we used publicly available datasets (MIMIC-IV-ECG-ICD +and ECG-VIEW II) to investigate the feasibility of inferring general diagnostic +conditions from ECG features. To this end, we trained a tree-based model +(XGBoost) based on ECG features and basic demographic features to estimate a +wide range of diagnoses, encompassing both cardiac and non-cardiac conditions. + Results: Our results demonstrate the reliability of estimating 23 cardiac as +well as 21 non-cardiac conditions above 0.7 AUROC in a statistically +significant manner across a wide range of physiological categories. Our +findings underscore the predictive potential of ECG data in identifying +well-known cardiac conditions. However, even more striking, this research +represents a pioneering effort in systematically expanding the scope of +ECG-based diagnosis to conditions not traditionally associated with the cardiac +system. + +
+
+ comment: 4 pages, source code under https://github.com/AI4HealthUOL/CardioDiag +
+
+
+
+
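+
+ The modelling setup above (a gradient-boosted tree classifier on tabular ECG
+ and demographic features, evaluated with AUROC) is easy to reproduce in
+ outline. The snippet below uses synthetic data and default hyperparameters, not
+ the authors' configuration.
+
+```python
+import numpy as np
+from xgboost import XGBClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score
+
+# Synthetic stand-in for ECG + demographic features and one binary diagnosis.
+rng = np.random.default_rng(0)
+X = rng.standard_normal((5000, 40))
+y = (X[:, 0] + 0.5 * X[:, 1] + rng.standard_normal(5000) > 0).astype(int)
+
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+model = XGBClassifier(n_estimators=300, max_depth=4, eval_metric="logloss")
+model.fit(X_tr, y_tr)
+print("AUROC:", roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]))
+```
+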
+ + ☆ Modularity in Transformers: Investigating Neuron Separability & + Specialization + + +
+ Transformer models are increasingly prevalent in various applications, yet +our understanding of their internal workings remains limited. This paper +investigates the modularity and task specialization of neurons within +transformer architectures, focusing on both vision (ViT) and language (Mistral +7B) models. Using a combination of selective pruning and MoEfication clustering +techniques, we analyze the overlap and specialization of neurons across +different tasks and data subsets. Our findings reveal evidence of task-specific +neuron clusters, with varying degrees of overlap between related tasks. We +observe that neuron importance patterns persist to some extent even in randomly +initialized models, suggesting an inherent structure that training refines. +Additionally, we find that neuron clusters identified through MoEfication +correspond more strongly to task-specific neurons in earlier and later layers +of the models. This work contributes to a more nuanced understanding of +transformer internals and offers insights into potential avenues for improving +model interpretability and efficiency. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Investigating Neuron Ablation in Attention Heads: The Case for Peak + Activation Centering + + +
+ The use of transformer-based models is growing rapidly throughout society. +With this growth, it is important to understand how they work, and in +particular, how the attention mechanisms represent concepts. Though there are +many interpretability methods, many look at models through their neuronal +activations, which are poorly understood. We describe different lenses through +which to view neuron activations, and investigate the effectiveness in language +models and vision transformers through various methods of neural ablation: zero +ablation, mean ablation, activation resampling, and a novel approach we term +'peak ablation'. Through experimental analysis, we find that in different +regimes and models, each method can offer the lowest degradation of model +performance compared to other methods, with resampling usually causing the most +significant performance deterioration. We make our code available at +https://github.com/nickypro/investigating-ablation. + +
+
+ comment: 9 pages, 2 figures, XAI World Conference 2024 Late-Breaking Work +
+
+
+
+
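+
+ The four ablation modes listed above differ only in the value substituted for a
+ neuron's activation. A schematic NumPy version follows; the histogram-mode
+ estimate of the "peak" is an assumption about how peak activation might be
+ computed, not the authors' exact procedure.
+
+```python
+import numpy as np
+
+def ablate(acts, mode="zero", reference_acts=None, rng=None):
+    """Return a replacement for a neuron's activations `acts` (1-D array)."""
+    rng = rng or np.random.default_rng(0)
+    if mode == "zero":
+        return np.zeros_like(acts)
+    if mode == "mean":
+        return np.full_like(acts, reference_acts.mean())
+    if mode == "resample":
+        return rng.choice(reference_acts, size=acts.shape)      # other-input activations
+    if mode == "peak":
+        hist, edges = np.histogram(reference_acts, bins=50)
+        peak = 0.5 * (edges[np.argmax(hist)] + edges[np.argmax(hist) + 1])
+        return np.full_like(acts, peak)
+    raise ValueError(mode)
+```
+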
+ + ☆ Fair Best Arm Identification with Fixed Confidence + + +
+ In this work, we present a novel framework for Best Arm Identification (BAI) +under fairness constraints, a setting that we refer to as \textit{F-BAI} (fair +BAI). Unlike traditional BAI, which solely focuses on identifying the optimal +arm with minimal sample complexity, F-BAI also includes a set of fairness +constraints. These constraints impose a lower limit on the selection rate of +each arm and can be either model-agnostic or model-dependent. For this setting, +we establish an instance-specific sample complexity lower bound and analyze the +\textit{price of fairness}, quantifying how fairness impacts sample complexity. +Based on the sample complexity lower bound, we propose F-TaS, an algorithm +provably matching the sample complexity lower bound, while ensuring that the +fairness constraints are satisfied. Numerical results, conducted using both a +synthetic model and a practical wireless scheduling application, show the +efficiency of F-TaS in minimizing the sample complexity while achieving low +fairness violations. + +
+
+
+
+
+ + ☆ Structuring a Training Strategy to Robustify Perception Models with + Realistic Image Augmentations + + +
+ Advancing Machine Learning (ML)-based perception models for autonomous +systems necessitates addressing weak spots within the models, particularly in +challenging Operational Design Domains (ODDs). These are environmental +operating conditions of an autonomous vehicle which can contain difficult +conditions, e.g., lens flare at night or objects reflected in a wet street. +This report introduces a novel methodology for training with augmentations to +enhance model robustness and performance in such conditions. The proposed +approach leverages customized physics-based augmentation functions, to generate +realistic training data that simulates diverse ODD scenarios. + We present a comprehensive framework that includes identifying weak spots in +ML models, selecting suitable augmentations, and devising effective training +strategies. The methodology integrates hyperparameter optimization and latent +space optimization to fine-tune augmentation parameters, ensuring they +maximally improve the ML models' performance. Experimental results demonstrate +improvements in model performance, as measured by commonly used metrics such as +mean Average Precision (mAP) and mean Intersection over Union (mIoU) on +open-source object detection and semantic segmentation models and datasets. + Our findings emphasize that optimal training strategies are model- and +data-specific and highlight the benefits of integrating augmentations into the +training pipeline. By incorporating augmentations, we observe enhanced +robustness of ML-based perception models, making them more resilient to edge +cases encountered in real-world ODDs. This work underlines the importance of +customized augmentations and offers an effective solution for improving the +safety and reliability of autonomous driving functions. + +
+
+
+
+
+ + ☆ Hybridizing Base-Line 2D-CNN Model with Cat Swarm Optimization for + Enhanced Advanced Persistent Threat Detection + + +
+ In the realm of cyber-security, detecting Advanced Persistent Threats (APTs) +remains a formidable challenge due to their stealthy and sophisticated nature. +This research paper presents an innovative approach that leverages +Convolutional Neural Networks (CNNs) with a 2D baseline model, enhanced by the +cutting-edge Cat Swarm Optimization (CSO) algorithm, to significantly improve +APT detection accuracy. By seamlessly integrating the 2D-CNN baseline model +with CSO, we unlock the potential for unprecedented accuracy and efficiency in +APT detection. The results unveil an impressive accuracy score of $98.4\%$, +marking a significant enhancement in APT detection across various attack +stages, illuminating a path forward in combating these relentless and +sophisticated threats. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Accelerating the discovery of steady-states of planetary interior + dynamics with machine learning + + +
+ Simulating mantle convection often requires reaching a computationally +expensive steady-state, crucial for deriving scaling laws for thermal and +dynamical flow properties and benchmarking numerical solutions. The strong +temperature dependence of the rheology of mantle rocks causes viscosity +variations of several orders of magnitude, leading to a slow-evolving stagnant +lid where heat conduction dominates, overlying a rapidly-evolving and strongly +convecting region. Time-stepping methods, while effective for fluids with +constant viscosity, are hindered by the Courant criterion, which restricts the +time step based on the system's maximum velocity and grid size. Consequently, +achieving steady-state requires a large number of time steps due to the +disparate time scales governing the stagnant and convecting regions. + We present a concept for accelerating mantle convection simulations using +machine learning. We generate a dataset of 128 two-dimensional simulations with +mixed basal and internal heating, and pressure- and temperature-dependent +viscosity. We train a feedforward neural network on 97 simulations to predict +steady-state temperature profiles. These can then be used to initialize +numerical time stepping methods for different simulation parameters. Compared +to typical initializations, the number of time steps required to reach +steady-state is reduced by a median factor of 3.75. The benefit of this method +lies in requiring very few simulations to train on, providing a solution with +no prediction error as we initialize a numerical method, and posing minimal +computational overhead at inference time. We demonstrate the effectiveness of +our approach and discuss the potential implications for accelerated simulations +for advancing mantle convection research. + +
+
+
+
+
+ + ☆ Stationary Policies are Optimal in Risk-averse Total-reward MDPs with + EVaR + + +
+ Optimizing risk-averse objectives in discounted MDPs is challenging because +most models do not admit direct dynamic programming equations and require +complex history-dependent policies. In this paper, we show that the risk-averse +{\em total reward criterion}, under the Entropic Risk Measure (ERM) and +Entropic Value at Risk (EVaR) risk measures, can be optimized by a stationary +policy, making it simple to analyze, interpret, and deploy. We propose +exponential value iteration, policy iteration, and linear programming to +compute optimal policies. In comparison with prior work, our results only +require the relatively mild condition of transient MDPs and allow for {\em +both} positive and negative rewards. Our results indicate that the total reward +criterion may be preferable to the discounted criterion in a broad range of +risk-averse reinforcement learning domains. + +
+
+
+
+
+ + ☆ Image-Perfect Imperfections: Safety, Bias, and Authenticity in the + Shadow of Text-To-Image Model Evolution + + +
+ Text-to-image models, such as Stable Diffusion (SD), undergo iterative +updates to improve image quality and address concerns such as safety. +Improvements in image quality are straightforward to assess. However, how model +updates resolve existing concerns and whether they raise new questions remain +unexplored. This study takes an initial step in investigating the evolution of +text-to-image models from the perspectives of safety, bias, and authenticity. +Our findings, centered on Stable Diffusion, indicate that model updates paint a +mixed picture. While updates progressively reduce the generation of unsafe +images, the bias issue, particularly in gender, intensifies. We also find that +negative stereotypes either persist within the same Non-White race group or +shift towards other Non-White race groups through SD updates, yet with minimal +association of these traits with the White race group. Additionally, our +evaluation reveals a new concern stemming from SD updates: State-of-the-art +fake image detectors, initially trained for earlier SD versions, struggle to +identify fake images generated by updated versions. We show that fine-tuning +these detectors on fake images generated by updated versions achieves at least +96.6\% accuracy across various SD versions, addressing this issue. Our insights +highlight the importance of continued efforts to mitigate biases and +vulnerabilities in evolving text-to-image models. + +
+
+ comment: To Appear in the ACM Conference on Computer and Communications + Security, October 14-18, 2024 +
+
+
+
+
+ + ☆ Minimax and Communication-Efficient Distributed Best Subset Selection + with Oracle Property + + +
+ The explosion of large-scale data in fields such as finance, e-commerce, and +social media has outstripped the processing capabilities of single-machine +systems, driving the need for distributed statistical inference methods. +Traditional approaches to distributed inference often struggle with achieving +true sparsity in high-dimensional datasets and involve high computational +costs. We propose a novel, two-stage, distributed best subset selection +algorithm to address these issues. Our approach starts by efficiently +estimating the active set while adhering to the $\ell_0$ norm-constrained +surrogate likelihood function, effectively reducing dimensionality and +isolating key variables. A refined estimation within the active set follows, +ensuring sparse estimates and matching the minimax $\ell_2$ error bound. We +introduce a new splicing technique for adaptive parameter selection to tackle +subproblems under $\ell_0$ constraints and a Generalized Information Criterion +(GIC). Our theoretical and numerical studies show that the proposed algorithm +correctly finds the true sparsity pattern, has the oracle property, and greatly +lowers communication costs. This is a big step forward in distributed sparse +estimation. + +
+
+
+
+
+ + ☆ The Transferability of Downsampling Sparse Graph Convolutional Networks + + +
+ In this paper, we propose a large-scale sparse graph downsampling method
+based on a sparse random graph model, which allows for the adjustment of
+different sparsity levels. We combine sparsity and topological similarity: the
+sparse graph model reduces the node connection probability as the graph size
+increases, while the downsampling method preserves a specific topological
+connection pattern during this change. Based on the downsampling method, we
+derive a theoretical transferability bound for downsampling sparse graph
+convolutional networks (GCNs), showing that higher sampling rates, greater
+average degree expectations, and smaller initial graph sizes lead to better
+downsampling transferability performance.
+
+
+
+
+
+
+ + ☆ Equation identification for fluid flows via physics-informed neural + networks ICML 2024 + + +
+ Scientific machine learning (SciML) methods such as physics-informed neural +networks (PINNs) are used to estimate parameters of interest from governing +equations and small quantities of data. However, there has been little work in +assessing how well PINNs perform for inverse problems across wide ranges of +governing equations across the mathematical sciences. We present a new and +challenging benchmark problem for inverse PINNs based on a parametric sweep of +the 2D Burgers' equation with rotational flow. We show that a novel strategy +that alternates between first- and second-order optimization proves superior to +typical first-order strategies for estimating parameters. In addition, we +propose a novel data-driven method to characterize PINN effectiveness in the +inverse setting. PINNs' physics-informed regularization enables them to +leverage small quantities of data more efficiently than the data-driven +baseline. However, both PINNs and the baseline can fail to recover parameters +for highly inviscid flows, motivating the need for further development of PINN +methods. + +
+
+ comment: Published at ICML 2024 AI4Science: + https://openreview.net/forum?id=XsvCLEYH3O +
+
+
+
+
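+
+ The alternating optimization strategy mentioned above (switching between a
+ first-order and a second-order optimizer) can be sketched generically in
+ PyTorch. The network, data, and step counts below are placeholders, not the
+ Burgers' benchmark from the paper.
+
+```python
+import torch
+
+# Placeholder network and loss; the point is only the Adam <-> L-BFGS alternation.
+model = torch.nn.Sequential(torch.nn.Linear(2, 64), torch.nn.Tanh(), torch.nn.Linear(64, 1))
+x = torch.rand(256, 2)
+y = torch.sin(x.sum(dim=1, keepdim=True))
+
+def loss_fn():
+    return ((model(x) - y) ** 2).mean()   # stand-in for data misfit + PDE residual
+
+adam = torch.optim.Adam(model.parameters(), lr=1e-3)
+lbfgs = torch.optim.LBFGS(model.parameters(), max_iter=50)
+
+for cycle in range(5):
+    for _ in range(500):                  # first-order phase
+        adam.zero_grad()
+        loss_fn().backward()
+        adam.step()
+    def closure():                        # second-order phase (L-BFGS needs a closure)
+        lbfgs.zero_grad()
+        loss = loss_fn()
+        loss.backward()
+        return loss
+    lbfgs.step(closure)
+```
+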
+ + ☆ Joint Estimation and Prediction of City-wide Delivery Demand: A Large + Language Model Empowered Graph-based Learning Approach + + +
+ The proliferation of e-commerce and urbanization has significantly +intensified delivery operations in urban areas, boosting the volume and +complexity of delivery demand. Data-driven predictive methods, especially those +utilizing machine learning techniques, have emerged to handle these +complexities in urban delivery demand management problems. One particularly +pressing problem that has not yet been sufficiently studied is the joint +estimation and prediction of city-wide delivery demand. To this end, we +formulate this problem as a graph-based spatiotemporal learning task. First, a +message-passing neural network model is formalized to capture the interaction +between demand patterns of associated regions. Second, by exploiting recent +advances in large language models, we extract general geospatial knowledge +encodings from the unstructured locational data and integrate them into the +demand predictor. Last, to encourage the cross-city transferability of the +model, an inductive training scheme is developed in an end-to-end routine. +Extensive empirical results on two real-world delivery datasets, including +eight cities in China and the US, demonstrate that our model significantly +outperforms state-of-the-art baselines in these challenging tasks. + +
+
+
+
+
+ + ☆ Self-supervised learning for crystal property prediction via denoising ICML 2024 + + +
+ Accurate prediction of the properties of crystalline materials is crucial for +targeted discovery, and this prediction is increasingly done with data-driven +models. However, for many properties of interest, the number of materials for +which a specific property has been determined is much smaller than the number +of known materials. To overcome this disparity, we propose a novel +self-supervised learning (SSL) strategy for material property prediction. Our +approach, crystal denoising self-supervised learning (CDSSL), pretrains +predictive models (e.g., graph networks) with a pretext task based on +recovering valid material structures when given perturbed versions of these +structures. We demonstrate that CDSSL models out-perform models trained without +SSL, across material types, properties, and dataset sizes. + +
+
+ comment: Published at ICML 2024 AI4Science: + https://openreview.net/forum?id=yML9ufAEoV +
+
+
+
+
+ + ☆ Learning and Verifying Maximal Taylor-Neural Lyapunov functions + + +
+ We introduce a novel neural network architecture, termed Taylor-neural +Lyapunov functions, designed to approximate Lyapunov functions with formal +certification. This architecture innovatively encodes local approximations and +extends them globally by leveraging neural networks to approximate the +residuals. Our method recasts the problem of estimating the largest region of +attraction - specifically for maximal Lyapunov functions - into a learning +problem, ensuring convergence around the origin through robust control theory. +Physics-informed machine learning techniques further refine the estimation of +the largest region of attraction. Remarkably, this method is versatile, +operating effectively even without simulated data points. We validate the +efficacy of our approach by providing numerical certificates of convergence +across multiple examples. Our proposed methodology not only competes closely +with state-of-the-art approaches, such as sum-of-squares and LyZNet, but also +achieves comparable results even in the absence of simulated data. This work +represents a significant advancement in control theory, with broad potential +applications in the design of stable control systems and beyond. + +
+
+
+
+
+ + ☆ Categorical data clustering: 25 years beyond K-modes + + +
+ The clustering of categorical data is a common and important task in computer +science, offering profound implications across a spectrum of applications. +Unlike purely numerical datasets, categorical data often lack inherent ordering +as in nominal data, or have varying levels of order as in ordinal data, thus +requiring specialized methodologies for efficient organization and analysis. +This review provides a comprehensive synthesis of categorical data clustering +in the past twenty-five years, starting from the introduction of K-modes. It +elucidates the pivotal role of categorical data clustering in diverse fields +such as health sciences, natural sciences, social sciences, education, +engineering and economics. Practical comparisons are conducted for algorithms +having public implementations, highlighting distinguishing clustering +methodologies and revealing the performance of recent algorithms on several +benchmark categorical datasets. Finally, challenges and opportunities in the +field are discussed. + +
+
+
+
+
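+
+ As a reminder of the baseline that anchors this review, a bare-bones K-modes
+ iteration (Hamming dissimilarity, mode-based centroid update) fits in a few
+ lines. This is a didactic sketch, not any of the surveyed implementations.
+
+```python
+import numpy as np
+
+def k_modes(X, k, n_iter=20, seed=0):
+    """X: (n, d) array of non-negative categorical codes. Returns labels, modes."""
+    rng = np.random.default_rng(seed)
+    modes = X[rng.choice(len(X), k, replace=False)].copy()
+    for _ in range(n_iter):
+        dist = (X[:, None, :] != modes[None, :, :]).sum(axis=2)   # Hamming distance
+        labels = dist.argmin(axis=1)
+        for j in range(k):
+            members = X[labels == j]
+            if len(members):
+                # New mode: most frequent category of each attribute in the cluster.
+                modes[j] = [np.bincount(col).argmax() for col in members.T]
+    return labels, modes
+
+X = np.random.default_rng(1).integers(0, 4, size=(200, 6))
+labels, modes = k_modes(X, k=3)
+```
+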
+ + ☆ Using Quantum Solved Deep Boltzmann Machines to Increase the Data + Efficiency of RL Agents + + +
+ Deep Learning algorithms, such as those used in Reinforcement Learning, often +require large quantities of data to train effectively. In most cases, the +availability of data is not a significant issue. However, for some contexts, +such as in autonomous cyber defence, we require data efficient methods. +Recently, Quantum Machine Learning and Boltzmann Machines have been proposed as +solutions to this challenge. In this work we build upon the pre-existing work +to extend the use of Deep Boltzmann Machines to the cutting edge algorithm +Proximal Policy Optimisation in a Reinforcement Learning cyber defence +environment. We show that this approach, when solved using a D-WAVE quantum +annealer, can lead to a two-fold increase in data efficiency. We therefore +expect it to be used by the machine learning and quantum communities who are +hoping to capitalise on data-efficient Reinforcement Learning methods. + +
+
+
+
+
+ + ☆ AI-Driven Intrusion Detection Systems (IDS) on the ROAD dataset: A + Comparative Analysis for automotive Controller Area Network (CAN) + + +
+ The integration of digital devices in modern vehicles has revolutionized +automotive technology, enhancing safety and the overall driving experience. The +Controller Area Network (CAN) bus is a central system for managing in-vehicle +communication between the electronic control units (ECUs). However, the CAN +protocol poses security challenges due to inherent vulnerabilities, lacking +encryption and authentication, which, combined with an expanding attack +surface, necessitates robust security measures. In response to this challenge, +numerous Intrusion Detection Systems (IDS) have been developed and deployed. +Nonetheless, an open, comprehensive, and realistic dataset to test the +effectiveness of such IDSs remains absent in the existing literature. This +paper addresses this gap by considering the latest ROAD dataset, containing +stealthy and sophisticated injections. The methodology involves dataset +labelling and the implementation of both state-of-the-art deep learning models +and traditional machine learning models to show the discrepancy in performance +between the datasets most commonly used in the literature and the ROAD dataset, +a more realistic alternative. + +
+
+
+
+
+ + ☆ Geometry of Lightning Self-Attention: Identifiability and Dimension + + +
+ We consider function spaces defined by self-attention networks without +normalization, and theoretically analyze their geometry. Since these networks +are polynomial, we rely on tools from algebraic geometry. In particular, we +study the identifiability of deep attention by providing a description of the +generic fibers of the parametrization for an arbitrary number of layers and, as +a consequence, compute the dimension of the function space. Additionally, for a +single-layer model, we characterize the singular and boundary points. Finally, +we formulate a conjectural extension of our results to normalized +self-attention networks, prove it for a single layer, and numerically verify it +in the deep case. + +
+
+
+
+
+ + ☆ Democratizing AI in Africa: FL for Low-Resource Edge Devices + + +
+ Africa faces significant challenges in healthcare delivery due to limited
+infrastructure and access to advanced medical technologies. This study explores
+the use of federated learning to overcome these barriers, focusing on perinatal
+health. We trained a fetal plane classifier using perinatal data from five
+African countries: Algeria, Ghana, Egypt, Malawi, and Uganda, along with data
+from Spanish hospitals. To incorporate the lack of computational resources in
+the analysis, we considered a heterogeneous set of devices, including a
+Raspberry Pi and several laptops, for model training. We demonstrate comparable
+performance between a centralized and a federated model, despite the compute
+limitations, and a significant improvement in model generalizability when
+compared to models trained only locally. These results show the potential for a
+future large-scale implementation of a federated learning platform to bridge
+the accessibility gap and improve model generalizability with minimal
+requirements.
+
+
+
+
+
+
+ + ☆ Towards Symbolic XAI -- Explanation Through Human Understandable Logical + Relationships Between Features + + +
+ Explainable Artificial Intelligence (XAI) plays a crucial role in fostering +transparency and trust in AI systems, where traditional XAI approaches +typically offer one level of abstraction for explanations, often in the form of +heatmaps highlighting single or multiple input features. However, we ask +whether abstract reasoning or problem-solving strategies of a model may also be +relevant, as these align more closely with how humans approach solutions to +problems. We propose a framework, called Symbolic XAI, that attributes +relevance to symbolic queries expressing logical relationships between input +features, thereby capturing the abstract reasoning behind a model's +predictions. The methodology is built upon a simple yet general multi-order +decomposition of model predictions. This decomposition can be specified using +higher-order propagation-based relevance methods, such as GNN-LRP, or +perturbation-based explanation methods commonly used in XAI. The effectiveness +of our framework is demonstrated in the domains of natural language processing +(NLP), vision, and quantum chemistry (QC), where abstract symbolic domain +knowledge is abundant and of significant interest to users. The Symbolic XAI +framework provides an understanding of the model's decision-making process that +is both flexible for customization by the user and human-readable through +logical formulas. + +
+
+
+
+
+ + ☆ Short-term Wind Speed Forecasting for Power Integration in Smart Grids + based on Hybrid LSSVM-SVMD Method + + +
+ Owing to its minimal pollution and efficient energy use, wind energy has
+become one of the most widely exploited renewable energy resources. The
+successful integration of wind power into the grid system is contingent upon
+accurate wind speed forecasting models. However, the task of wind speed
+forecasting is challenging due to the inherent intermittent characteristics of
+wind speed. In this paper, a hybrid machine learning approach is developed for
+predicting short-term wind speed. First, the wind data was decomposed into
+modal components using Successive Variational Mode Decomposition (SVMD). Then,
+each sub-signal was fitted with a Least Squares Support Vector Machine (LSSVM)
+model, with its hyperparameter optimized by a novel variant of Quantum-behaved
+Particle Swarm Optimization (QPSO), QPSO with elitist breeding (EBQPSO).
+Second, the residuals accounting for the differences between the original wind
+series and the aggregate of the SVMD modes were modeled using a long short-term
+memory (LSTM) network. Then, the overall predicted values were computed using
+the aggregate of the LSSVM and the LSTM models. Finally, the performance of the
+proposed model was compared against state-of-the-art benchmark models for
+forecasting wind speed using two separate data sets collected from a local wind
+farm. Empirical results show significant improvement in performance by the
+proposed method, achieving a 1.21% to 32.76% reduction in root mean square
+error (RMSE) and a 2.05% to 40.75% reduction in mean absolute error (MAE)
+compared to the benchmark methods. The entire code implementation of this work
+is freely available on GitHub.
+
+
+
+
+
+
+ + ☆ Identifying and Clustering Counter Relationships of Team Compositions in + PvP Games for Efficient Balance Analysis + + +
+ How can balance be quantified in game settings? This question is crucial for +game designers, especially in player-versus-player (PvP) games, where analyzing +the strength relations among predefined team compositions-such as hero +combinations in multiplayer online battle arena (MOBA) games or decks in card +games-is essential for enhancing gameplay and achieving balance. We have +developed two advanced measures that extend beyond the simplistic win rate to +quantify balance in zero-sum competitive scenarios. These measures are derived +from win value estimations, which employ strength rating approximations via the +Bradley-Terry model and counter relationship approximations via vector +quantization, significantly reducing the computational complexity associated +with traditional win value estimations. Throughout the learning process of +these models, we identify useful categories of compositions and pinpoint their +counter relationships, aligning with the experiences of human players without +requiring specific game knowledge. Our methodology hinges on a simple technique +to enhance codebook utilization in discrete representation with a deterministic +vector quantization process for an extremely small state space. Our framework +has been validated in popular online games, including Age of Empires II, +Hearthstone, Brawl Stars, and League of Legends. The accuracy of the observed +strength relations in these games is comparable to traditional pairwise win +value predictions, while also offering a more manageable complexity for +analysis. Ultimately, our findings contribute to a deeper understanding of PvP +game dynamics and present a methodology that significantly improves game +balance evaluation and design. + +
+
+ comment: TMLR 09/2024 https://openreview.net/forum?id=2D36otXvBE +
+
+
+
+
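+
+ The strength-rating component mentioned above follows the standard
+ Bradley-Terry model, P(i beats j) = sigma(s_i - s_j). A minimal gradient-ascent
+ fit on match records is shown below with made-up data and learning rate; it is
+ not the estimator used in the paper.
+
+```python
+import numpy as np
+
+def fit_bradley_terry(matches, n_comps, lr=0.1, epochs=200):
+    """matches: list of (winner_idx, loser_idx). Returns a strength per composition."""
+    s = np.zeros(n_comps)
+    for _ in range(epochs):
+        grad = np.zeros(n_comps)
+        for w, l in matches:
+            p = 1.0 / (1.0 + np.exp(-(s[w] - s[l])))   # modelled P(w beats l)
+            grad[w] += 1.0 - p                          # log-likelihood gradient
+            grad[l] -= 1.0 - p
+        s += lr * grad / len(matches)
+        s -= s.mean()                                   # fix the additive gauge freedom
+    return s
+
+strengths = fit_bradley_terry([(0, 1), (1, 2), (0, 2), (2, 1)], n_comps=3)
+```
+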
+ + ☆ SafeTail: Efficient Tail Latency Optimization in Edge Service Scheduling + via Computational Redundancy Management + + +
+ Optimizing tail latency while efficiently managing computational resources is +crucial for delivering high-performance, latency-sensitive services in edge +computing. Emerging applications, such as augmented reality, require +low-latency computing services with high reliability on user devices, which +often have limited computational capabilities. Consequently, these devices +depend on nearby edge servers for processing. However, inherent uncertainties +in network and computation latencies stemming from variability in wireless +networks and fluctuating server loads make service delivery on time +challenging. Existing approaches often focus on optimizing median latency but +fall short of addressing the specific challenges of tail latency in edge +environments, particularly under uncertain network and computational +conditions. Although some methods do address tail latency, they typically rely +on fixed or excessive redundancy and lack adaptability to dynamic network +conditions, often being designed for cloud environments rather than the unique +demands of edge computing. In this paper, we introduce SafeTail, a framework +that meets both median and tail response time targets, with tail latency +defined as latency beyond the 90^th percentile threshold. SafeTail addresses +this challenge by selectively replicating services across multiple edge servers +to meet target latencies. SafeTail employs a reward-based deep learning +framework to learn optimal placement strategies, balancing the need to achieve +target latencies with minimizing additional resource usage. Through +trace-driven simulations, SafeTail demonstrated near-optimal performance and +outperformed most baseline strategies across three diverse services. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Learning Multi-Target TDOA Features for Sound Event Localization and + Detection + + +
+ Sound event localization and detection (SELD) systems using audio recordings +from a microphone array rely on spatial cues for determining the location of +sound events. As a consequence, the localization performance of such systems is +to a large extent determined by the quality of the audio features that are used +as inputs to the system. We propose a new feature, based on neural generalized +cross-correlations with phase-transform (NGCC-PHAT), that learns audio +representations suitable for localization. Using permutation invariant training +for the time-difference of arrival (TDOA) estimation problem enables NGCC-PHAT +to learn TDOA features for multiple overlapping sound events. These features +can be used as a drop-in replacement for GCC-PHAT inputs to a SELD-network. We +test our method on the STARSS23 dataset and demonstrate improved localization +performance compared to using standard GCC-PHAT or SALSA-Lite input features. + +
+
+ comment: DCASE 2024 +
+
+
+
+
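+
+ For context, the classical GCC-PHAT feature that NGCC-PHAT learns to replace is
+ the inverse transform of the phase-normalized cross-power spectrum. A plain
+ NumPy version is given below; FFT length and search range are arbitrary choices
+ here.
+
+```python
+import numpy as np
+
+def gcc_phat(x1, x2, fs, max_tau=None):
+    """Return the GCC-PHAT correlation and the estimated TDOA between x1 and x2."""
+    n = 2 * max(len(x1), len(x2))
+    X1, X2 = np.fft.rfft(x1, n), np.fft.rfft(x2, n)
+    R = X1 * np.conj(X2)
+    R /= np.abs(R) + 1e-12                     # phase transform (PHAT) weighting
+    cc = np.fft.irfft(R, n)
+    max_shift = n // 2 if max_tau is None else int(fs * max_tau)
+    cc = np.concatenate((cc[-max_shift:], cc[: max_shift + 1]))
+    tdoa = (np.argmax(np.abs(cc)) - max_shift) / fs
+    return cc, tdoa
+```
+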
+ + ☆ Efficient Testable Learning of General Halfspaces with Adversarial Label + Noise COLT'24 + + +
+ We study the task of testable learning of general -- not necessarily
+homogeneous -- halfspaces with adversarial label noise with respect to the
+Gaussian distribution. In the testable learning framework, the goal is to
+develop a tester-learner such that if the data passes the tester, then one can
+trust the output of the robust learner on the data. Our main result is the
+first polynomial time tester-learner for general halfspaces that achieves
+dimension-independent misclassification error. At the heart of our approach is
+a new methodology to reduce testable learning of general halfspaces to testable
+learning of nearly homogeneous halfspaces that may be of broader interest.
+
+
+
+ comment: Presented to COLT'24 +
+
+
+
+
+ + ☆ The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by + Leveraging Second-Order Information + + +
+ The rising footprint of machine learning has led to a focus on imposing
+\emph{model sparsity} as a means of reducing computational and memory costs.
+For deep neural networks (DNNs), the state-of-the-art accuracy-vs-sparsity is
+achieved by heuristics inspired by the classical Optimal Brain Surgeon (OBS)
+framework~\citep{lecun90brain, hassibi1992second, hassibi1993optimal}, which
+leverages loss curvature information to make better pruning decisions. Yet,
+these results still lack a solid theoretical understanding, and it is unclear
+whether they can be improved by leveraging connections to the wealth of work on
+sparse recovery algorithms. In this paper, we draw new connections between
+these two areas and present new sparse recovery algorithms inspired by the OBS
+framework that come with theoretical guarantees under reasonable assumptions
+and have strong practical performance. Specifically, our work starts from the
+observation that we can leverage curvature information in an OBS-like fashion
+upon the projection step of classic iterative sparse recovery algorithms such
+as IHT. We show for the first time that this leads to improved convergence
+bounds under standard assumptions. Furthermore, we present extensions of this
+approach to the practical task of obtaining accurate sparse DNNs, and validate
+it experimentally at scale for Transformer-based models on vision and language
+tasks.
+
+
+
+
+
+
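+
+ The projection step referred to above comes from iterative hard thresholding
+ (IHT), whose vanilla form for a linear measurement model is shown below; the
+ curvature-aware projection the paper develops is not reproduced here.
+
+```python
+import numpy as np
+
+def iht(A, y, k, step=None, n_iter=200):
+    """Recover a k-sparse x from y ~ A @ x via iterative hard thresholding."""
+    m, d = A.shape
+    step = step or 1.0 / np.linalg.norm(A, 2) ** 2   # conservative step size
+    x = np.zeros(d)
+    for _ in range(n_iter):
+        x = x + step * A.T @ (y - A @ x)             # gradient step on 0.5*||y - Ax||^2
+        keep = np.argsort(np.abs(x))[-k:]            # indices of the k largest entries
+        mask = np.zeros(d, dtype=bool)
+        mask[keep] = True
+        x[~mask] = 0.0                               # hard-thresholding projection
+    return x
+
+rng = np.random.default_rng(0)
+A = rng.standard_normal((100, 400)) / 10.0
+x_true = np.zeros(400); x_true[:5] = 3.0
+x_hat = iht(A, A @ x_true, k=5)
+```
+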
+ + ☆ Deep Feature Embedding for Tabular Data ICONIP 2024 + + +
+ Tabular data learning has extensive applications in deep learning, but its
+existing embedding techniques for numerical and categorical features are
+limited, e.g., in their inability to capture complex relationships and in the
+feature engineering they require. This paper proposes a novel deep embedding
+framework that leverages lightweight deep neural networks to generate effective
+feature embeddings for tabular data in machine learning research. For numerical
+features, a two-step feature expansion and deep transformation technique is
+used to capture copious semantic information. For categorical features, a
+unique identification vector for each entity is retrieved from a compact lookup
+table, and a parameterized deep embedding function unifies the embedding
+dimensions before transforming the entry into an embedding vector with a deep
+neural network. Experiments are conducted on real-world datasets for
+performance evaluation.
+
+
+
+ comment: 15 pages, 2 figures, accepted to ICONIP 2024, Paper ID: 1399
+
+
+
+
+
+ + ☆ Investigating Privacy Leakage in Dimensionality Reduction Methods via + Reconstruction Attack + + +
+ This study investigates privacy leakage in dimensionality reduction methods +through a novel machine learning-based reconstruction attack. Employing an +\emph{informed adversary} threat model, we develop a neural network capable of +reconstructing high-dimensional data from low-dimensional embeddings. + We evaluate six popular dimensionality reduction techniques: PCA, sparse +random projection (SRP), multidimensional scaling (MDS), Isomap, $t$-SNE, and +UMAP. Using both MNIST and NIH Chest X-ray datasets, we perform a qualitative +analysis to identify key factors affecting reconstruction quality. Furthermore, +we assess the effectiveness of an additive noise mechanism in mitigating these +reconstruction attacks. + +
+
+
+
+
+ + ☆ The Many Faces of Optimal Weak-to-Strong Learning + + +
+ Boosting is an extremely successful idea, allowing one to combine multiple +low accuracy classifiers into a much more accurate voting classifier. In this +work, we present a new and surprisingly simple Boosting algorithm that obtains +a provably optimal sample complexity. Sample optimal Boosting algorithms have +only recently been developed, and our new algorithm has the fastest runtime +among all such algorithms and is the simplest to describe: Partition your +training data into 5 disjoint pieces of equal size, run AdaBoost on each, and +combine the resulting classifiers via a majority vote. In addition to this +theoretical contribution, we also perform the first empirical comparison of the +proposed sample optimal Boosting algorithms. Our pilot empirical study suggests +that our new algorithm might outperform previous algorithms on large data sets. + +
+
+
+
+
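+
+ The algorithm is described explicitly enough to sketch directly: split the
+ training set into five equal parts, run AdaBoost on each, and majority-vote the
+ five classifiers. The scikit-learn snippet below follows that description; the
+ base learner and AdaBoost settings are library defaults, not choices from the
+ paper.
+
+```python
+import numpy as np
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+
+X, y = make_classification(n_samples=5000, n_features=20, random_state=0)
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+
+# Train one AdaBoost classifier per disjoint fifth of the training data.
+voters = [AdaBoostClassifier(random_state=i).fit(Xp, yp)
+          for i, (Xp, yp) in enumerate(zip(np.array_split(X_tr, 5),
+                                           np.array_split(y_tr, 5)))]
+
+# Majority vote over the five classifiers (binary labels).
+votes = np.stack([clf.predict(X_te) for clf in voters])
+pred = (votes.mean(axis=0) > 0.5).astype(int)
+print("accuracy:", (pred == y_te).mean())
+```
+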
+ + ☆ Towards Hyper-parameter-free Federated Learning + + +
+ The adaptive synchronization techniques in federated learning (FL) for scaled +global model updates show superior performance over the vanilla federated +averaging (FedAvg) scheme. However, existing methods employ additional tunable +hyperparameters on the server to determine the scaling factor. A contrasting +approach is automated scaling analogous to tuning-free step-size schemes in +stochastic gradient descent (SGD) methods, which offer competitive convergence +rates and exhibit good empirical performance. In this work, we introduce two +algorithms for automated scaling of global model updates. In our first +algorithm, we establish that a descent-ensuring step-size regime at the clients +ensures descent for the server objective. We show that such a scheme enables +linear convergence for strongly convex federated objectives. Our second +algorithm shows that the average of objective values of sampled clients is a +practical and effective substitute for the objective function value at the +server required for computing the scaling factor, whose computation is +otherwise not permitted. Our extensive empirical results show that the proposed +methods perform at par or better than the popular federated learning algorithms +for both convex and non-convex problems. Our work takes a step towards +designing hyper-parameter-free federated learning. + +
+
+ comment: 28 pages, 3 figures +
+
+
+
+
+ + ☆ Flow Matching for Optimal Reaction Coordinates of Biomolecular System + + +
+ We present Flow Matching for Reaction Coordinates (FMRC), a novel deep +learning algorithm designed to identify optimal reaction coordinates (RC) in +biomolecular reversible dynamics. FMRC is based on the mathematical principles +of lumpability and decomposability, which we reformulate into a conditional +probability framework for efficient data-driven optimization using deep +generative models. While FMRC does not explicitly learn the well-established +transfer operator or its eigenfunctions, it can effectively encode the dynamics +of leading eigenfunctions of the system transfer operator into its +low-dimensional RC space. We further quantitatively compare its performance +with several state-of-the-art algorithms by evaluating the quality of Markov +State Models (MSM) constructed in their respective RC spaces, demonstrating the +superiority of FMRC in three increasingly complex biomolecular systems. +Finally, we discuss its potential applications in downstream applications such +as enhanced sampling methods and MSM construction. + +
+
+
+
+
+ + ☆ Controllable Edge-Type-Specific Interpretation in Multi-Relational Graph + Neural Networks for Drug Response Prediction + + +
+ Graph Neural Networks have been widely applied in critical decision-making +areas that demand interpretable predictions, leading to the flourishing +development of interpretability algorithms. However, current graph +interpretability algorithms tend to emphasize generality and often overlook +biological significance, thereby limiting their applicability in predicting +cancer drug responses. In this paper, we propose a novel post-hoc +interpretability algorithm for cancer drug response prediction, CETExplainer, +which incorporates a controllable edge-type-specific weighting mechanism. It +considers the mutual information between subgraphs and predictions, proposing a +structural scoring approach to provide fine-grained, biologically meaningful +explanations for predictive models. We also introduce a method for constructing +ground truth based on real-world datasets to quantitatively evaluate the +proposed interpretability algorithm. Empirical analysis on the real-world +dataset demonstrates that CETExplainer achieves superior stability and improves +explanation quality compared to leading algorithms, thereby offering a robust +and insightful tool for cancer drug prediction. + +
+
+
+
+
+ + ☆ Efficient Estimation of Unique Components in Independent Component + Analysis by Matrix Representation + + +
+ Independent component analysis (ICA) is a widely used method in various +applications of signal processing and feature extraction. It extends principal +component analysis (PCA) and can extract important and complicated components +with small variances. One of the major problems of ICA is that the uniqueness +of the solution is not guaranteed, unlike PCA. That is because there are many +local optima in optimizing the objective function of ICA. It has been shown +previously that the unique global optimum of ICA can be estimated from many +random initializations by handcrafted thread computation. In this paper, the +unique estimation of ICA is highly accelerated by reformulating the algorithm +in matrix representation and reducing redundant calculations. Experimental +results on artificial datasets and EEG data verified the efficiency of the +proposed method. + +
+
+
+
+
+ + ☆ Sparse Uncertainty-Informed Sampling from Federated Streaming Data + + +
+ We present a numerically robust, computationally efficient approach for +non-I.I.D. data stream sampling in federated client systems, where resources +are limited and labeled data for local model adaptation is sparse and +expensive. The proposed method identifies relevant stream observations to +optimize the underlying client model, given a local labeling budget, and +performs instantaneous labeling decisions without relying on any memory +buffering strategies. Our experiments show enhanced training batch diversity +and an improved numerical robustness of the proposal compared to existing +strategies over large-scale data streams, making our approach an effective and +convenient solution in FL environments. + +
+
+ comment: Preprint, 6 pages, 3 figures, Accepted for ESANN 2024 +
+
+
+
+
+ + ☆ RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation + and Retrieval-Guidance + + +
+ Diffusion-based models demonstrate impressive generation capabilities.
+However, they also have a massive number of parameters, resulting in enormous
+model sizes, thus making them unsuitable for deployment on resource-constrained
+devices. Block-wise generation can be a promising alternative for designing
+compact-sized (parameter-efficient) deep generative models since the model can
+generate one block at a time instead of generating the whole image at once.
+However, block-wise generation is also considerably challenging because
+ensuring coherence across generated blocks can be non-trivial. To this end, we
+design a retrieval-augmented generation (RAG) approach and leverage the
+corresponding blocks of the images retrieved by the RAG module to condition the
+training and generation stages of a block-wise denoising diffusion model. Our
+conditioning schemes ensure coherence across the different blocks during
+training and, consequently, during generation. While we showcase our approach
+using the latent diffusion model (LDM) as the base model, it can be used with
+other variants of denoising diffusion models. We validate that the proposed
+approach solves the coherence problem through substantive experiments that
+demonstrate its effectiveness in terms of compact model size and excellent
+generation quality.
+
+
+
+
+
+
+ + ☆ FissionVAE: Federated Non-IID Image Generation with Latent Space and + Decoder Decomposition + + +
+ Federated learning is a machine learning paradigm that enables decentralized +clients to collaboratively learn a shared model while keeping all the training +data local. While considerable research has focused on federated image +generation, particularly Generative Adversarial Networks, Variational +Autoencoders have received less attention. In this paper, we address the +challenges of non-IID (independently and identically distributed) data +environments featuring multiple groups of images of different types. +Specifically, heterogeneous data distributions can lead to difficulties in +maintaining a consistent latent space and can also result in local generators +with disparate texture features being blended during aggregation. We introduce +a novel approach, FissionVAE, which decomposes the latent space and constructs +decoder branches tailored to individual client groups. This method allows for +customized learning that aligns with the unique data distributions of each +group. Additionally, we investigate the incorporation of hierarchical VAE +architectures and demonstrate the use of heterogeneous decoder architectures +within our model. We also explore strategies for setting the latent prior +distributions to enhance the decomposition process. To evaluate our approach, +we assemble two composite datasets: the first combines MNIST and FashionMNIST; +the second comprises RGB datasets of cartoon and human faces, wild animals, +marine vessels, and remote sensing images of Earth. Our experiments demonstrate +that FissionVAE greatly improves generation quality on these datasets compared +to baseline federated VAE models. + +
+
+
+
+
+ + ☆ Instant Adversarial Purification with Adversarial Consistency + Distillation + + +
+ Neural networks, despite their remarkable performance in widespread
+applications, including image classification, are also known to be vulnerable
+to subtle adversarial noise. Although some diffusion-based purification methods
+have been proposed, for example, DiffPure, those methods are time-consuming. In
+this paper, we propose One Step Control Purification (OSCP), a diffusion-based
+purification model that can purify the adversarial image in one Neural Function
+Evaluation (NFE) in diffusion models. We use Latent Consistency Model (LCM) and
+ControlNet for our one-step purification. OSCP is computationally friendly and
+time efficient compared to other diffusion-based purification methods; we
+achieve a defense success rate of 74.19\% on ImageNet, only requiring 0.1s for
+each purification. Moreover, there is a fundamental incongruence between
+consistency distillation and adversarial perturbation. To address this
+ontological dissonance, we propose Gaussian Adversarial Noise Distillation
+(GAND), a novel consistency distillation framework that facilitates a more
+nuanced reconciliation of the latent space dynamics, effectively bridging the
+natural and adversarial manifolds. Our experiments show that GAND does not
+need a full fine-tune (FFT); parameter-efficient fine-tuning (PEFT), e.g.,
+LoRA, is sufficient.
+
+
+
+
+
+
+ + ☆ A Survey of the Self Supervised Learning Mechanisms for Vision + Transformers + + +
+ Deep supervised learning models require a high volume of labeled data to
+attain sufficiently good results. However, gathering and annotating such big
+data is costly and laborious. Recently, the application of self-supervised
+learning (SSL) in vision tasks has gained significant attention. The intuition
+behind SSL is to exploit the synchronous relationships within the data as a
+form of self-supervision, which can be versatile. In the current big data era,
+most of the data is unlabeled, and the success of SSL thus relies on finding
+ways to leverage this vast amount of unlabeled data. It is therefore better for
+deep learning algorithms to reduce reliance on human supervision and instead
+focus on self-supervision based on the inherent relationships within the data.
+With the advent of ViTs, which have achieved remarkable results in computer
+vision, it is crucial to explore and understand the various SSL mechanisms
+employed for training these models, specifically in scenarios where less
+labeled data is available. In this survey, we thus develop a comprehensive
+taxonomy that systematically classifies SSL techniques based on their
+representations and the pre-training tasks being applied. Additionally, we
+discuss the motivations behind SSL, review popular pre-training tasks, and
+highlight the challenges and advancements in this field. Furthermore, we
+present a comparative analysis of different SSL methods, evaluate their
+strengths and limitations, and identify potential avenues for future research.
+
+
+
+ comment: 34 Pages, 5 Figures, 7 Tables +
+
+
+
+
+ + ☆ Estimating Conditional Average Treatment Effects via Sufficient + Representation Learning + + +
+ Estimating conditional average treatment effects (CATE) is very important
+in causal inference and has a wide range of applications across many fields. In
+the estimation process of CATE, the unconfoundedness assumption is typically
+required to ensure the identifiability of the regression problems. When
+estimating CATE using high-dimensional data, there have been many variable
+selection methods and neural network approaches based on representation
+learning, but these methods do not provide a way to verify whether the subset
+of variables after dimensionality reduction or the learned representations
+still satisfy the unconfoundedness assumption during the estimation process,
+which can lead to ineffective estimates of the treatment effects. Additionally,
+these methods typically use data from only the treatment or control group when
+estimating the regression functions for each group. This paper proposes a novel
+neural network approach named \textbf{CrossNet} to learn a sufficient
+representation for the features, based on which we then estimate the CATE,
+where "cross" indicates that, when estimating the regression function for each
+group, we use data from that group as well as cross-utilized data from the
+other group. Numerical simulations and empirical results demonstrate that our
+method outperforms the competitive approaches.
+
+
+
+
+
+
+ + ☆ Error-controlled non-additive interaction discovery in machine learning + models + + +
+ Machine learning (ML) models are powerful tools for detecting complex +patterns within data, yet their "black box" nature limits their +interpretability, hindering their use in critical domains like healthcare and +finance. To address this challenge, interpretable ML methods have been +developed to explain how features influence model predictions. However, these +methods often focus on univariate feature importance, overlooking the complex +interactions between features that ML models are capable of capturing. +Recognizing this limitation, recent efforts have aimed to extend these methods +to discover feature interactions, but existing approaches struggle with +robustness and error control, especially under data perturbations. In this +study, we introduce Diamond, a novel method for trustworthy feature interaction +discovery. Diamond uniquely integrates the model-X knockoffs framework to +control the false discovery rate (FDR), ensuring that the proportion of falsely +discovered interactions remains low. We further address the challenges of using +off-the-shelf interaction importance measures by proposing a calibration +procedure that refines these measures to maintain the desired FDR. Diamond's +applicability spans a wide range of ML models, including deep neural networks, +tree-based models, and factorization-based models. Our empirical evaluations on +both simulated and real datasets across various biomedical studies demonstrate +Diamond's utility in enabling more reliable data-driven scientific discoveries. +This method represents a significant step forward in the deployment of ML +models for scientific innovation and hypothesis generation. + +
+
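+ As a rough illustration of the FDR-control machinery that Diamond builds on,
+the sketch below applies a standard knockoff(+)-style selection threshold to
+hypothetical signed importance statistics W_j for candidate interactions
+(positive when the real interaction beats its knockoff copy). How Diamond
+computes and calibrates these statistics is not reproduced here; all values
+are made up.
+```python
+import numpy as np
+
+def knockoff_select(W, q=0.1):
+    """Knockoff+-style threshold: smallest t with estimated FDP <= q."""
+    W = np.asarray(W, dtype=float)
+    thresholds = np.sort(np.abs(W[W != 0]))
+    for t in thresholds:
+        fdp_hat = (1 + np.sum(W <= -t)) / max(1, np.sum(W >= t))
+        if fdp_hat <= q:
+            return np.where(W >= t)[0]      # indices of selected interactions
+    return np.array([], dtype=int)          # nothing survives at level q
+
+W = np.array([3.0, 2.5, 2.2, 1.8, -0.4, 0.3, -0.2, 1.5])  # hypothetical statistics
+print(knockoff_select(W, q=0.2))            # -> [0 1 2 3 7]
+```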
+
+
+
+ + ☆ Disease Classification and Impact of Pretrained Deep Convolution Neural + Networks on Diverse Medical Imaging Datasets across Imaging Modalities + + +
+ Imaging techniques such as chest X-rays, whole slide images, and optical
+coherence tomography serve as the initial screening and detection tools for a
+wide variety of medical pulmonary and ophthalmic conditions, respectively.
+This paper investigates the intricacies of using pretrained deep convolutional
+neural networks with transfer learning across diverse medical imaging datasets
+of varying modalities for binary and multiclass classification. We conducted a
+comprehensive performance analysis of ten network architectures and model
+families, each with pretraining and with random initialization. Our findings
+show that using pretrained models as fixed feature extractors yields poor
+performance irrespective of the dataset; in contrast, histopathology
+microscopy whole slide images show better performance. We also found that
+deeper and more complex architectures did not necessarily result in the best
+performance, which implies that improvements on ImageNet do not translate
+directly to medical imaging tasks. Within a medical domain, the performance of
+network architectures varies within model families as datasets shift. This
+indicates that the performance of models on a specific modality may not be
+conclusive for another modality within the same domain. This study provides a
+deeper understanding of the applications of deep learning techniques in
+medical imaging and highlights the impact of pretrained networks across
+different medical imaging datasets under five different experimental settings.
+
+
+
+ comment: 15 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ Improving Time Series Classification with Representation Soft Label + Smoothing + + +
+ Previous research has indicated that deep neural network-based models for
+time series classification (TSC) tasks are prone to overfitting. This issue
+can be mitigated by employing strategies that prevent the model from becoming
+overly confident in its predictions, such as label smoothing and confidence
+penalty. Building upon the concept of label smoothing, we propose a novel
+approach to generate more reliable soft labels, which we refer to as
+representation soft label smoothing. We apply label smoothing, confidence
+penalty, and our representation soft label smoothing method to several TSC
+models and compare their performance with a baseline method that uses only
+hard labels for training. Our results demonstrate that the use of these
+enhancement techniques yields competitive results compared to the baseline
+method. Importantly, our method demonstrates strong performance across models
+with varying structures and complexities.
+
+
+
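+ For readers unfamiliar with the baseline being extended, the sketch below
+contrasts classic label smoothing with one plausible way of deriving soft
+labels from learned representations; the paper's exact "representation soft
+label smoothing" construction may differ, and all names and constants here are
+illustrative.
+```python
+import numpy as np
+
+def smooth_labels(y, num_classes, eps=0.1):
+    """Classic label smoothing: mix one-hot targets with a uniform distribution."""
+    one_hot = np.eye(num_classes)[y]
+    return (1.0 - eps) * one_hot + eps / num_classes
+
+def representation_soft_labels(reps, y, num_classes, temperature=1.0):
+    """Soft labels from similarity of each sample's representation to per-class
+    mean representations (an assumed, simplified construction)."""
+    class_means = np.stack([reps[y == c].mean(axis=0) for c in range(num_classes)])
+    logits = reps @ class_means.T / temperature   # (n_samples, num_classes)
+    logits -= logits.max(axis=1, keepdims=True)   # numerical stability
+    probs = np.exp(logits)
+    return probs / probs.sum(axis=1, keepdims=True)
+
+rng = np.random.default_rng(0)
+y = np.array([0, 1, 2] * 4)                       # 12 toy samples, 3 classes
+reps = rng.normal(size=(12, 8))                   # toy learned representations
+print(smooth_labels(y, 3)[0])                     # approx. [0.933 0.033 0.033]
+print(representation_soft_labels(reps, y, 3)[0])
+```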
+ comment: 14 pages, 6 figures
+
+
+
+
+
+ + ☆ Evaluation of Table Representations to Answer Questions from Tables in + Documents : A Case Study using 3GPP Specifications + + +
+ With the ubiquitous use of document corpora for question answering, one
+important aspect that is especially relevant for technical documents is the
+ability to extract information from tables that are interspersed with text.
+The major challenge in this is that, unlike free-flowing text or an isolated
+set of tables, the representation of a table in terms of what constitutes a
+relevant chunk is not obvious. We conduct a series of experiments examining
+various representations of tabular data interspersed with text to understand
+the relative benefits of different representations. We choose a corpus of
+$3^{rd}$ Generation Partnership Project (3GPP) documents since they are
+heavily interspersed with tables. We create an expert-curated dataset of
+question-answer pairs to evaluate our approach. We conclude that row-level
+representations, with the corresponding table header information included in
+every cell, improve retrieval performance, thus leveraging the structural
+information present in the tabular data.
+
+
+
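+ The row-level representation described above can be made concrete with a
+small sketch: each table row becomes one retrievable chunk in which every cell
+is paired with its column header. The parameter names and values below are
+invented placeholders, not quotes from the 3GPP specifications.
+```python
+header = ["Parameter", "Value", "Applicability"]
+rows = [
+    ["maxNrofServingCells", "32", "NR only"],
+    ["maxNrofSCells", "31", "NR only"],
+]
+
+def row_level_chunks(header, rows):
+    """One retrieval chunk per row, repeating the column header next to each cell."""
+    return [" | ".join(f"{col}: {val}" for col, val in zip(header, row)) for row in rows]
+
+for chunk in row_level_chunks(header, rows):
+    print(chunk)
+# Parameter: maxNrofServingCells | Value: 32 | Applicability: NR only
+# Parameter: maxNrofSCells | Value: 31 | Applicability: NR only
+```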
+ comment: 10 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ A Tighter Convergence Proof of Reverse Experience Replay + + +
+ In reinforcement learning, Reverse Experience Replay (RER) is a recently
+proposed algorithm that attains better sample complexity than the classic
+experience replay method. RER requires the learning algorithm to update the
+parameters through consecutive state-action-reward tuples in reverse order.
+However, the most recent theoretical analysis only holds for a minimal
+learning rate and short consecutive steps, a regime that converges more slowly
+than large-learning-rate algorithms without RER. In view of this gap between
+theory and practice, we provide a tighter analysis that mitigates the
+limitation on the learning rate and the length of consecutive steps.
+Furthermore, we show theoretically that RER converges with a larger learning
+rate and a longer sequence.
+
+
+
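+ A minimal tabular sketch of the reverse-update idea (not the paper's
+analysis): Q-learning updates are applied over a stored trajectory in reverse
+temporal order, so reward information propagates backwards in a single sweep.
+The toy chain MDP and constants are assumptions.
+```python
+import numpy as np
+
+def rer_update(Q, trajectory, alpha=0.5, gamma=0.9):
+    """Apply Q-learning updates over one trajectory in reverse order."""
+    for s, a, r, s_next in reversed(trajectory):
+        target = r + gamma * Q[s_next].max()
+        Q[s, a] += alpha * (target - Q[s, a])
+    return Q
+
+n_states, n_actions = 4, 2
+Q = np.zeros((n_states, n_actions))
+# Toy chain: only the final transition is rewarded.
+trajectory = [(0, 1, 0.0, 1), (1, 1, 0.0, 2), (2, 1, 1.0, 3)]
+Q = rer_update(Q, trajectory)
+print(Q)   # reward information already reaches state 0 after one reverse sweep
+```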
+ comment: This paper is accepted at RLC 2024 +
+
+
+
+
+ + ☆ A Scalable k-Medoids Clustering via Whale Optimization Algorithm + + +
+ Unsupervised clustering has emerged as a critical tool for uncovering hidden +patterns and insights from vast, unlabeled datasets. However, traditional +methods like Partitioning Around Medoids (PAM) struggle with scalability due to +their quadratic computational complexity. To address this limitation, we +introduce WOA-kMedoids, a novel unsupervised clustering method that +incorporates the Whale Optimization Algorithm (WOA), a nature-inspired +metaheuristic inspired by the hunting strategies of humpback whales. By +optimizing centroid selection, WOA-kMedoids reduces computational complexity of +the k-medoids algorithm from quadratic to near-linear with respect to the +number of observations. This improvement in efficiency enables WOA-kMedoids to +be scalable to large datasets while maintaining high clustering accuracy. We +evaluated the performance of WOA-kMedoids on 25 diverse time series datasets +from the UCR archive. Our empirical results demonstrate that WOA-kMedoids +maintains clustering accuracy similar to PAM. While WOA-kMedoids exhibited +slightly higher runtime than PAM on small datasets (less than 300 +observations), it outperformed PAM in computational efficiency on larger +datasets. The scalability of WOA-kMedoids, combined with its consistently high +accuracy, positions it as a promising and practical choice for unsupervised +clustering in big data applications. WOA-kMedoids has implications for +efficient knowledge discovery in massive, unlabeled datasets across various +domains. + +
+
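+ To make the objective concrete, here is a hedged sketch of metaheuristic
+medoid search: candidate medoid sets are perturbed and kept whenever they
+lower the k-medoids cost. A plain random perturbation stands in for the
+whale-inspired encircling and spiral operators, so this illustrates the cost
+structure being optimized rather than the authors' WOA-kMedoids algorithm.
+```python
+import numpy as np
+
+def kmedoids_cost(X, medoid_idx):
+    """Total distance of every point to its nearest medoid."""
+    d = np.linalg.norm(X[:, None, :] - X[medoid_idx][None, :, :], axis=2)
+    return d.min(axis=1).sum()
+
+def metaheuristic_kmedoids(X, k=2, iters=200, seed=0):
+    rng = np.random.default_rng(seed)
+    best = rng.choice(len(X), size=k, replace=False)
+    best_cost = kmedoids_cost(X, best)
+    for _ in range(iters):
+        cand = best.copy()
+        cand[rng.integers(k)] = rng.integers(len(X))   # move one medoid at random
+        if len(set(cand)) == k:                        # keep medoids distinct
+            cost = kmedoids_cost(X, cand)
+            if cost < best_cost:
+                best, best_cost = cand, cost
+    return best, best_cost
+
+rng = np.random.default_rng(1)
+X = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(5, 1, (20, 2))])  # two clusters
+print(metaheuristic_kmedoids(X, k=2))
+```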
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ☆ From Model Explanation to Data Misinterpretation: Uncovering the + Pitfalls of Post Hoc Explainers in Business Research + + +
+ Machine learning models have been increasingly used in business research.
+However, most state-of-the-art machine learning models, such as deep neural
+networks and XGBoost, are black boxes in nature. Therefore, post hoc
+explainers that provide explanations for machine learning models by, for
+example, estimating the numerical importance of the input features, have been
+gaining wide usage. Although post hoc explainers are intended to explain
+machine learning models, we found a growing trend in business research where
+post hoc explanations are used to draw inferences about the data. In this
+work, we investigate the validity of such use. Specifically, we investigate
+with extensive experiments whether the explanations obtained by the two most
+popular post hoc explainers, SHAP and LIME, provide correct information about
+the true marginal effects of X on Y in the data, which we call data-alignment.
+We then identify what factors influence the alignment of explanations.
+Finally, we propose a set of mitigation strategies to improve the
+data-alignment of explanations and demonstrate their effectiveness with
+real-world data in an econometric context. Despite this effort, we
+nevertheless conclude that it is often not appropriate to infer data insights
+from post hoc explanations. We articulate appropriate alternative uses, the
+most important of which is to facilitate the proposition and subsequent
+empirical investigation of hypotheses. The ultimate goal of this paper is to
+caution business researchers against translating post hoc explanations of
+machine learning models into potentially false insights and understanding of
+data.
+
+
+
+
+
+
+ + ☆ The Sample-Communication Complexity Trade-off in Federated Q-Learning + + +
+ We consider the problem of federated Q-learning, where $M$ agents aim to +collaboratively learn the optimal Q-function of an unknown infinite-horizon +Markov decision process with finite state and action spaces. We investigate the +trade-off between sample and communication complexities for the widely used +class of intermittent communication algorithms. We first establish the converse +result, where it is shown that a federated Q-learning algorithm that offers any +speedup with respect to the number of agents in the per-agent sample complexity +needs to incur a communication cost of at least an order of +$\frac{1}{1-\gamma}$ up to logarithmic factors, where $\gamma$ is the discount +factor. We also propose a new algorithm, called Fed-DVR-Q, which is the first +federated Q-learning algorithm to simultaneously achieve order-optimal sample +and communication complexities. Thus, together these results provide a complete +characterization of the sample-communication complexity trade-off in federated +Q-learning. + +
+
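+ The intermittent-communication pattern analyzed above can be sketched as
+follows: M agents run local Q-updates on their own samples and periodically
+average their Q-tables, with the number of averaging rounds standing in for
+communication cost. This toy example is not the Fed-DVR-Q algorithm and uses
+an assumed random MDP.
+```python
+import numpy as np
+
+def sample_transition(s, a, rng):
+    """A tiny random MDP: reward only for action 0 in state 0."""
+    return (1.0 if (s == 0 and a == 0) else 0.0), rng.integers(3)
+
+def federated_q(n_states, n_actions, M=4, rounds=200, sync_every=20,
+                alpha=0.1, gamma=0.9, seed=0):
+    rng = np.random.default_rng(seed)
+    Q = [np.zeros((n_states, n_actions)) for _ in range(M)]
+    comms = 0
+    for t in range(rounds):
+        for m in range(M):                     # local updates on each agent
+            s, a = rng.integers(n_states), rng.integers(n_actions)
+            r, s_next = sample_transition(s, a, rng)
+            Q[m][s, a] += alpha * (r + gamma * Q[m][s_next].max() - Q[m][s, a])
+        if (t + 1) % sync_every == 0:          # intermittent communication step
+            avg = sum(Q) / M
+            Q = [avg.copy() for _ in range(M)]
+            comms += 1
+    return Q[0], comms
+
+Q_final, n_comms = federated_q(n_states=3, n_actions=2)
+print(n_comms, Q_final.round(2))               # 10 averaging (communication) rounds
+```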
+
+
+
+ + ☆ Training Ultra Long Context Language Model with Fully Pipelined + Distributed Transformer + + +
+ Large Language Models (LLMs) with long context capabilities are integral to +complex tasks in natural language processing and computational biology, such as +text generation and protein sequence analysis. However, training LLMs directly +on extremely long contexts demands considerable GPU resources and increased +memory, leading to higher costs and greater complexity. Alternative approaches +that introduce long context capabilities via downstream finetuning or +adaptations impose significant design limitations. In this paper, we propose +Fully Pipelined Distributed Transformer (FPDT) for efficiently training +long-context LLMs with extreme hardware efficiency. For GPT and Llama models, +we achieve a 16x increase in sequence length that can be trained on the same +hardware compared to current state-of-the-art solutions. With our dedicated +sequence chunk pipeline design, we can now train 8B LLM with 2 million sequence +length on only 4 GPUs, while also maintaining over 55% of MFU. Our proposed +FPDT is agnostic to existing training techniques and is proven to work +efficiently across different LLM models. + +
+
+
+
+
+ + ☆ Technical Report of HelixFold3 for Biomolecular Structure Prediction + + +
+ The AlphaFold series has transformed protein structure prediction with +remarkable accuracy, often matching experimental methods. AlphaFold2, +AlphaFold-Multimer, and the latest AlphaFold3 represent significant strides in +predicting single protein chains, protein complexes, and biomolecular +structures. While AlphaFold2 and AlphaFold-Multimer are open-sourced, +facilitating rapid and reliable predictions, AlphaFold3 remains partially +accessible through a limited online server and has not been open-sourced, +restricting further development. To address these challenges, the PaddleHelix +team is developing HelixFold3, aiming to replicate AlphaFold3's capabilities. +Using insights from previous models and extensive datasets, HelixFold3 achieves +an accuracy comparable to AlphaFold3 in predicting the structures of +conventional ligands, nucleic acids, and proteins. The initial release of +HelixFold3 is available as open source on GitHub for academic research, +promising to advance biomolecular research and accelerate discoveries. We also +provide online service at PaddleHelix website at +https://paddlehelix.baidu.com/app/all/helixfold3/forecast. + +
+
+
+
+
+ + ☆ Point Neuron Learning: A New Physics-Informed Neural Network + Architecture + + +
+ Machine learning and neural networks have advanced numerous research
+domains, but challenges such as large training data requirements and
+inconsistent model performance hinder their application to certain scientific
+problems. To overcome these challenges, researchers have investigated
+integrating physics principles into machine learning models, mainly through:
+(i) physics-guided loss functions, generally termed physics-informed neural
+networks, and (ii) physics-guided architectural design. While both approaches
+have demonstrated success across multiple scientific disciplines, they have
+limitations, including being trapped in local minima, poor interpretability,
+and restricted generalizability. This paper proposes a new physics-informed
+neural network (PINN) architecture that combines the strengths of both
+approaches by embedding the fundamental solution of the wave equation into the
+network architecture, enabling the learned model to strictly satisfy the wave
+equation. The proposed point neuron learning method can model an arbitrary
+sound field based on microphone observations without any dataset. Compared to
+other PINN methods, our approach directly processes complex numbers and offers
+better interpretability and generalizability. We evaluate the versatility of
+the proposed architecture on a sound field reconstruction problem in a
+reverberant environment. Results indicate that the point neuron method
+outperforms two competing methods and can efficiently handle noisy
+environments with sparse microphone observations.
+
+
+
+ comment: under the review process of EURASIP Journal on Audio, Speech, and + Music Processing +
+
+
+
+
+ + ☆ UserSumBench: A Benchmark Framework for Evaluating User Summarization + Approaches + + +
+ Large language models (LLMs) have shown remarkable capabilities in generating +user summaries from a long list of raw user activity data. These summaries +capture essential user information such as preferences and interests, and +therefore are invaluable for LLM-based personalization applications, such as +explainable recommender systems. However, the development of new summarization +techniques is hindered by the lack of ground-truth labels, the inherent +subjectivity of user summaries, and human evaluation which is often costly and +time-consuming. To address these challenges, we introduce \UserSumBench, a +benchmark framework designed to facilitate iterative development of LLM-based +summarization approaches. This framework offers two key components: (1) A +reference-free summary quality metric. We show that this metric is effective +and aligned with human preferences across three diverse datasets (MovieLens, +Yelp and Amazon Review). (2) A novel robust summarization method that leverages +time-hierarchical summarizer and self-critique verifier to produce high-quality +summaries while eliminating hallucination. This method serves as a strong +baseline for further innovation in summarization techniques. + +
+
+
+
+
+ + ☆ Discovery of False Data Injection Schemes on Frequency Controllers with + Reinforcement Learning + + +
+ While inverter-based distributed energy resources (DERs) play a crucial role +in integrating renewable energy into the power system, they concurrently +diminish the grid's system inertia, elevating the risk of frequency +instabilities. Furthermore, smart inverters, interfaced via communication +networks, pose a potential vulnerability to cyber threats if not diligently +managed. To proactively fortify the power grid against sophisticated cyber +attacks, we propose to employ reinforcement learning (RL) to identify potential +threats and system vulnerabilities. This study concentrates on analyzing +adversarial strategies for false data injection, specifically targeting smart +inverters involved in primary frequency control. Our findings demonstrate that +an RL agent can adeptly discern optimal false data injection methods to +manipulate inverter settings, potentially causing catastrophic consequences. + +
+
+
+
+
+ + ☆ An Empirical Study of Scaling Laws for Transfer + + +
+ We present a limited empirical study of scaling laws for transfer learning in +transformer models. More specifically, we examine a scaling law that +incorporates a "transfer gap" term, indicating the effectiveness of +pre-training on one distribution when optimizing for downstream performance on +another distribution. When the transfer gap is low, pre-training is a +cost-effective strategy for improving downstream performance. Conversely, when +the gap is high, collecting high-quality fine-tuning data becomes relatively +more cost effective. Fitting the scaling law to experiments from diverse +datasets reveals significant variations in the transfer gap across +distributions. In theory, the scaling law can inform optimal data allocation +strategies and highlights how the scarcity of downstream data can bottleneck +performance. Our findings contribute to a principled way to measure transfer +learning efficiency and understand how data availability affects capabilities. + +
+
+
+
+
+ + ♻ ☆ Quantum Distance Approximation for Persistence Diagrams + + +
+ Topological Data Analysis methods can be useful for classification and
+clustering tasks in many different fields, as they can provide two-dimensional
+persistence diagrams that summarize important information about the shape of
+potentially complex and high-dimensional data sets. The space of persistence
+diagrams can be endowed with various metrics, such as the Wasserstein
+distance, which admit a statistical structure and allow these summaries to be
+used in machine learning algorithms. However, computing the distance between
+two persistence diagrams involves finding an optimal way to match the points
+of the two diagrams, and may not always be an easy task for classical
+computers. In this work we explore the potential of quantum computers to
+estimate the distance between persistence diagrams; in particular, we propose
+variational quantum algorithms for the Wasserstein distance as well as the
+$d^{c}_{p}$ distance. Our implementation is a weighted version of the Quantum
+Approximate Optimization Algorithm that relies on control clauses to encode
+the constraints of the optimization problem.
+
+
+
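+ For context, the classical (non-quantum) computation that the variational
+algorithms above would approximate is an assignment problem in which diagram
+points may also be matched to their projections onto the diagonal. A hedged
+sketch with a Euclidean ground metric (conventions for the ground metric and
+the diagonal vary across references):
+```python
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def wasserstein_pd(D1, D2, q=2):
+    """q-Wasserstein distance between persistence diagrams of (birth, death) pairs."""
+    D1, D2 = np.asarray(D1, float), np.asarray(D2, float)
+    n, m = len(D1), len(D2)
+    diag1 = (D1[:, 1] - D1[:, 0]) / np.sqrt(2)   # distance of each point to the diagonal
+    diag2 = (D2[:, 1] - D2[:, 0]) / np.sqrt(2)
+    C = np.zeros((n + m, n + m))
+    C[:n, :m] = np.linalg.norm(D1[:, None, :] - D2[None, :, :], axis=2) ** q
+    C[:n, m:] = np.tile(diag1[:, None] ** q, (1, n))   # D1 point -> diagonal
+    C[n:, :m] = np.tile(diag2[None, :] ** q, (m, 1))   # D2 point -> diagonal
+    rows, cols = linear_sum_assignment(C)              # optimal matching
+    return C[rows, cols].sum() ** (1.0 / q)
+
+D1 = [(0.0, 1.0), (0.2, 0.5)]
+D2 = [(0.0, 1.1)]
+print(wasserstein_pd(D1, D2))   # approx. 0.23
+```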
+ comment: 39 pages, 12 figures, 2 tables, submitted to Journal of Physics: + Complexity +
+
+
+
+
+ + ♻ ☆ A Survey on Knowledge Editing of Neural Networks + + +
+ Deep neural networks are becoming increasingly pervasive in academia and +industry, matching and surpassing human performance on a wide variety of fields +and related tasks. However, just as humans, even the largest artificial neural +networks make mistakes, and once-correct predictions can become invalid as the +world progresses in time. Augmenting datasets with samples that account for +mistakes or up-to-date information has become a common workaround in practical +applications. However, the well-known phenomenon of catastrophic forgetting +poses a challenge in achieving precise changes in the implicitly memorized +knowledge of neural network parameters, often requiring a full model +re-training to achieve desired behaviors. That is expensive, unreliable, and +incompatible with the current trend of large self-supervised pre-training, +making it necessary to find more efficient and effective methods for adapting +neural network models to changing data. To address this need, knowledge editing +is emerging as a novel area of research that aims to enable reliable, +data-efficient, and fast changes to a pre-trained target model, without +affecting model behaviors on previously learned tasks. In this survey, we +provide a brief review of this recent artificial intelligence field of +research. We first introduce the problem of editing neural networks, formalize +it in a common framework and differentiate it from more notorious branches of +research such as continuous learning. Next, we provide a review of the most +relevant knowledge editing approaches and datasets proposed so far, grouping +works under four different families: regularization techniques, meta-learning, +direct model editing, and architectural strategies. Finally, we outline some +intersections with other fields of research and potential directions for future +works. + +
+
+
+
+
+ + ♻ ☆ Evaluating Named Entity Recognition: A comparative analysis of mono- and + multilingual transformer models on a novel Brazilian corporate earnings call + transcripts dataset + + +
+ Since 2018, when the Transformer architecture was introduced, Natural +Language Processing has gained significant momentum with pre-trained +Transformer-based models that can be fine-tuned for various tasks. Most models +are pre-trained on large English corpora, making them less applicable to other +languages, such as Brazilian Portuguese. In our research, we identified two +models pre-trained in Brazilian Portuguese (BERTimbau and PTT5) and two +multilingual models (mBERT and mT5). BERTimbau and mBERT use only the Encoder +module, while PTT5 and mT5 use both the Encoder and Decoder. Our study aimed to +evaluate their performance on a financial Named Entity Recognition (NER) task +and determine the computational requirements for fine-tuning and inference. To +this end, we developed the Brazilian Financial NER (BraFiNER) dataset, +comprising sentences from Brazilian banks' earnings calls transcripts annotated +using a weakly supervised approach. Additionally, we introduced a novel +approach that reframes the token classification task as a text generation +problem. After fine-tuning the models, we evaluated them using performance and +error metrics. Our findings reveal that BERT-based models consistently +outperform T5-based models. While the multilingual models exhibit comparable +macro F1-scores, BERTimbau demonstrates superior performance over PTT5. In +terms of error metrics, BERTimbau outperforms the other models. We also +observed that PTT5 and mT5 generated sentences with changes in monetary and +percentage values, highlighting the importance of accuracy and consistency in +the financial domain. Our findings provide insights into the differing +performance of BERT- and T5-based models for the NER task. + +
+
+
+
+
+ + ♻ ☆ Can We Remove the Square-Root in Adaptive Gradient Methods? A + Second-Order Perspective ICML 2024 + + +
+ Adaptive gradient optimizers like Adam(W) are the default training algorithms +for many deep learning architectures, such as transformers. Their diagonal +preconditioner is based on the gradient outer product which is incorporated +into the parameter update via a square root. While these methods are often +motivated as approximate second-order methods, the square root represents a +fundamental difference. In this work, we investigate how the behavior of +adaptive methods changes when we remove the root, i.e., strengthen their +second-order motivation. Surprisingly, we find that such square-root-free +adaptive methods close the generalization gap to SGD on convolutional +architectures, while maintaining their root-based counterpart's performance on +transformers. The second-order perspective also has practical benefits for +developing non-diagonal methods that can incorporate arbitrary curvature +approximations through the concept of preconditioner invariance. In contrast to +root-based methods like Shampoo, root-free counterparts work well and fast with +half-precision since they do not require numerically unstable matrix root +decompositions and inversions. Overall, our findings provide new insights into +the development of adaptive methods and raise important questions regarding the +overlooked role of adaptivity in their success. (experiment code: +https://github.com/yorkerlin/remove-the-square-root optimizer code: +https://github.com/f-dangel/sirfshampoo) + +
+
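+ A minimal sketch of the contrast discussed above: the same exponential
+moving average preconditioner applied with and without the square root in the
+denominator. Learning-rate scaling, bias correction, and the paper's full
+second-order derivation (including non-diagonal, Shampoo-style variants) are
+omitted; in practice the root-free variant would need its own hyperparameter
+tuning.
+```python
+import numpy as np
+
+def adaptive_step(theta, grad, v, lr=1e-2, beta2=0.999, eps=1e-8, root=True):
+    """One diagonal adaptive update; `root` toggles the square root."""
+    v = beta2 * v + (1 - beta2) * grad ** 2          # preconditioner statistics
+    denom = np.sqrt(v) + eps if root else v + eps    # with vs. without the root
+    return theta - lr * grad / denom, v
+
+theta = np.array([1.0, -2.0])
+v = np.zeros_like(theta)
+grad = np.array([0.1, -0.3])
+print(adaptive_step(theta, grad, v, root=True)[0])   # root-based step
+print(adaptive_step(theta, grad, v, root=False)[0])  # square-root-free step
+```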
+ comment: A long version of the ICML 2024 paper. Added root-free update schemes + for n-dim tensor cases +
+
+
+
+
+ + ♻ ☆ Hoaxpedia: A Unified Wikipedia Hoax Articles Dataset + + +
+ Hoaxes are a recognised form of disinformation created deliberately, with
+potentially serious implications for the credibility of reference knowledge
+resources such as Wikipedia. What makes detecting Wikipedia hoaxes hard is
+that they are often written according to the official style guidelines. In
+this work, we first provide a systematic analysis of similarities and
+discrepancies between legitimate and hoax Wikipedia articles, and introduce
+Hoaxpedia, a collection of 311 hoax articles (from existing literature and
+official Wikipedia lists) together with semantically similar legitimate
+articles, which jointly form a binary text classification dataset aimed at
+fostering research in automated hoax detection. We report results after
+analyzing several language models, hoax-to-legit ratios, and the amount of
+text classifiers are exposed to (full article vs. the article's definition
+alone). Our results suggest that detecting deceitful content in Wikipedia
+based on content alone is hard but feasible. We complement our analysis with a
+study of the differences in edit-history distributions, and find that looking
+at this feature yields better classification results than content.
+
+
+
+
+
+
+ + ♻ ☆ Recursive Estimation of Conditional Kernel Mean Embeddings + + +
+ Kernel mean embeddings, a widely used technique in machine learning, map +probability distributions to elements of a reproducing kernel Hilbert space +(RKHS). For supervised learning problems, where input-output pairs are +observed, the conditional distribution of outputs given the inputs is a key +object. The input dependent conditional distribution of an output can be +encoded with an RKHS valued function, the conditional kernel mean map. In this +paper we present a new recursive algorithm to estimate the conditional kernel +mean map in a Hilbert space valued $L_2$ space, that is in a Bochner space. We +prove the weak and strong $L_2$ consistency of our recursive estimator under +mild conditions. The idea is to generalize Stone's theorem for Hilbert space +valued regression in a locally compact Polish space. We present new insights +about conditional kernel mean embeddings and give strong asymptotic bounds +regarding the convergence of the proposed recursive method. Finally, the +results are demonstrated on three application domains: for inputs coming from +Euclidean spaces, Riemannian manifolds and locally compact subsets of function +spaces. + +
+
+
+
+
+ + ♻ ☆ Complexity of High-Dimensional Identity Testing with Coordinate + Conditional Sampling + + +
+ We study the identity testing problem for high-dimensional distributions. +Given as input an explicit distribution $\mu$, an $\varepsilon>0$, and access +to sampling oracle(s) for a hidden distribution $\pi$, the goal in identity +testing is to distinguish whether the two distributions $\mu$ and $\pi$ are +identical or are at least $\varepsilon$-far apart. When there is only access to +full samples from the hidden distribution $\pi$, it is known that exponentially +many samples (in the dimension) may be needed for identity testing, and hence +previous works have studied identity testing with additional access to various +"conditional" sampling oracles. We consider a significantly weaker conditional +sampling oracle, which we call the $\mathsf{Coordinate\ Oracle}$, and provide a +computational and statistical characterization of the identity testing problem +in this new model. + We prove that if an analytic property known as approximate tensorization of +entropy holds for an $n$-dimensional visible distribution $\mu$, then there is +an efficient identity testing algorithm for any hidden distribution $\pi$ using +$\tilde{O}(n/\varepsilon)$ queries to the $\mathsf{Coordinate\ Oracle}$. +Approximate tensorization of entropy is a pertinent condition as recent works +have established it for a large class of high-dimensional distributions. We +also prove a computational phase transition: for a well-studied class of +$n$-dimensional distributions, specifically sparse antiferromagnetic Ising +models over $\{+1,-1\}^n$, we show that in the regime where approximate +tensorization of entropy fails, there is no efficient identity testing +algorithm unless $\mathsf{RP}=\mathsf{NP}$. We complement our results with a +matching $\Omega(n/\varepsilon)$ statistical lower bound for the sample +complexity of identity testing in the $\mathsf{Coordinate\ Oracle}$ model. + +
+
+
+
+
+ + ♻ ☆ Learning Dynamic Bayesian Networks from Data: Foundations, First + Principles and Numerical Comparisons + + +
+ In this paper, we present a guide to the foundations of learning Dynamic +Bayesian Networks (DBNs) from data in the form of multiple samples of +trajectories for some length of time. We present the formalism for a generic as +well as a set of common types of DBNs for particular variable distributions. We +present the analytical form of the models, with a comprehensive discussion on +the interdependence between structure and weights in a DBN model and their +implications for learning. Next, we give a broad overview of learning methods +and describe and categorize them based on the most important statistical +features, and how they treat the interplay between learning structure and +weights. We give the analytical form of the likelihood and Bayesian score +functions, emphasizing the distinction from the static case. We discuss +functions used in optimization to enforce structural requirements. We briefly +discuss more complex extensions and representations. Finally we present a set +of comparisons in different settings for various distinct but representative +algorithms across the variants. + +
+
+
+
+
+ + ♻ ☆ LightFF: Lightweight Inference for Forward-Forward Algorithm + + +
+ The human brain performs tasks with an outstanding energy efficiency, i.e., +with approximately 20 Watts. The state-of-the-art Artificial/Deep Neural +Networks (ANN/DNN), on the other hand, have recently been shown to consume +massive amounts of energy. The training of these ANNs/DNNs is done almost +exclusively based on the back-propagation algorithm, which is known to be +biologically implausible. This has led to a new generation of forward-only +techniques, including the Forward-Forward algorithm. In this paper, we propose +a lightweight inference scheme specifically designed for DNNs trained using the +Forward-Forward algorithm. We have evaluated our proposed lightweight inference +scheme in the case of the MNIST and CIFAR datasets, as well as two real-world +applications, namely, epileptic seizure detection and cardiac arrhythmia +classification using wearable technologies, where complexity overheads/energy +consumption is a major constraint, and demonstrate its relevance. Our code is +available at https://github.com/AminAminifar/LightFF. + +
+
+
+
+
+ + ♻ ☆ Learning the irreversible progression trajectory of Alzheimer's disease + + +
+ Alzheimer's disease (AD) is a progressive and irreversible brain disorder
+that unfolds over the course of 30 years. Therefore, it is critical to capture
+the disease progression at an early stage so that intervention can be applied
+before the onset of symptoms. Machine learning (ML) models have been shown to
+be effective in predicting the onset of AD. Yet for subjects with follow-up
+visits, existing techniques for AD classification only aim for accurate group
+assignment, and the monotonically increasing risk across follow-up visits is
+usually ignored. The resulting fluctuating risk scores across visits violate
+the irreversibility of AD, hampering the trustworthiness of models and
+providing little value for understanding disease progression. To address this
+issue, we propose a novel regularization approach to predict AD
+longitudinally. Our technique aims to maintain the expected monotonicity of
+increasing disease risk during progression while preserving expressiveness.
+Specifically, we introduce a monotonicity constraint that encourages the model
+to predict disease risk in a consistent and ordered manner across follow-up
+visits. We evaluate our method using the longitudinal structural MRI and
+amyloid-PET imaging data from the Alzheimer's Disease Neuroimaging Initiative
+(ADNI). Our model outperforms existing techniques in capturing the progressive
+nature of disease risk while preserving prediction accuracy.
+
+
+
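+ One simple way to encode the monotonicity idea described above is a hinge
+penalty on decreases in predicted risk between consecutive visits; the exact
+regularizer used in the paper may differ, and the weight below is an
+assumption for illustration.
+```python
+import numpy as np
+
+def monotonicity_penalty(risks_per_visit, lam=1.0):
+    """risks_per_visit: (n_subjects, n_visits) predicted risks, time-ordered."""
+    diffs = np.diff(risks_per_visit, axis=1)        # risk[t+1] - risk[t]
+    return lam * np.clip(-diffs, 0.0, None).sum()   # penalize only decreases
+
+pred = np.array([[0.20, 0.35, 0.30],    # risk drops at the last visit -> penalized
+                 [0.10, 0.20, 0.40]])   # monotone -> no penalty
+print(monotonicity_penalty(pred))       # approx. 0.05, all from the first subject
+```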
+ comment: accepted by ISBI 2024 +
+
+
+
+
+ + ♻ ☆ Parameters Inference for Nonlinear Wave Equations with Markovian + Switching + + +
+ Traditional partial differential equations with constant coefficients often +struggle to capture abrupt changes in real-world phenomena, leading to the +development of variable coefficient PDEs and Markovian switching models. +Recently, research has introduced the concept of PDEs with Markov switching +models, established their well-posedness and presented numerical methods. +However, there has been limited discussion on parameter estimation for the jump +coefficients in these models. This paper addresses this gap by focusing on +parameter inference for the wave equation with Markovian switching. We propose +a Bayesian statistical framework using discrete sparse Bayesian learning to +establish its convergence and a uniform error bound. Our method requires fewer +assumptions and enables independent parameter inference for each segment by +allowing different underlying structures for the parameter estimation problem +within each segmented time interval. The effectiveness of our approach is +demonstrated through three numerical cases, which involve noisy spatiotemporal +data from different wave equations with Markovian switching. The results show +strong performance in parameter estimation for variable coefficient PDEs. + +
+
+
+
+
+ + ♻ ☆ Effectiveness of probabilistic contact tracing in epidemic containment: + the role of super-spreaders and transmission path reconstruction + + +
+ The recent COVID-19 pandemic underscores the significance of early-stage +non-pharmacological intervention strategies. The widespread use of masks and +the systematic implementation of contact tracing strategies provide a +potentially equally effective and socially less impactful alternative to more +conventional approaches, such as large-scale mobility restrictions. However, +manual contact tracing faces strong limitations in accessing the network of +contacts, and the scalability of currently implemented protocols for +smartphone-based digital contact tracing becomes impractical during the rapid +expansion phases of the outbreaks, due to the surge in exposure notifications +and associated tests. A substantial improvement in digital contact tracing can +be obtained through the integration of probabilistic techniques for risk +assessment that can more effectively guide the allocation of new diagnostic +tests. In this study, we first quantitatively analyze the diagnostic and social +costs associated with these containment measures based on contact tracing, +employing three state-of-the-art models of SARS-CoV-2 spreading. Our results +suggest that probabilistic techniques allow for more effective mitigation at a +lower cost. Secondly, our findings reveal a remarkable efficacy of +probabilistic contact-tracing techniques in performing backward and multi-step +tracing and capturing super-spreading events. + +
+
+
+
+
+ + ♻ ☆ Foundational Models for Pathology and Endoscopy Images: Application for + Gastric Inflammation + + +
+ The integration of artificial intelligence (AI) in medical diagnostics +represents a significant advancement in managing upper gastrointestinal (GI) +cancer, a major cause of global cancer mortality. Specifically for gastric +cancer (GC), chronic inflammation causes changes in the mucosa such as atrophy, +intestinal metaplasia (IM), dysplasia and ultimately cancer. Early detection +through endoscopic regular surveillance is essential for better outcomes. +Foundation models (FM), which are machine or deep learning models trained on +diverse data and applicable to broad use cases, offer a promising solution to +enhance the accuracy of endoscopy and its subsequent pathology image analysis. +This review explores the recent advancements, applications, and challenges +associated with FM in endoscopy and pathology imaging. We started by +elucidating the core principles and architectures underlying these models, +including their training methodologies and the pivotal role of large-scale data +in developing their predictive capabilities. Moreover, this work discusses +emerging trends and future research directions, emphasizing the integration of +multimodal data, the development of more robust and equitable models, and the +potential for real-time diagnostic support. This review aims to provide a +roadmap for researchers and practitioners in navigating the complexities of +incorporating FM into clinical practice for prevention/management of GC cases, +thereby improving patient outcomes. + +
+
+
+
+
+ + ♻ ☆ A Newton-CG based barrier-augmented Lagrangian method for general + nonconvex conic optimization + + +
+ In this paper we consider finding an approximate second-order stationary +point (SOSP) of general nonconvex conic optimization that minimizes a twice +differentiable function subject to nonlinear equality constraints and also a +convex conic constraint. In particular, we propose a Newton-conjugate gradient +(Newton-CG) based barrier-augmented Lagrangian method for finding an +approximate SOSP of this problem. Under some mild assumptions, we show that our +method enjoys a total inner iteration complexity of $\widetilde{\cal +O}(\epsilon^{-11/2})$ and an operation complexity of $\widetilde{\cal +O}(\epsilon^{-11/2}\min\{n,\epsilon^{-5/4}\})$ for finding an +$(\epsilon,\sqrt{\epsilon})$-SOSP of general nonconvex conic optimization with +high probability. Moreover, under a constraint qualification, these complexity +bounds are improved to $\widetilde{\cal O}(\epsilon^{-7/2})$ and +$\widetilde{\cal O}(\epsilon^{-7/2}\min\{n,\epsilon^{-3/4}\})$, respectively. +To the best of our knowledge, this is the first study on the complexity of +finding an approximate SOSP of general nonconvex conic optimization. +Preliminary numerical results are presented to demonstrate superiority of the +proposed method over first-order methods in terms of solution quality. + +
+
+ comment: To appear in Computational Optimization and Applications. arXiv admin + note: text overlap with arXiv:2301.03139 +
+
+
+
+
+ + ♻ ☆ Invariant Causal Prediction with Local Models + + +
+ We consider the task of identifying the causal parents of a target variable +among a set of candidates from observational data. Our main assumption is that +the candidate variables are observed in different environments which may, under +certain assumptions, be regarded as interventions on the observed system. We +assume a linear relationship between target and candidates, which can be +different in each environment with the only restriction that the causal +structure is invariant across environments. Within our proposed setting we +provide sufficient conditions for identifiability of the causal parents and +introduce a practical method called L-ICP ($\textbf{L}$ocalized +$\textbf{I}$nvariant $\textbf{Ca}$usal $\textbf{P}$rediction), which is based +on a hypothesis test for parent identification using a ratio of minimum and +maximum statistics. We then show in a simplified setting that the statistical +power of L-ICP converges exponentially fast in the sample size, and finally we +analyze the behavior of L-ICP experimentally in more general settings. + +
+
+
+
+
+ + ♻ ☆ On the Curse of Memory in Recurrent Neural Networks: Approximation and + Optimization Analysis + + +
+ We study the approximation properties and optimization dynamics of recurrent +neural networks (RNNs) when applied to learn input-output relationships in +temporal data. We consider the simple but representative setting of using +continuous-time linear RNNs to learn from data generated by linear +relationships. Mathematically, the latter can be understood as a sequence of +linear functionals. We prove a universal approximation theorem of such linear +functionals, and characterize the approximation rate and its relation with +memory. Moreover, we perform a fine-grained dynamical analysis of training +linear RNNs, which further reveal the intricate interactions between memory and +learning. A unifying theme uncovered is the non-trivial effect of memory, a +notion that can be made precise in our framework, on approximation and +optimization: when there is long term memory in the target, it takes a large +number of neurons to approximate it. Moreover, the training process will suffer +from slow downs. In particular, both of these effects become exponentially more +pronounced with memory - a phenomenon we call the "curse of memory". These +analyses represent a basic step towards a concrete mathematical understanding +of new phenomenon that may arise in learning temporal relationships using +recurrent architectures. + +
+
+ comment: Updated to include the condition $\sup_n \| \boldsymbol{x}(n) + \|_{\mathcal{X}} \leq 1$ in the definition of regularity, which excludes the + trivial case where only the zero functional is regular. Fixed various typos + and improved clarity +
+
+
+
+
+ + ♻ ☆ Wasserstein multivariate auto-regressive models for modeling + distributional time series + + +
+ This paper is focused on the statistical analysis of data consisting of a +collection of multiple series of probability measures that are indexed by +distinct time instants and supported over a bounded interval of the real line. +By modeling these time-dependent probability measures as random objects in the +Wasserstein space, we propose a new auto-regressive model for the statistical +analysis of multivariate distributional time series. Using the theory of +iterated random function systems, results on the existence, uniqueness and +stationarity of the solution of such a model are provided. We also propose a +consistent estimator for the auto-regressive coefficients of this model. Due to +the simplex constraints that we impose on the model coefficients, the proposed +estimator that is learned under these constraints, naturally has a sparse +structure. The sparsity allows the application of the proposed model in +learning a graph of temporal dependency from multivariate distributional time +series. We explore the numerical performances of our estimation procedure using +simulated data. To shed some light on the benefits of our approach for real +data analysis, we also apply this methodology to a data set made of +observations from age distribution in different countries. + +
+
+
+
+
+ + ♻ ☆ Scalable Multi-Agent Reinforcement Learning for Warehouse Logistics with + Robotic and Human Co-Workers IROS + + +
+ We consider a warehouse in which dozens of mobile robots and human pickers +work together to collect and deliver items within the warehouse. The +fundamental problem we tackle, called the order-picking problem, is how these +worker agents must coordinate their movement and actions in the warehouse to +maximise performance in this task. Established industry methods using heuristic +approaches require large engineering efforts to optimise for innately variable +warehouse configurations. In contrast, multi-agent reinforcement learning +(MARL) can be flexibly applied to diverse warehouse configurations (e.g. size, +layout, number/types of workers, item replenishment frequency), and different +types of order-picking paradigms (e.g. Goods-to-Person and Person-to-Goods), as +the agents can learn how to cooperate optimally through experience. We develop +hierarchical MARL algorithms in which a manager agent assigns goals to worker +agents, and the policies of the manager and workers are co-trained toward +maximising a global objective (e.g. pick rate). Our hierarchical algorithms +achieve significant gains in sample efficiency over baseline MARL algorithms +and overall pick rates over multiple established industry heuristics in a +diverse set of warehouse configurations and different order-picking paradigms. + +
+
+ comment: IEEE/RSJ International Conference on Intelligent Robots and Systems + (IROS), 2024 +
+
+
+
+
+ + ♻ ☆ A Deep-Learning Technique to Locate Cryptographic Operations in + Side-Channel Traces DATE24 + + +
+ Side-channel attacks allow extracting secret information from the execution
+of cryptographic primitives by correlating the partially known computed data
+and the measured side-channel signal. However, to set up a successful
+side-channel attack, the attacker has to perform i) the challenging task of
+locating the time instant in which the target cryptographic primitive is
+executed inside a side-channel trace, and then ii) the time-alignment of the
+measured data to that time instant. This paper presents a novel deep-learning
+technique to locate the time instant in which the target cryptographic
+operations are executed in the side-channel trace. In contrast to
+state-of-the-art solutions, the proposed methodology works even in the
+presence of trace deformations obtained through random delay insertion
+techniques. We validated our proposal through a successful attack against a
+variety of unprotected and protected cryptographic primitives executed on an
+FPGA-implemented system-on-chip featuring a RISC-V CPU.
+
+
+
+ comment: 6 pages, 3 figures. Presented at DATE24 +
+
+
+
+
+ + ♻ ☆ Object-Centric Diffusion for Efficient Video Editing ECCV24 + + +
+ Diffusion-based video editing has reached impressive quality and can
+transform the global style, local structure, and attributes of given video
+inputs, following textual edit prompts. However, such solutions typically
+incur heavy memory and computational costs to generate temporally coherent
+frames, whether in the form of diffusion inversion and/or cross-frame
+attention. In this paper, we conduct an analysis of such inefficiencies, and
+suggest simple yet effective modifications that allow significant speed-ups
+whilst maintaining quality. Moreover, we introduce Object-Centric Diffusion to
+fix generation artifacts and further reduce latency by allocating more
+computation towards foreground edited regions, which are arguably more
+important for perceptual quality. We achieve this with two novel proposals: i)
+Object-Centric Sampling, which decouples the diffusion steps spent on salient
+and background regions and spends most of them on the former, and ii)
+Object-Centric Token Merging, which reduces the cost of cross-frame attention
+by fusing redundant tokens in unimportant background regions. Both techniques
+are readily applicable to a given video editing model without retraining, and
+can drastically reduce its memory and computational cost. We evaluate our
+proposals on inversion-based and control-signal-based editing pipelines, and
+show a latency reduction of up to 10x for comparable synthesis quality.
+Project page: qualcomm-ai-research.github.io/object-centric-diffusion.
+
+
+
+ comment: ECCV24 +
+
+
+
+
+ + ♻ ☆ Fast Fishing: Approximating BAIT for Efficient and Scalable Deep Active + Image Classification ECML + + +
+ Deep active learning (AL) seeks to minimize the annotation costs for training +deep neural networks. BAIT, a recently proposed AL strategy based on the Fisher +Information, has demonstrated impressive performance across various datasets. +However, BAIT's high computational and memory requirements hinder its +applicability on large-scale classification tasks, resulting in current +research neglecting BAIT in their evaluation. This paper introduces two methods +to enhance BAIT's computational efficiency and scalability. Notably, we +significantly reduce its time complexity by approximating the Fisher +Information. In particular, we adapt the original formulation by i) taking the +expectation over the most probable classes, and ii) constructing a binary +classification task, leading to an alternative likelihood for gradient +computations. Consequently, this allows the efficient use of BAIT on +large-scale datasets, including ImageNet. Our unified and comprehensive +evaluation across a variety of datasets demonstrates that our approximations +achieve strong performance with considerably reduced time complexity. +Furthermore, we provide an extensive open-source toolbox that implements recent +state-of-the-art AL strategies, available at +https://github.com/dhuseljic/dal-toolbox. + +
+
+ comment: Accepted at ECML PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Incorporating Unlabelled Data into Bayesian Neural Networks + + +
+ Conventional Bayesian Neural Networks (BNNs) are unable to leverage +unlabelled data to improve their predictions. To overcome this limitation, we +introduce Self-Supervised Bayesian Neural Networks, which use unlabelled data +to learn models with suitable prior predictive distributions. This is achieved +by leveraging contrastive pretraining techniques and optimising a variational +lower bound. We then show that the prior predictive distributions of +self-supervised BNNs capture problem semantics better than conventional BNN +priors. In turn, our approach offers improved predictive performance over +conventional BNNs, especially in low-budget regimes. + +
+
+ comment: Published in the Transactions on Machine Learning Research +
+
+
+
+
+ + ♻ ☆ EUvsDisinfo: A Dataset for Multilingual Detection of Pro-Kremlin + Disinformation in News Articles CIKM 2024 + + +
+ This work introduces EUvsDisinfo, a multilingual dataset of disinformation +articles originating from pro-Kremlin outlets, along with trustworthy articles +from credible / less biased sources. It is sourced directly from the debunk +articles written by experts leading the EUvsDisinfo project. Our dataset is the +largest to-date resource in terms of the overall number of articles and +distinct languages. It also provides the largest topical and temporal coverage. +Using this dataset, we investigate the dissemination of pro-Kremlin +disinformation across different languages, uncovering language-specific +patterns targeting certain disinformation topics. We further analyse the +evolution of topic distribution over an eight-year period, noting a significant +surge in disinformation content before the full-scale invasion of Ukraine in +2022. Lastly, we demonstrate the dataset's applicability in training models to +effectively distinguish between disinformation and trustworthy content in +multilingual settings. + +
+
+ comment: Published at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Jailbreak Attacks and Defenses Against Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have performed exceptionally in various +text-generative tasks, including question answering, translation, code +completion, etc. However, the over-assistance of LLMs has raised the challenge +of "jailbreaking", which induces the model to generate malicious responses +against the usage policy and society by designing adversarial prompts. With the +emergence of jailbreak attack methods exploiting different vulnerabilities in +LLMs, the corresponding safety alignment measures are also evolving. In this +paper, we propose a comprehensive and detailed taxonomy of jailbreak attack and +defense methods. For instance, the attack methods are divided into black-box +and white-box attacks based on the transparency of the target model. Meanwhile, +we classify defense methods into prompt-level and model-level defenses. +Additionally, we further subdivide these attack and defense methods into +distinct sub-classes and present a coherent diagram illustrating their +relationships. We also conduct an investigation into the current evaluation +methods and compare them from different perspectives. Our findings aim to +inspire future research and practical implementations in safeguarding LLMs +against adversarial attacks. Above all, although jailbreak remains a +significant concern within the community, we believe that our work enhances the +understanding of this domain and provides a foundation for developing more +secure LLMs. + +
+
+
+
+
+ + ♻ ☆ Robust Statistical Scaling of Outlier Scores: Improving the Quality of + Outlier Probabilities for Outliers (Extended Version) + + +
+ Outlier detection algorithms typically assign an outlier score to each +observation in a dataset, indicating the degree to which an observation is an +outlier. However, these scores are often not comparable across algorithms and +can be difficult for humans to interpret. Statistical scaling addresses this +problem by transforming outlier scores into outlier probabilities without using +ground-truth labels, thereby improving interpretability and comparability +across algorithms. However, the quality of this transformation can be different +for outliers and inliers. Missing outliers in scenarios where they are of +particular interest - such as healthcare, finance, or engineering - can be +costly or dangerous. Thus, ensuring good probabilities for outliers is +essential. This paper argues that statistical scaling, as commonly used in the +literature, does not produce equally good probabilities for outliers as for +inliers. Therefore, we propose robust statistical scaling, which uses robust +estimators to improve the probabilities for outliers. We evaluate several +variants of our method against other outlier score transformations for +real-world datasets and outlier detection algorithms, where it can improve the +probabilities for outliers. + +
+
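+ As a sketch of the underlying idea, statistical scaling can be written as a
+Gaussian CDF applied to standardized outlier scores; the robust variant swaps
+the mean and standard deviation for the median and MAD so that extreme scores
+do not distort the fit. The specific robust estimators studied in the paper
+may differ from this simple choice.
+```python
+import numpy as np
+from scipy.stats import norm
+
+def statistical_scaling(scores, robust=False):
+    """Map raw outlier scores to outlier probabilities via a Gaussian CDF."""
+    scores = np.asarray(scores, dtype=float)
+    if robust:
+        loc = np.median(scores)
+        scale = 1.4826 * np.median(np.abs(scores - loc))   # MAD, consistency-scaled
+    else:
+        loc, scale = scores.mean(), scores.std()
+    return norm.cdf((scores - loc) / scale)
+
+scores = np.array([0.8, 1.0, 1.1, 0.9, 1.2, 5.0])   # last point is a clear outlier
+print(statistical_scaling(scores, robust=False).round(3))
+print(statistical_scaling(scores, robust=True).round(3))   # robust fit gives the outlier probability ~1.0
+```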
+ comment: 15 pages, 4 figures, extended version of an original article accepted + for publication in SISAP 2024 by Springer Nature +
+
+
+
+
+ + ♻ ☆ Lamarr: LHCb ultra-fast simulation based on machine learning models + deployed within Gauss + + +
+ About 90% of the computing resources available to the LHCb experiment has +been spent to produce simulated data samples for Run 2 of the Large Hadron +Collider at CERN. The upgraded LHCb detector will be able to collect larger +data samples, requiring many more simulated events to analyze the data to be +collected in Run 3. Simulation is a key necessity of analysis to interpret +signal, reject background and measure efficiencies. The needed simulation will +far exceed the pledged resources, requiring an evolution in technologies and +techniques to produce these simulated data samples. In this contribution, we +discuss Lamarr, a Gaudi-based framework to speed-up the simulation production +parameterizing both the detector response and the reconstruction algorithms of +the LHCb experiment. Deep Generative Models powered by several algorithms and +strategies are employed to effectively parameterize the high-level response of +the single components of the LHCb detector, encoding within neural networks the +experimental errors and uncertainties introduced in the detection and +reconstruction phases. Where possible, models are trained directly on real +data, statistically subtracting any background components by applying +appropriate reweighing procedures. Embedding Lamarr in the general LHCb Gauss +Simulation framework allows to combine its execution with any of the available +generators in a seamless way. The resulting software package enables a +simulation process independent of the detailed simulation used to date. + +
+
+ comment: To be published in Journal of Physics: Conference Series (ACAT 2022) +
+
+
+
+
+ + ♻ ☆ Non-Homophilic Graph Pre-Training and Prompt Learning + + +
+ Graphs are ubiquitous for modeling complex relationships between objects +across various fields. Graph neural networks (GNNs) have become a mainstream +technique for graph-based applications, but their performance heavily relies on +abundant labeled data. To reduce labeling requirement, pre-training and prompt +learning has become a popular alternative. However, most existing prompt +methods do not differentiate homophilic and heterophilic characteristics of +real-world graphs. In particular, many real-world graphs are non-homophilic, +not strictly or uniformly homophilic with mixing homophilic and heterophilic +patterns, exhibiting varying non-homophilic characteristics across graphs and +nodes. In this paper, we propose ProNoG, a novel pre-training and prompt +learning framework for such non-homophilic graphs. First, we analyze existing +graph pre-training methods, providing theoretical insights into the choice of +pre-training tasks. Second, recognizing that each node exhibits unique +non-homophilic characteristics, we propose a conditional network to +characterize the node-specific patterns in downstream tasks. Finally, we +thoroughly evaluate and analyze ProNoG through extensive experiments on ten +public datasets. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ DiffLoad: Uncertainty Quantification in Electrical Load Forecasting with + Diffusion Model + + +
+ Electrical load forecasting plays a crucial role in decision-making for power
+ systems, including unit commitment and economic dispatch. The integration of
+ renewable energy sources and the occurrence of external events, such as the
+ COVID-19 pandemic, have rapidly increased uncertainties in load forecasting.
+ The uncertainties in load forecasting can be divided into two types: epistemic
+ uncertainty and aleatoric uncertainty. Separating these types of uncertainties
+ can help decision-makers better understand where the uncertainty lies and how
+ large it is, thereby enhancing their confidence in subsequent decision-making.
+ This paper proposes a diffusion-based Seq2Seq structure to estimate epistemic
+ uncertainty and employs the robust additive Cauchy distribution to estimate
+ aleatoric uncertainty. Our method not only ensures the accuracy of load
+ forecasting but also demonstrates the ability to separate the two types of
+ uncertainties and to remain applicable to different load levels. The relevant
+ code can be found at \url{https://anonymous.4open.science/r/DiffLoad-4714/}.
+
+
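+ The aleatoric part of the method rests on the heavy-tailed Cauchy likelihood; a
+ minimal PyTorch sketch of a Cauchy negative log-likelihood is shown below (the
+ diffusion-based Seq2Seq network that would predict loc and log_scale is not
+ reproduced, and the function names are illustrative):
+
+     import torch
+
+     def cauchy_nll(target, loc, log_scale):
+         # Cauchy NLL: log(pi * scale) + log(1 + ((x - loc)/scale)^2).
+         # Its heavy tails penalize large load deviations far less than a Gaussian.
+         scale = torch.exp(log_scale)
+         return (torch.log(torch.pi * scale)
+                 + torch.log1p(((target - loc) / scale) ** 2)).mean()
+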
+
+ comment: Accepted by IEEE Transactions on Power Systems, 2024 +
+
+
+
+
+ + ♻ ☆ Hyperparameter Optimization as a Service on INFN Cloud + + +
+ The simplest and often most effective way of parallelizing the training of
+ complex machine learning models is to execute several training instances on
+ multiple machines, scanning the hyperparameter space to optimize the underlying
+ statistical model and the learning procedure. Often, such a meta-learning
+ procedure is limited by the ability to securely access a common database that
+ organizes the knowledge of previous and ongoing trials. Exploiting
+ opportunistic GPUs provided in different environments represents a further
+ challenge when designing such optimization campaigns. In this contribution, we
+ discuss how a set of REST APIs can be used to access a dedicated service based
+ on INFN Cloud to monitor and coordinate multiple training instances, with
+ gradient-less optimization techniques, via simple HTTP requests. The service,
+ called Hopaas (Hyperparameter OPtimization As A Service), consists of a web
+ interface and sets of APIs implemented with a FastAPI backend running through
+ Uvicorn and NGINX in a virtual instance of INFN Cloud. The optimization
+ algorithms are currently based on Bayesian techniques as provided by Optuna. A
+ Python frontend is also made available for quick prototyping. We present
+ applications to hyperparameter optimization campaigns performed by combining
+ private, INFN Cloud, and CINECA resources. Such multi-node multi-site
+ optimization studies have given a significant boost to the development of a set
+ of parameterizations for the ultra-fast simulation of the LHCb experiment.
+
+
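+ Hopaas is accessed over HTTP, but the ask/tell optimization loop it coordinates is
+ the one Optuna provides; a purely local sketch of that loop follows (the objective
+ below is a toy placeholder, not one of the LHCb parameterization studies):
+
+     import optuna
+
+     def objective(trial):
+         # Stand-in for a training run scored by a validation metric.
+         lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
+         n_layers = trial.suggest_int("n_layers", 1, 6)
+         return (lr - 1e-3) ** 2 + 0.01 * n_layers
+
+     study = optuna.create_study(direction="minimize")  # TPE (Bayesian) sampler by default
+     study.optimize(objective, n_trials=50)
+     print(study.best_params)
+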
+
+ comment: To be published in Journal of Physics: Conference Series (ACAT 2022) +
+
+
+
+
+ + ♻ ☆ On the Causal Sufficiency and Necessity of Multi-Modal Representation + Learning + + +
+ An effective paradigm of multi-modal learning (MML) is to learn unified
+ representations among modalities. From a causal perspective, constraining the
+ consistency between different modalities can mine causal representations that
+ convey primary events. However, such simple consistency may face the risk of
+ learning insufficient or unnecessary information: a necessary but insufficient
+ cause is invariant across modalities but may not have the required accuracy; a
+ sufficient but unnecessary cause tends to adapt well to specific modalities but
+ may be hard to adapt to new data. To address this issue, in this paper, we aim
+ to learn representations that are both causally sufficient and necessary, i.e.,
+ the Causal Complete Cause ($C^3$), for MML. Firstly, we define the concept of
+ $C^3$ for MML, which reflects the probability that a representation is causally
+ sufficient and necessary. We also study the identifiability of $C^3$ and
+ propose its measurement, the $C^3$ risk, so that the learned representations'
+ $C^3$ scores can be calculated in practice. Then, we theoretically prove the
+ effectiveness of the $C^3$ risk by establishing the performance guarantee of
+ MML with a tight generalization bound. Based on these theoretical results, we
+ propose a plug-and-play method, namely Causal Complete Cause Regularization
+ ($C^3$R), to learn causal complete representations by constraining the $C^3$
+ risk bound. Extensive experiments conducted on various benchmark datasets
+ empirically demonstrate the effectiveness of $C^3$R.
+
+
+
+
+
+
+ + ♻ ☆ Empowering Aggregators with Practical Data-Driven Tools: Harnessing + Aggregated and Disaggregated Flexibility for Demand Response + + +
+ This study explores the interaction between aggregators and building
+ occupants in activating flexibility through Demand Response (DR) programs, with
+ a focus on reinforcing the resilience of the energy system under the
+ uncertainties presented by Renewable Energy Sources (RES). Firstly, it
+ introduces a methodology for optimizing aggregated flexibility provision
+ strategies in environments with limited data, utilizing Discrete Fourier
+ Transformation (DFT) and clustering techniques to identify building occupants'
+ activity patterns. Secondly, the study assesses the disaggregated flexibility
+ provision of Heating, Ventilation and Air Conditioning (HVAC) systems during DR
+ events, employing machine learning and optimization techniques for precise,
+ device-level analysis. The first approach offers a non-intrusive pathway for
+ aggregators to provide flexibility services in environments where a single
+ smart meter covers the whole building's consumption, while the second approach
+ maximizes the amount of flexibility in the case of metering devices dedicated
+ to the HVAC systems by carefully considering building occupants' thermal
+ comfort profiles. Through the application of data-driven techniques and case
+ studies encompassing both industrial and residential buildings, this paper not
+ only unveils pivotal opportunities for aggregators in the balancing and
+ emerging flexibility markets but also develops and demonstrates end-to-end
+ practical tools for aggregators.
+
+
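+ For the first, aggregated approach, the combination of DFT features and clustering
+ can be sketched as follows on whole-building smart-meter data (the feature and
+ cluster counts are illustrative assumptions, not the paper's exact pipeline):
+
+     import numpy as np
+     from sklearn.cluster import KMeans
+
+     def cluster_daily_profiles(daily_loads, n_harmonics=5, n_clusters=4):
+         # daily_loads: (n_days, 24) hourly consumption from one building-level meter.
+         spectra = np.fft.rfft(daily_loads, axis=1)
+         # Low-order Fourier coefficients give a compact description of each day's pattern.
+         feats = np.hstack([spectra[:, :n_harmonics].real, spectra[:, :n_harmonics].imag])
+         return KMeans(n_clusters=n_clusters, n_init=10).fit_predict(feats)
+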
+
+
+
+
+ + ♻ ☆ Markov flow policy -- deep MC + + +
+ Discounted algorithms often encounter evaluation errors due to their reliance
+ on short-term estimations, which can impede their efficacy in addressing
+ simple, short-term tasks and impose undesired temporal discounts (\(\gamma\)).
+ Interestingly, these algorithms are often tested without applying a discount, a
+ phenomenon we refer to as the \textit{train-test bias}. In response to these
+ challenges, we propose the Markov Flow Policy (MFP), which utilizes a
+ non-negative neural network flow to enable comprehensive forward-view
+ predictions. Through integration into the TD7 codebase and evaluation using the
+ MuJoCo benchmark, we observe significant performance improvements, positioning
+ MFP as a straightforward, practical, and easily implementable solution within
+ the domain of average-reward algorithms.
+
+
+
+ comment: Paper has not been finished
+
+
+
+
+ + ♻ ☆ Solving Collaborative Dec-POMDPs with Deep Reinforcement Learning + Heuristics + + +
+ WQMIX, QMIX, QTRAN, and VDN are SOTA algorithms for Dec-POMDPs, yet none of
+ them can solve domains that require complex cooperation among agents. We give
+ an algorithm, SA2MA, to solve such problems. In the first stage, we solve a
+ single-agent problem and obtain a policy. In the second stage, we solve the
+ multi-agent problem using that single-agent policy. SA2MA has a clear advantage
+ over all competitors in complex cooperative domains.
+
+
+
+ comment: Paper has not been finished
+
+
+
+
+ + ♻ ☆ FedAgg: Adaptive Federated Learning with Aggregated Gradients + + +
+ Federated Learning (FL) has emerged as a crucial distributed training +paradigm, enabling discrete devices to collaboratively train a shared model +under the coordination of a central server, while leveraging their locally +stored private data. Nonetheless, the +non-independent-and-identically-distributed (Non-IID) data generated on +heterogeneous clients and the incessant information exchange among participants +may significantly impede training efficacy, retard the model convergence rate +and increase the risk of privacy leakage. To alleviate the divergence between +the local and average model parameters and obtain a fast model convergence +rate, we propose an adaptive FEDerated learning algorithm called FedAgg by +refining the conventional stochastic gradient descent (SGD) methodology with an +AGgregated Gradient term at each local training epoch and adaptively adjusting +the learning rate based on a penalty term that quantifies the local model +deviation. To tackle the challenge of information exchange among clients during +local training and design a decentralized adaptive learning rate for each +client, we introduce two mean-field terms to approximate the average local +parameters and gradients over time. Through rigorous theoretical analysis, we +demonstrate the existence and convergence of the mean-field terms and provide a +robust upper bound on the convergence of our proposed algorithm. The extensive +experimental results on real-world datasets substantiate the superiority of our +framework in comparison with existing state-of-the-art FL strategies for +enhancing model performance and accelerating convergence rate under IID and +Non-IID datasets. + +
+
+
+
+
+ + ♻ ☆ Training Neural Networks on Data Sources with Unknown Reliability + + +
+ When data is generated by multiple sources, conventional training methods
+ update models assuming equal reliability for each source and do not consider
+ their individual data quality during training. However, in many applications,
+ sources have varied levels of reliability that can have negative effects on the
+ performance of a neural network. A key issue is that the quality of the data
+ from individual sources is often not known during training. Focusing on
+ supervised learning, this work presents a solution that trains neural networks
+ on each data source for a number of steps proportional to the source's
+ estimated relative reliability. This way, we allow training on all sources
+ during the warm-up and reduce learning on less reliable sources during the
+ final training stages, when models have been shown to overfit to noise. We show
+ through diverse experiments that this can significantly improve model
+ performance when training on mixtures of reliable and unreliable data sources,
+ and maintain performance when models are trained on reliable sources only.
+
+
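+ Once per-source reliabilities have been estimated (the estimation procedure is the
+ paper's contribution and is not shown), the allocation rule reduces to a proportional
+ split of the training budget; a sketch under that assumption:
+
+     import numpy as np
+
+     def steps_per_source(estimated_reliability, total_steps):
+         # Spend training steps on each source in proportion to its estimated relative
+         # reliability, so less reliable sources fade out of the final training stages.
+         w = np.asarray(estimated_reliability, dtype=float)
+         w = w / w.sum()
+         return np.rint(w * total_steps).astype(int)
+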
+
+
+
+
+ + ♻ ☆ Mending of Spatio-Temporal Dependencies in Block Adjacency Matrix ICONIP 2024 + + +
+ In the realm of applications where data dynamically evolves across spatial +and temporal dimensions, Graph Neural Networks (GNNs) are often complemented by +sequence modeling architectures, such as RNNs and transformers, to effectively +model temporal changes. These hybrid models typically arrange the spatial and +temporal learning components in series. A pioneering effort to jointly model +the spatio-temporal dependencies using only GNNs was the introduction of the +Block Adjacency Matrix \(\mathbf{A_B}\) \cite{1}, which was constructed by +diagonally concatenating adjacency matrices from graphs at different time +steps. This approach resulted in a single graph encompassing complete +spatio-temporal data; however, the graphs from different time steps remained +disconnected, limiting GNN message-passing to spatially connected nodes only. +Addressing this critical challenge, we propose a novel end-to-end learning +architecture specifically designed to mend the temporal dependencies, resulting +in a well-connected graph. Thus, we provide a framework for the learnable +representation of spatio-temporal data as graphs. Our methodology demonstrates +superior performance on benchmark datasets, such as SurgVisDom and C2D2, +surpassing existing state-of-the-art graph models in terms of accuracy. Our +model also achieves significantly lower computational complexity, having far +fewer parameters than methods reliant on CLIP and 3D CNN architectures. + +
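+ The block adjacency matrix \(\mathbf{A_B}\) that the paper sets out to mend is simply
+ a diagonal concatenation of per-time-step adjacencies; a sketch of that starting point
+ (the learnable mending of the temporal blocks is the paper's contribution and is not
+ shown):
+
+     from scipy.sparse import block_diag
+
+     def block_adjacency(adjacency_per_step):
+         # Diagonally concatenate the adjacency matrices of the graphs at each time step.
+         # The off-diagonal (temporal) blocks are zero, so message passing cannot cross
+         # time, which is exactly the limitation described above.
+         return block_diag(adjacency_per_step, format="csr")
+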
+
+ comment: Accepted at ICONIP 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Weather Predictions: Super-Resolution via Deep Diffusion + Models + + +
+ This study investigates the application of deep-learning diffusion models for +the super-resolution of weather data, a novel approach aimed at enhancing the +spatial resolution and detail of meteorological variables. Leveraging the +capabilities of diffusion models, specifically the SR3 and ResDiff +architectures, we present a methodology for transforming low-resolution weather +data into high-resolution outputs. Our experiments, conducted using the +WeatherBench dataset, focus on the super-resolution of the two-meter +temperature variable, demonstrating the models' ability to generate detailed +and accurate weather maps. The results indicate that the ResDiff model, further +improved by incorporating physics-based modifications, significantly +outperforms traditional SR3 methods in terms of Mean Squared Error (MSE), +Structural Similarity Index (SSIM), and Peak Signal-to-Noise Ratio (PSNR). This +research highlights the potential of diffusion models in meteorological +applications, offering insights into their effectiveness, challenges, and +prospects for future advancements in weather prediction and climate analysis. + +
+
+
+
+
+ + ♻ ☆ On Robust Reinforcement Learning with Lipschitz-Bounded Policy Networks + + +
+ This paper presents a study of robust policy networks in deep reinforcement +learning. We investigate the benefits of policy parameterizations that +naturally satisfy constraints on their Lipschitz bound, analyzing their +empirical performance and robustness on two representative problems: pendulum +swing-up and Atari Pong. We illustrate that policy networks with smaller +Lipschitz bounds are more robust to disturbances, random noise, and targeted +adversarial attacks than unconstrained policies composed of vanilla multi-layer +perceptrons or convolutional neural networks. However, the structure of the +Lipschitz layer is important. We find that the widely-used method of spectral +normalization is too conservative and severely impacts clean performance, +whereas more expressive Lipschitz layers such as the recently-proposed Sandwich +layer can achieve improved robustness without sacrificing clean performance. + +
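+ The conservative baseline mentioned, spectral normalization, can be applied directly
+ in PyTorch; a sketch of a Lipschitz-constrained MLP policy (layer sizes and the
+ pendulum-style 3-dimensional observation are illustrative assumptions):
+
+     import torch.nn as nn
+     from torch.nn.utils.parametrizations import spectral_norm
+
+     # Each spectrally normalized linear layer has spectral norm (and hence Lipschitz
+     # constant) of about 1, so the network's overall bound is easy to control.
+     policy = nn.Sequential(
+         spectral_norm(nn.Linear(3, 64)), nn.ReLU(),
+         spectral_norm(nn.Linear(64, 64)), nn.ReLU(),
+         spectral_norm(nn.Linear(64, 1)), nn.Tanh(),
+     )
+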
+
+
+
+
+ + ♻ ☆ Generative Design of Crystal Structures by Point Cloud Representations + and Diffusion Model + + +
+ Efficiently generating energetically stable crystal structures has long been
+ a challenge in material design, primarily due to the immense number of possible
+ arrangements of atoms in a crystal lattice. To facilitate the discovery of
+ stable materials, we present a framework for the generation of synthesizable
+ materials, leveraging a point cloud representation to encode intricate
+ structural information. At the heart of this framework lies the introduction of
+ a diffusion model as its foundational pillar. To gauge the efficacy of our
+ approach, we employ it to reconstruct input structures from our training
+ datasets, rigorously validating its high reconstruction performance.
+ Furthermore, we demonstrate the profound potential of Point Cloud-Based Crystal
+ Diffusion (PCCD) by generating entirely new materials, emphasizing their
+ synthesizability. Our research stands as a noteworthy contribution to the
+ advancement of materials design and synthesis through the cutting-edge avenue
+ of generative design, rather than conventional substitution- or
+ experience-based discovery.
+
+
+
+ comment: I have submitted to a journal +
+
+
+
+
+ + ♻ ☆ SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding + + +
+ Scientific literature understanding is crucial for extracting targeted
+ information and garnering insights, thereby significantly advancing scientific
+ discovery. Despite the remarkable success of Large Language Models (LLMs), they
+ face challenges in scientific literature understanding, primarily due to (1) a
+ lack of scientific knowledge and (2) unfamiliarity with specialized scientific
+ tasks.
+ To develop an LLM specialized in scientific literature understanding, we
+ propose a hybrid strategy that integrates continual pre-training (CPT) and
+ supervised fine-tuning (SFT), to simultaneously infuse scientific domain
+ knowledge and enhance instruction-following capabilities for domain-specific
+ tasks. In this process, we identify two key challenges: (1) constructing
+ high-quality CPT corpora, and (2) generating diverse SFT instructions. We
+ address these challenges through a meticulous pipeline, including PDF text
+ extraction, parsing content error correction, quality filtering, and synthetic
+ instruction creation. Applying this strategy, we present a suite of LLMs:
+ SciLitLLM, specialized in scientific literature understanding. These models
+ demonstrate promising performance on scientific literature understanding
+ benchmarks.
+ Our contributions are threefold: (1) We present an effective framework that
+ integrates CPT and SFT to adapt LLMs to scientific literature understanding,
+ which can also be easily adapted to other domains. (2) We propose an LLM-based
+ synthesis method to generate diverse and high-quality scientific instructions,
+ resulting in a new instruction set -- SciLitIns -- for supervised fine-tuning
+ in less-represented scientific domains. (3) SciLitLLM achieves promising
+ performance improvements on scientific literature understanding benchmarks.
+
+
+
+
+
+
+ + ♻ ☆ Anomaly Detection in Time Series of EDFA Pump Currents to Monitor + Degeneration Processes using Fuzzy Clustering + + +
+ This article proposes a novel fuzzy clustering based anomaly detection method
+ for pump current time series of EDFA systems. The proposed change detection
+ framework (CDF) strategically combines the advantages of entropy analysis (EA)
+ and principal component analysis (PCA) with fuzzy clustering procedures. In the
+ framework, EA is applied for dynamic feature selection to reduce the feature
+ space and increase computational performance. Furthermore, PCA is utilized to
+ extract features from the raw feature space to enable the generalization
+ capability of the subsequent fuzzy clustering procedures. Three different fuzzy
+ clustering methods, more precisely the fuzzy clustering algorithm, a
+ probabilistic clustering algorithm and a possibilistic clustering algorithm,
+ are evaluated for performance and generalization. Hence, the proposed framework
+ has the innovative feature of detecting changes in pump current time series at
+ an early stage for arbitrary points of operation, compared to state-of-the-art
+ predefined alarms in commercially used EDFAs. Moreover, the approach is
+ implemented and tested using experimental data. In addition, the proposed
+ framework enables further approaches to decentralized predictive maintenance
+ for optical fiber networks.
+
+
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Towards Learning Abductive Reasoning using VSA Distributed + Representations + + +
+ We introduce the Abductive Rule Learner with Context-awareness (ARLC), a +model that solves abstract reasoning tasks based on Learn-VRF. ARLC features a +novel and more broadly applicable training objective for abductive reasoning, +resulting in better interpretability and higher accuracy when solving Raven's +progressive matrices (RPM). ARLC allows both programming domain knowledge and +learning the rules underlying a data distribution. We evaluate ARLC on the +I-RAVEN dataset, showcasing state-of-the-art accuracy across both +in-distribution and out-of-distribution (unseen attribute-rule pairs) tests. +ARLC surpasses neuro-symbolic and connectionist baselines, including large +language models, despite having orders of magnitude fewer parameters. We show +ARLC's robustness to post-programming training by incrementally learning from +examples on top of programmed knowledge, which only improves its performance +and does not result in catastrophic forgetting of the programmed solution. We +validate ARLC's seamless transfer learning from a 2x2 RPM constellation to +unseen constellations. Our code is available at +https://github.com/IBM/abductive-rule-learner-with-context-awareness. + +
+
+ comment: Accepted at the 18th International Conference on Neural-Symbolic + Learning and Reasoning (NeSy) 2024 [Spotlight] +
+
+
+
+
+ + ♻ ☆ Tailoring Adversarial Attacks on Deep Neural Networks for Targeted Class + Manipulation Using DeepFool Algorithm + + +
+ The susceptibility of deep neural networks (DNNs) to adversarial attacks +undermines their reliability across numerous applications, underscoring the +necessity for an in-depth exploration of these vulnerabilities and the +formulation of robust defense strategies. The DeepFool algorithm by +Moosavi-Dezfooli et al. (2016) represents a pivotal step in identifying minimal +perturbations required to induce misclassification of input images. +Nonetheless, its generic methodology falls short in scenarios necessitating +targeted interventions. Additionally, previous research studies have +predominantly concentrated on the success rate of attacks without adequately +addressing the consequential distortion of images, the maintenance of image +quality, or the confidence threshold required for misclassification. To bridge +these gaps, we introduce the Enhanced Targeted DeepFool (ET DeepFool) +algorithm, an evolution of DeepFool that not only facilitates the specification +of desired misclassification targets but also incorporates a configurable +minimum confidence score. Our empirical investigations demonstrate the +superiority of this refined approach in maintaining the integrity of images and +minimizing perturbations across a variety of DNN architectures. Unlike previous +iterations, such as the Targeted DeepFool by Gajjar et al. (2022), our method +grants unparalleled control over the perturbation process, enabling precise +manipulation of model responses. Preliminary outcomes reveal that certain +models, including AlexNet and the advanced Vision Transformer, display +commendable robustness to such manipulations. This discovery of varying levels +of model robustness, as unveiled through our confidence level adjustments, +could have far-reaching implications for the field of image recognition. Our +code will be made public upon acceptance of the paper. + +
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Transformers are Expressive, But Are They Expressive Enough for + Regression? + + +
+ Transformers have become pivotal in Natural Language Processing, +demonstrating remarkable success in applications like Machine Translation and +Summarization. Given their widespread adoption, several works have attempted to +analyze the expressivity of Transformers. Expressivity of a neural network is +the class of functions it can approximate. A neural network is fully expressive +if it can act as a universal function approximator. We attempt to analyze the +same for Transformers. Contrary to existing claims, our findings reveal that +Transformers struggle to reliably approximate smooth functions, relying on +piecewise constant approximations with sizable intervals. The central question +emerges as: ''Are Transformers truly Universal Function Approximators?'' To +address this, we conduct a thorough investigation, providing theoretical +insights and supporting evidence through experiments. Theoretically, we prove +that Transformer Encoders cannot approximate smooth functions. Experimentally, +we complement our theory and show that the full Transformer architecture cannot +approximate smooth functions. By shedding light on these challenges, we +advocate a refined understanding of Transformers' capabilities. Code Link: +https://github.com/swaroop-nath/transformer-expressivity. + +
+
+ comment: 18 pages, 17 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Perceptual Similarity for Measuring Decision-Making Style and Policy + Diversity in Games + + +
+ Defining and measuring decision-making styles, also known as playstyles, is +crucial in gaming, where these styles reflect a broad spectrum of individuality +and diversity. However, finding a universally applicable measure for these +styles poses a challenge. Building on Playstyle Distance, the first +unsupervised metric to measure playstyle similarity based on game screens and +raw actions, we introduce three enhancements to increase accuracy: multiscale +analysis with varied state granularity, a perceptual kernel rooted in +psychology, and the utilization of the intersection-over-union method for +efficient evaluation. These innovations not only advance measurement precision +but also offer insights into human cognition of similarity. Across two racing +games and seven Atari games, our techniques significantly improve the precision +of zero-shot playstyle classification, achieving an accuracy exceeding 90 +percent with fewer than 512 observation-action pairs, which is less than half +an episode of these games. Furthermore, our experiments with 2048 and Go +demonstrate the potential of discrete playstyle measures in puzzle and board +games. We also develop an algorithm for assessing decision-making diversity +using these measures. Our findings improve the measurement of end-to-end game +analysis and the evolution of artificial intelligence for diverse playstyles. + +
+
+ comment: TMLR 08/2024 https://openreview.net/forum?id=30C9AWBW49 +
+
+
+
+
+ + ♻ ☆ Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming + + +
+ Recent advances in language models have achieved significant progress. +GPT-4o, as a new milestone, has enabled real-time conversations with humans, +demonstrating near-human natural fluency. Such human-computer interaction +necessitates models with the capability to perform reasoning directly with the +audio modality and generate output in streaming. However, this remains beyond +the reach of current academic models, as they typically depend on extra TTS +systems for speech synthesis, resulting in undesirable latency. This paper +introduces the Mini-Omni, an audio-based end-to-end conversational model, +capable of real-time speech interaction. To achieve this capability, we propose +a text-instructed speech generation method, along with batch-parallel +strategies during inference to further boost the performance. Our method also +helps to retain the original model's language capabilities with minimal +degradation, enabling other works to establish real-time interaction +capabilities. We call this training method "Any Model Can Talk". We also +introduce the VoiceAssistant-400K dataset to fine-tune models optimized for +speech output. To our best knowledge, Mini-Omni is the first fully end-to-end, +open-source model for real-time speech interaction, offering valuable potential +for future research. + +
+
+ comment: Technical report, work in progress. Demo and code: + https://github.com/gpt-omni/mini-omni +
+
+
+
+
+ + ♻ ☆ Graph Neural Networks in EEG-based Emotion Recognition: A Survey + + +
+ Compared to other modalities, EEG-based emotion recognition can respond more
+ directly to the emotional patterns in the human brain and has therefore become
+ one of the most actively studied tasks in the brain-computer interface field.
+ Since dependencies within brain regions are closely related to emotion, a
+ significant trend is to develop Graph Neural Networks (GNNs) for EEG-based
+ emotion recognition. However, brain region dependencies in emotional EEG have
+ physiological bases that distinguish GNNs in this field from those in other
+ time series fields. Besides, there is neither a comprehensive review nor
+ guidance for constructing GNNs in EEG-based emotion recognition. In this
+ survey, our categorization reveals the commonalities and differences of
+ existing approaches under a unified framework of graph construction. We analyze
+ and categorize methods from three stages in the framework to provide clear
+ guidance on constructing GNNs in EEG-based emotion recognition. In addition, we
+ discuss several open challenges and future directions, such as temporal
+ fully-connected graphs and graph condensation.
+
+
+
+
+
+
+ + ♻ ☆ Advancing Chinese biomedical text mining with community challenges + + +
+ Objective: This study aims to review the recent advances in community
+ challenges for biomedical text mining in China. Methods: We collected
+ information on evaluation tasks released in community challenges of biomedical
+ text mining, including task description, dataset description, data source, task
+ type and related links. A systematic summary and comparative analysis were
+ conducted on various biomedical natural language processing tasks, such as
+ named entity recognition, entity normalization, attribute extraction, relation
+ extraction, event extraction, text classification, text similarity, knowledge
+ graph construction, question answering, text generation, and large language
+ model evaluation. Results: We identified 39 evaluation tasks from 6 community
+ challenges that spanned from 2017 to 2023. Our analysis revealed the diverse
+ range of evaluation task types and data sources in biomedical text mining. We
+ explored the potential clinical applications of these community challenge tasks
+ from a translational biomedical informatics perspective. We compared these
+ tasks with their English counterparts, and discussed their contributions,
+ limitations, lessons and guidelines, while highlighting future directions in
+ the era of large language models. Conclusion: Community challenge evaluation
+ competitions have played a crucial role in promoting technology innovation and
+ fostering interdisciplinary collaboration in the field of biomedical text
+ mining. These challenges provide valuable platforms for researchers to develop
+ state-of-the-art solutions.
+
+
+
+
+
+
+ + ♻ ☆ Model-based RL as a Minimalist Approach to Horizon-Free and Second-Order + Bounds + + +
+ Learning a transition model via Maximum Likelihood Estimation (MLE) followed +by planning inside the learned model is perhaps the most standard and simplest +Model-based Reinforcement Learning (RL) framework. In this work, we show that +such a simple Model-based RL scheme, when equipped with optimistic and +pessimistic planning procedures, achieves strong regret and sample complexity +bounds in online and offline RL settings. Particularly, we demonstrate that +under the conditions where the trajectory-wise reward is normalized between +zero and one and the transition is time-homogenous, it achieves horizon-free +and second-order bounds. Horizon-free means that our bounds have no polynomial +dependence on the horizon of the Markov Decision Process. A second-order bound +is a type of instance-dependent bound that scales with respect to the variances +of the returns of the policies which can be small when the system is nearly +deterministic and (or) the optimal policy has small values. We highlight that +our algorithms are simple, fairly standard, and indeed have been extensively +studied in the RL literature: they learn a model via MLE, build a version space +around the MLE solution, and perform optimistic or pessimistic planning +depending on whether operating in the online or offline mode. These algorithms +do not rely on additional specialized algorithmic designs such as learning +variances and performing variance-weighted learning and thus can leverage rich +function approximations that are significantly beyond linear or tabular +structures. The simplicity of the algorithms also implies that our horizon-free +and second-order regret analysis is actually standard and mainly follows the +general framework of optimism/pessimism in the face of uncertainty. + +
+
+
+
+
+ + ♻ ☆ Towards Graph Prompt Learning: A Survey and Beyond + + +
+ Large-scale "pre-train and prompt learning" paradigms have demonstrated +remarkable adaptability, enabling broad applications across diverse domains +such as question answering, image recognition, and multimodal retrieval. This +approach fully leverages the potential of large-scale pre-trained models, +reducing downstream data requirements and computational costs while enhancing +model applicability across various tasks. Graphs, as versatile data structures +that capture relationships between entities, play pivotal roles in fields such +as social network analysis, recommender systems, and biological graphs. Despite +the success of pre-train and prompt learning paradigms in Natural Language +Processing (NLP) and Computer Vision (CV), their application in graph domains +remains nascent. In graph-structured data, not only do the node and edge +features often have disparate distributions, but the topological structures +also differ significantly. This diversity in graph data can lead to +incompatible patterns or gaps between pre-training and fine-tuning on +downstream graphs. We aim to bridge this gap by summarizing methods for +alleviating these disparities. This includes exploring prompt design +methodologies, comparing related techniques, assessing application scenarios +and datasets, and identifying unresolved problems and challenges. This survey +categorizes over 100 relevant works in this field, summarizing general design +principles and the latest applications, including text-attributed graphs, +molecules, proteins, and recommendation systems. Through this extensive review, +we provide a foundational understanding of graph prompt learning, aiming to +impact not only the graph mining community but also the broader Artificial +General Intelligence (AGI) community. + +
+
+ comment: 19 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Etalon: Holistic Performance Evaluation Framework for LLM Inference + Systems + + +
+ Serving large language models (LLMs) in production can incur substantial +costs, which has prompted recent advances in inference system optimizations. +Today, these systems are evaluated against conventional latency and throughput +metrics (eg. TTFT, TBT, Normalised Latency and TPOT). However, these metrics +fail to fully capture the nuances of LLM inference, leading to an incomplete +assessment of user-facing performance crucial for real-time applications such +as chat and translation. In this paper, we first identify the pitfalls of +current performance metrics in evaluating LLM inference systems. We then +propose Etalon, a comprehensive performance evaluation framework that includes +fluidity-index -- a novel metric designed to reflect the intricacies of the LLM +inference process and its impact on real-time user experience. Finally, we +evaluate various existing open-source platforms and model-as-a-service +offerings using Etalon, discussing their strengths and weaknesses. Etalon is +available at https://github.com/project-etalon/etalon. + +
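+ The conventional metrics the abstract criticizes are easy to compute from per-token
+ arrival times, which helps clarify what they do and do not capture; a sketch (the
+ fluidity-index itself is the paper's contribution and is not reproduced here):
+
+     def ttft_and_tbt(request_time, token_arrival_times):
+         # Time-to-first-token and the list of time-between-tokens gaps.
+         ttft = token_arrival_times[0] - request_time
+         tbt = [b - a for a, b in zip(token_arrival_times, token_arrival_times[1:])]
+         return ttft, tbt
+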
+
+
+
+
+ + ♻ ☆ WhiteFox: White-Box Compiler Fuzzing Empowered by Large Language Models + + +
+ Compiler correctness is crucial, as miscompilation can falsify program +behaviors, leading to serious consequences. Fuzzing has been studied to uncover +compiler defects. However, compiler fuzzing remains challenging: Existing arts +focus on black- and grey-box fuzzing, which generates tests without sufficient +understanding of internal compiler behaviors. Meanwhile, traditional white-box +techniques, like symbolic execution, are computationally inapplicable to the +giant codebase of compilers. Recent advances demonstrate that Large Language +Models (LLMs) excel in code generation/understanding tasks. Nonetheless, +guiding LLMs with compiler source-code information remains a missing piece of +research in compiler testing. + To this end, we propose WhiteFox, the first white-box compiler fuzzer using +LLMs with source-code information to test compiler optimization, with a +spotlight on detecting deep logic bugs in the deep learning (DL) compilers. +WhiteFox adopts a multi-agent framework: an LLM-based analysis agent examines +the low-level optimization source code and produces requirements on the +high-level test programs that can trigger the optimization; an LLM-based +generation agent produces test programs based on the summarized requirements. +Additionally, optimization-triggering tests are used as feedback to enhance the +generation on the fly. Our evaluation on the three most popular DL compilers +(i.e., PyTorch Inductor, TensorFlow-XLA, and TensorFlow Lite) shows WhiteFox +can generate high-quality test programs to exercise deep optimizations, +practicing up to 8X more than state-of-the-art fuzzers. WhiteFox has found 101 +bugs for the DL compilers, with 92 confirmed as previously unknown and 70 +fixed. WhiteFox has been acknowledged by the PyTorch team and is being +incorporated into its development workflow. Beyond DL compilers, WhiteFox can +also be adapted for compilers in different domains. + +
+
+ comment: Published in OOPSLA 2024 +
+
+
+
+
+ + ♻ ☆ EEGMatch: Learning with Incomplete Labels for Semi-Supervised EEG-based + Cross-Subject Emotion Recognition + + +
+ Electroencephalography (EEG) is an objective tool for emotion recognition and +shows promising performance. However, the label scarcity problem is a main +challenge in this field, which limits the wide application of EEG-based emotion +recognition. In this paper, we propose a novel semi-supervised learning +framework (EEGMatch) to leverage both labeled and unlabeled EEG data. First, an +EEG-Mixup based data augmentation method is developed to generate more valid +samples for model learning. Second, a semi-supervised two-step pairwise +learning method is proposed to bridge prototype-wise and instance-wise pairwise +learning, where the prototype-wise pairwise learning measures the global +relationship between EEG data and the prototypical representation of each +emotion class and the instance-wise pairwise learning captures the local +intrinsic relationship among EEG data. Third, a semi-supervised multi-domain +adaptation is introduced to align the data representation among multiple +domains (labeled source domain, unlabeled source domain, and target domain), +where the distribution mismatch is alleviated. Extensive experiments are +conducted on two benchmark databases (SEED and SEED-IV) under a cross-subject +leave-one-subject-out cross-validation evaluation protocol. The results show +the proposed EEGmatch performs better than the state-of-the-art methods under +different incomplete label conditions (with 6.89% improvement on SEED and 1.44% +improvement on SEED-IV), which demonstrates the effectiveness of the proposed +EEGMatch in dealing with the label scarcity problem in emotion recognition +using EEG signals. The source code is available at +https://github.com/KAZABANA/EEGMatch. + +
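+ A generic mixup step conveys the flavor of the EEG-Mixup augmentation; how EEGMatch
+ selects pairs and handles labels may differ, so this is only an illustrative sketch:
+
+     import numpy as np
+
+     def mixup(x1, y1, x2, y2, alpha=0.5):
+         # Convex combination of two EEG segments and their (one-hot) labels.
+         lam = np.random.beta(alpha, alpha)
+         return lam * x1 + (1.0 - lam) * x2, lam * y1 + (1.0 - lam) * y2
+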
+
+
+
+
+ + ♻ ☆ High Probability Complexity Bounds for Non-Smooth Stochastic + Optimization with Heavy-Tailed Noise + + +
+ Stochastic first-order methods are standard for training large-scale machine +learning models. Random behavior may cause a particular run of an algorithm to +result in a highly suboptimal objective value, whereas theoretical guarantees +are usually proved for the expectation of the objective value. Thus, it is +essential to theoretically guarantee that algorithms provide small objective +residual with high probability. Existing methods for non-smooth stochastic +convex optimization have complexity bounds with the dependence on the +confidence level that is either negative-power or logarithmic but under an +additional assumption of sub-Gaussian (light-tailed) noise distribution that +may not hold in practice. In our paper, we resolve this issue and derive the +first high-probability convergence results with logarithmic dependence on the +confidence level for non-smooth convex stochastic optimization problems with +non-sub-Gaussian (heavy-tailed) noise. To derive our results, we propose novel +stepsize rules for two stochastic methods with gradient clipping. Moreover, our +analysis works for generalized smooth objectives with H\"older-continuous +gradients, and for both methods, we provide an extension for strongly convex +problems. Finally, our results imply that the first (accelerated) method we +consider also has optimal iteration and oracle complexity in all the regimes, +and the second one is optimal in the non-smooth setting. + +
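+ The clipping operation at the heart of both methods is standard; a sketch of one
+ clipped (sub)gradient step (the stepsize rules, which are the paper's contribution,
+ are not shown):
+
+     import numpy as np
+
+     def clipped_step(x, grad, stepsize, clip_level):
+         # Truncate the stochastic (sub)gradient to norm <= clip_level, which tames
+         # heavy-tailed noise, then take a plain descent step.
+         g_norm = np.linalg.norm(grad)
+         if g_norm > clip_level:
+             grad = grad * (clip_level / g_norm)
+         return x - stepsize * grad
+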
+
+ comment: 61 pages, 12 figures. Changes in V2: different presentation of the + results, different structure, new experiments. Changes in V3: some typos were + fixed +
+
+
+
+
+ + ♻ ☆ Dense-Sparse Deep Convolutional Neural Networks Training for Image + Denoising + + +
+ Recently, deep learning methods such as the convolutional neural networks +have gained prominence in the area of image denoising. This is owing to their +proven ability to surpass state-of-the-art classical image denoising algorithms +such as block-matching and 3D filtering algorithm. Deep denoising convolutional +neural networks use many feed-forward convolution layers with added +regularization methods of batch normalization and residual learning to speed up +training and improve denoising performance significantly. However, this comes +at the expense of a huge number of trainable parameters. In this paper, we show +that by employing an enhanced dense-sparse-dense network training procedure to +the deep denoising convolutional neural networks, comparable denoising +performance level can be achieved at a significantly reduced number of +trainable parameters. We derive motivation from the fact that networks trained +using the dense-sparse-dense approach have been shown to attain performance +boost with reduced number of parameters. The proposed reduced deep denoising +convolutional neural networks network is an efficient denoising model with +significantly reduced parameters and comparable performance to the deep +denoising convolutional neural networks. Additionally, denoising was achieved +at significantly reduced processing time. + +
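+ The sparse phase of dense-sparse-dense training hinges on a magnitude-based mask; a
+ PyTorch sketch of that single ingredient (the sparsity level is an illustrative
+ assumption, and the full dense-sparse-dense schedule is not shown):
+
+     import torch
+
+     def magnitude_mask(weight, sparsity=0.5):
+         # Zero out the smallest-magnitude weights; training continues under this mask
+         # before the final re-densification phase restores the pruned connections.
+         k = max(1, int(weight.numel() * sparsity))
+         threshold = weight.abs().flatten().kthvalue(k).values
+         return (weight.abs() > threshold).to(weight.dtype)
+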
+
+
+
+
+ + ♻ ☆ On The Fairness Impacts of Hardware Selection in Machine Learning + + +
+ In the machine learning ecosystem, hardware selection is often regarded as a +mere utility, overshadowed by the spotlight on algorithms and data. This +oversight is particularly problematic in contexts like ML-as-a-service +platforms, where users often lack control over the hardware used for model +deployment. How does the choice of hardware impact generalization properties? +This paper investigates the influence of hardware on the delicate balance +between model performance and fairness. We demonstrate that hardware choices +can exacerbate existing disparities, attributing these discrepancies to +variations in gradient flows and loss surfaces across different demographic +groups. Through both theoretical and empirical analysis, the paper not only +identifies the underlying factors but also proposes an effective strategy for +mitigating hardware-induced performance imbalances. + +
+
+
+
+
+ + ♻ ☆ FlakyFix: Using Large Language Models for Predicting Flaky Test Fix + Categories and Test Code Repair + + +
+ Flaky tests are problematic because they non-deterministically pass or fail +for the same software version under test, causing confusion and wasting +development effort. While machine learning models have been used to predict +flakiness and its root causes, there is much less work on providing support to +fix the problem. To address this gap, in this paper, we focus on predicting the +type of fix that is required to remove flakiness and then repair the test code +on that basis. We do this for a subset of flaky tests where the root cause of +flakiness is in the test itself and not in the production code. One key idea is +to guide the repair process with additional knowledge about the test's +flakiness in the form of its predicted fix category. Thus, we first propose a +framework that automatically generates labeled datasets for 13 fix categories +and trains models to predict the fix category of a flaky test by analyzing the +test code only. Our experimental results using code models and few-shot +learning show that we can correctly predict most of the fix categories. To show +the usefulness of such fix category labels for automatically repairing +flakiness, we augment the prompts of GPT-3.5 Turbo, a Large Language Model +(LLM), with such extra knowledge to request repair suggestions. The results +show that our suggested fix category labels, complemented with in-context +learning, significantly enhance the capability of GPT-3.5 Turbo in generating +fixes for flaky tests. Based on the execution and analysis of a sample of +GPT-repaired flaky tests, we estimate that a large percentage of such repairs +(roughly between 51% and 83%) can be expected to pass. For the failing repaired +tests, on average, 16% of the test code needs to be further changed for them to +pass. + +
+
+ comment: 26 pages, 20 Figures +
+
+
+
+
+ + ♻ ☆ Rasa: Building Expressive Speech Synthesis Systems for Indian Languages + in Low-resource Settings INTERSPEECH 2024 + + +
+ We release Rasa, the first multilingual expressive TTS dataset for any Indian +language, which contains 10 hours of neutral speech and 1-3 hours of expressive +speech for each of the 6 Ekman emotions covering 3 languages: Assamese, +Bengali, & Tamil. Our ablation studies reveal that just 1 hour of neutral and +30 minutes of expressive data can yield a Fair system as indicated by MUSHRA +scores. Increasing neutral data to 10 hours, with minimal expressive data, +significantly enhances expressiveness. This offers a practical recipe for +resource-constrained languages, prioritizing easily obtainable neutral data +alongside smaller amounts of expressive data. We show the importance of +syllabically balanced data and pooling emotions to enhance expressiveness. We +also highlight challenges in generating specific emotions, e.g., fear and +surprise. + +
+
+ comment: Accepted at INTERSPEECH 2024. First two authors listed contributed + equally +
+
+
+
+
+ + ♻ ☆ Nonsmooth Projection-Free Optimization with Functional Constraints + + +
+ This paper presents a subgradient-based algorithm for constrained nonsmooth +convex optimization that does not require projections onto the feasible set. +While the well-established Frank-Wolfe algorithm and its variants already avoid +projections, they are primarily designed for smooth objective functions. In +contrast, our proposed algorithm can handle nonsmooth problems with general +convex functional inequality constraints. It achieves an $\epsilon$-suboptimal +solution in $\mathcal{O}(\epsilon^{-2})$ iterations, with each iteration +requiring only a single (potentially inexact) Linear Minimization Oracle (LMO) +call and a (possibly inexact) subgradient computation. This performance is +consistent with existing lower bounds. Similar performance is observed when +deterministic subgradients are replaced with stochastic subgradients. In the +special case where there are no functional inequality constraints, our +algorithm competes favorably with a recent nonsmooth projection-free method +designed for constraint-free problems. Our approach utilizes a simple +separation scheme in conjunction with a new Lagrange multiplier update rule. + +
+
+
+
+
+ + ♻ ☆ Causality for Earth Science -- A Review on Time-series and + Spatiotemporal Causality Methods + + +
+ This survey paper covers the breadth and depth of time-series and +spatiotemporal causality methods, and their applications in Earth Science. More +specifically, the paper presents an overview of causal discovery and causal +inference, explains the underlying causal assumptions, and enlists evaluation +techniques and key terminologies of the domain area. The paper elicits the +various state-of-the-art methods introduced for time-series and spatiotemporal +causal analysis along with their strengths and limitations. The paper further +describes the existing applications of several methods for answering specific +Earth Science questions such as extreme weather events, sea level rise, +teleconnections etc. This survey paper can serve as a primer for Data Science +researchers interested in data-driven causal study as we share a list of +resources, such as Earth Science datasets (synthetic, simulated and +observational data) and open source tools for causal analysis. It will equally +benefit the Earth Science community interested in taking an AI-driven approach +to study the causality of different dynamic and thermodynamic processes as we +present the open challenges and opportunities in performing causality-based +Earth Science study. + +
+
+
+
+
+ + ♻ ☆ PSO Fuzzy XGBoost Classifier Boosted with Neural Gas Features on EEG + Signals in Emotion Recognition + + +
+ Emotion recognition is the technology-driven process of identifying and +categorizing human emotions from various data sources, such as facial +expressions, voice patterns, body motion, and physiological signals, such as +EEG. These physiological indicators, though rich in data, present challenges +due to their complexity and variability, necessitating sophisticated feature +selection and extraction methods. NGN, an unsupervised learning algorithm, +effectively adapts to input spaces without predefined grid structures, +improving feature extraction from physiological data. Furthermore, the +incorporation of fuzzy logic enables the handling of fuzzy data by introducing +reasoning that mimics human decision-making. The combination of PSO with +XGBoost aids in optimizing model performance through efficient hyperparameter +tuning and decision process optimization. This study explores the integration +of Neural-Gas Network (NGN), XGBoost, Particle Swarm Optimization (PSO), and +fuzzy logic to enhance emotion recognition using physiological signals. Our +research addresses three critical questions concerning the improvement of +XGBoost with PSO and fuzzy logic, NGN's effectiveness in feature selection, and +the performance comparison of the PSO-fuzzy XGBoost classifier with standard +benchmarks. Acquired results indicate that our methodologies enhance the +accuracy of emotion recognition systems and outperform other feature selection +techniques using the majority of classifiers, offering significant implications +for both theoretical advancement and practical application in emotion +recognition technology. + +
+
+ comment: PSO, Fuzzy, XGBoost, Neural Gas Network (NGN), Feature Selection, EEG + Signals, Emotion Recognition +
+
+
+
+
+ + ♻ ☆ Hardware-Assisted Virtualization of Neural Processing Units for Cloud + Platforms MICRO'24 + + +
+ Cloud platforms today have been deploying hardware accelerators like neural +processing units (NPUs) for powering machine learning (ML) inference services. +To maximize the resource utilization while ensuring reasonable quality of +service, a natural approach is to virtualize NPUs for efficient resource +sharing for multi-tenant ML services. However, virtualizing NPUs for modern +cloud platforms is not easy. This is not only due to the lack of system +abstraction support for NPU hardware, but also due to the lack of architectural +and ISA support for enabling fine-grained dynamic operator scheduling for +virtualized NPUs. + We present Neu10, a holistic NPU virtualization framework. We investigate +virtualization techniques for NPUs across the entire software and hardware +stack. Neu10 consists of (1) a flexible NPU abstraction called vNPU, which +enables fine-grained virtualization of the heterogeneous compute units in a +physical NPU (pNPU); (2) a vNPU resource allocator that enables pay-as-you-go +computing model and flexible vNPU-to-pNPU mappings for improved resource +utilization and cost-effectiveness; (3) an ISA extension of modern NPU +architecture for facilitating fine-grained tensor operator scheduling for +multiple vNPUs. We implement Neu10 based on a production-level NPU simulator. +Our experiments show that Neu10 improves the throughput of ML inference +services by up to 1.4$\times$ and reduces the tail latency by up to +4.6$\times$, while improving the NPU utilization by 1.2$\times$ on average, +compared to state-of-the-art NPU sharing approaches. + +
+
+ comment: Accepted to MICRO'24 +
+
+
+
+
+ + ♻ ☆ LLaVaOLMoBitnet1B: Ternary LLM goes Multimodal! + + +
+ Multimodal Large Language Models (MM-LLMs) have seen significant advancements
+ in the last year, demonstrating impressive performance across tasks. However,
+ to truly democratize AI, models must exhibit strong capabilities and be able to
+ run efficiently on the small compute footprints accessible to most. As part of
+ this quest, we introduce LLaVaOLMoBitnet1B - the first Ternary Multimodal LLM
+ capable of accepting Image(s)+Text inputs to produce coherent textual
+ responses. The model is fully open-sourced along with training scripts to
+ encourage further research in this space. This accompanying technical report
+ highlights the training process, evaluation details, challenges associated with
+ ternary models, and future opportunities. Link to the model:
+ https://huggingface.co/IntelLabs/LlavaOLMoBitnet1B
+
+
+
+
+
+
+ + ♻ ☆ Scaling Laws for Data Poisoning in LLMs + + +
+ Recent work shows that LLMs are vulnerable to data poisoning, in which they +are trained on partially corrupted or harmful data. Poisoned data is hard to +detect, breaks guardrails, and leads to undesirable and harmful behavior. Given +the intense efforts by leading labs to train and deploy increasingly larger and +more capable LLMs, it is critical to ask if the risk of data poisoning will be +naturally mitigated by scale, or if it is an increasing threat. We consider +three threat models by which data poisoning can occur: malicious fine-tuning, +imperfect data curation, and intentional data contamination. Our experiments +evaluate the effects of data poisoning on 23 frontier LLMs ranging from 1.5-72 +billion parameters on three datasets which speak to each of our threat models. +We find that larger LLMs are increasingly vulnerable, learning harmful behavior +significantly more quickly than smaller LLMs with even minimal data poisoning. +These results underscore the need for robust safeguards against data poisoning +in larger LLMs. + +
+
+
+
+
+ + ♻ ☆ Criticality Leveraged Adversarial Training (CLAT) for Boosted + Performance via Parameter Efficiency + + +
+ Adversarial training enhances neural network robustness but suffers from a +tendency to overfit and increased generalization errors on clean data. This +work introduces CLAT, an innovative approach that mitigates adversarial +overfitting by introducing parameter efficiency into the adversarial training +process, improving both clean accuracy and adversarial robustness. Instead of +tuning the entire model, CLAT identifies and fine-tunes robustness-critical +layers - those predominantly learning non-robust features - while freezing the +remaining model to enhance robustness. It employs dynamic critical layer +selection to adapt to changes in layer criticality throughout the fine-tuning +process. Empirically, CLAT can be applied on top of existing adversarial +training methods, significantly reduces the number of trainable parameters by +approximately 95%, and achieves more than a 2% improvement in adversarial +robustness compared to baseline methods. + +
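+ The freezing step that makes the approach parameter-efficient is simple to express;
+ how the robustness-critical layers are selected is the paper's dynamic procedure and
+ is not shown, so the list of layer-name prefixes below is assumed to be given:
+
+     import torch.nn as nn
+
+     def finetune_only(model: nn.Module, critical_prefixes):
+         # Keep only the robustness-critical layers trainable and freeze the rest,
+         # which is where the large reduction in trainable parameters comes from.
+         for name, param in model.named_parameters():
+             param.requires_grad = any(name.startswith(p) for p in critical_prefixes)
+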
+
+ comment: 9 pages + appendix/ additional experiments +
+
+
+
+
+ + ♻ ☆ Mix-Domain Contrastive Learning for Unpaired H&E-to-IHC Stain + Translation + + +
+ H&E-to-IHC stain translation techniques offer a promising solution for +precise cancer diagnosis, especially in low-resource regions where there is a +shortage of health professionals and limited access to expensive equipment. +Considering the pixel-level misalignment of H&E-IHC image pairs, current +research explores the pathological consistency between patches from the same +positions of the image pair. However, most of them overemphasize the +correspondence between domains or patches, overlooking the side information +provided by the non-corresponding objects. In this paper, we propose a +Mix-Domain Contrastive Learning (MDCL) method to leverage the supervision +information in unpaired H&E-to-IHC stain translation. Specifically, the +proposed MDCL method aggregates the inter-domain and intra-domain pathology +information by estimating the correlation between the anchor patch and all the +patches from the matching images, encouraging the network to learn additional +contrastive knowledge from mixed domains. With the mix-domain pathology +information aggregation, MDCL enhances the pathological consistency between the +corresponding patches and the component discrepancy of the patches from the +different positions of the generated IHC image. Extensive experiments on two +H&E-to-IHC stain translation datasets, namely MIST and BCI, demonstrate that +the proposed method achieves state-of-the-art performance across multiple +metrics. + +
+
+
+
+
+ + ♻ ☆ FRACTAL: An Ultra-Large-Scale Aerial Lidar Dataset for 3D Semantic + Segmentation of Diverse Landscapes + + +
+ Mapping agencies are increasingly adopting Aerial Lidar Scanning (ALS) as a +new tool to map buildings and other above-ground structures. Processing ALS +data at scale requires efficient point classification methods that perform well +over highly diverse territories. Large annotated Lidar datasets are needed to +evaluate these classification methods, however, current Lidar benchmarks have +restricted scope and often cover a single urban area. To bridge this data gap, +we introduce the FRench ALS Clouds from TArgeted Landscapes (FRACTAL) dataset: +an ultra-large-scale aerial Lidar dataset made of 100,000 dense point clouds +with high quality labels for 7 semantic classes and spanning 250 km$^2$. +FRACTAL achieves high spatial and semantic diversity by explicitly sampling +rare classes and challenging landscapes from five different regions of France. +We describe the data collection, annotation, and curation process of the +dataset. We provide baseline semantic segmentation results using a state of the +art 3D point cloud classification model. FRACTAL aims to support the +development of 3D deep learning approaches for large-scale land monitoring. + +
+
comment: 9 (body) + 2 (bibliography) + 8 (appendices) pages | Dataset is + available at https://huggingface.co/datasets/IGNF/FRACTAL | Trained model is + available at https://huggingface.co/IGNF/FRACTAL-LidarHD_7cl_randlanet | Deep + learning code repository is on Github at https://github.com/IGNF/myria3d | + Data engineering code repository is on Github at + https://github.com/IGNF/pacasam
+
+
+
+
+ + ♻ ☆ Matryoshka Diffusion Models ICLR2024 + + +
+ Diffusion models are the de facto approach for generating high-quality images +and videos, but learning high-dimensional models remains a formidable task due +to computational and optimization challenges. Existing methods often resort to +training cascaded models in pixel space or using a downsampled latent space of +a separately trained auto-encoder. In this paper, we introduce Matryoshka +Diffusion Models (MDM), an end-to-end framework for high-resolution image and +video synthesis. We propose a diffusion process that denoises inputs at +multiple resolutions jointly and uses a NestedUNet architecture where features +and parameters for small-scale inputs are nested within those of large scales. +In addition, MDM enables a progressive training schedule from lower to higher +resolutions, which leads to significant improvements in optimization for +high-resolution generation. We demonstrate the effectiveness of our approach on +various benchmarks, including class-conditioned image generation, +high-resolution text-to-image, and text-to-video applications. Remarkably, we +can train a single pixel-space model at resolutions of up to 1024x1024 pixels, +demonstrating strong zero-shot generalization using the CC12M dataset, which +contains only 12 million images. Our code is released at +https://github.com/apple/ml-mdm +
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality + Assessment Model + + +
+ Recent advancements in the field of No-Reference Image Quality Assessment +(NR-IQA) using deep learning techniques demonstrate high performance across +multiple open-source datasets. However, such models are typically very large +and complex, making them less suitable for real-world deployment, especially +on resource- and battery-constrained mobile devices. To address this +limitation, we propose a compact, lightweight NR-IQA model that achieves +state-of-the-art (SOTA) performance on ECCV AIM UHD-IQA challenge validation +and test datasets while also being nearly 5.7 times faster than the fastest +SOTA model. Our model features a dual-branch architecture, with each branch +separately trained on synthetically and authentically distorted images, which +enhances the model's generalizability across different distortion types. To +improve robustness under diverse real-world visual conditions, we additionally +incorporate multiple color spaces during the training process. We also +demonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks +(KANs) for final quality regression as compared to the conventional Multi-Layer +Perceptrons (MLPs). Our evaluation considering various open-source datasets +highlights the practical, high-accuracy, and robust performance of our proposed +lightweight model. Code: https://github.com/nasimjamshidi/LAR-IQA. +
+
+
+
+
+ + ☆ Video to Music Moment Retrieval + + +
+ Adding proper background music helps make a short video ready to be shared. +Towards automating the task, previous research focuses on video-to-music +retrieval (VMR), aiming to find amidst a collection of music the one best +matching the content of a given video. Since music tracks are typically much +longer than short videos, meaning the returned music has to be cut to a shorter +moment, there is a clear gap between the practical need and VMR. In order to +bridge the gap, we propose in this paper video to music moment retrieval (VMMR) +as a new task. To tackle the new task, we build a comprehensive dataset, +Ad-Moment, which contains 50K short videos annotated with music moments, and +develop a two-stage approach. In particular, given a test video, the most +similar music is retrieved from a given collection. Then, a Transformer-based +music moment localization is performed. We term this approach Retrieval and +Localization (ReaL). Extensive experiments on real-world datasets verify the +effectiveness of the proposed method for VMMR. +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 78 + +
+
+
+ + ☆ SAM2Point: Segment Any 3D as Videos in Zero-shot and Promptable Manners + + +
+ We introduce SAM2Point, a preliminary exploration adapting Segment Anything +Model 2 (SAM 2) for zero-shot and promptable 3D segmentation. SAM2Point +interprets any 3D data as a series of multi-directional videos, and leverages +SAM 2 for 3D-space segmentation, without further training or 2D-3D projection. +Our framework supports various prompt types, including 3D points, boxes, and +masks, and can generalize across diverse scenarios, such as 3D objects, indoor +scenes, outdoor environments, and raw sparse LiDAR. Demonstrations on multiple +3D datasets, e.g., Objaverse, S3DIS, ScanNet, Semantic3D, and KITTI, highlight +the robust generalization capabilities of SAM2Point. To our best knowledge, we +present the most faithful implementation of SAM in 3D, which may serve as a +starting point for future research in promptable 3D segmentation. Online Demo: +https://huggingface.co/spaces/ZiyuG/SAM2Point . Code: +https://github.com/ZiyuGuo99/SAM2Point . + +
+
+ comment: Work in progress. Online Demo: + https://huggingface.co/spaces/ZiyuG/SAM2Point . Code: + https://github.com/ZiyuGuo99/SAM2Point +
+
+
+
+
+ + ☆ How Far Can Cantonese NLP Go? Benchmarking Cantonese Capabilities of + Large Language Models + + +
+ The rapid evolution of large language models (LLMs) has transformed the +competitive landscape in natural language processing (NLP), particularly for +English and other data-rich languages. However, underrepresented languages like +Cantonese, spoken by over 85 million people, face significant development gaps, +which is particularly concerning given the economic significance of the +Guangdong-Hong Kong-Macau Greater Bay Area and the substantial +Cantonese-speaking populations in places like Singapore and North America. +Despite its wide use, Cantonese has scant representation in NLP research, +especially compared to other languages from similarly developed regions. To +bridge these gaps, we outline current Cantonese NLP methods and introduce new +benchmarks designed to evaluate LLM performance in factual generation, +mathematical logic, complex reasoning, and general knowledge in Cantonese, +which aim to advance open-source Cantonese LLM technology. We also propose +future research directions and recommended models to enhance Cantonese LLM +development. +
+
+
+
+
+ + ☆ Reinforcement Learning without Human Feedback for Last Mile Fine-Tuning + of Large Language Models + + +
+ Reinforcement learning is used to align language models with human preference +signals after first pre-training the model to predict the next token of text +within a large corpus using likelihood maximization. Before being deployed in a +specific domain, models are often further fine-tuned on task specific data. +Since human preferences are often unavailable for the last step, it is +performed using likelihood maximization as that is the typical default method. +However, reinforcement learning has other advantages besides facilitating +alignment to a human derived reward function. For one, whereas likelihood +maximization is a form of imitation learning in which the model is trained on +what to do under ideal conditions, reinforcement learning is not limited to +demonstrating actions just for optimally reached states and trains a model what +to do under a range of scenarios as it explores the policy space. In addition, +it also trains a model what not to do, suppressing competitive but poor +actions. This work develops a framework for last-mile fine-tuning using +reinforcement learning and tests whether it garners performance gains. The +experiments center on abstractive summarization, but the framework is general +and broadly applicable. Use of the procedure produced significantly better +results than likelihood maximization when comparing raw predictions. For the +specific data tested, the gap could be bridged by employing post-processing of +the maximum likelihood outputs. Nonetheless, the framework offers a new avenue +for model optimization in situations where post-processing may be less +straightforward or effective, and it can be extended to include more complex +classes of undesirable outputs to penalize and train against, such as +hallucinations. + +
+
+
+
+
+ + ☆ A Gradient Analysis Framework for Rewarding Good and Penalizing Bad + Examples in Language Models + + +
+ Beyond maximum likelihood estimation (MLE), the standard objective of a +language model (LM) which optimizes the probabilities of good examples, many studies +have explored ways that also penalize bad examples to enhance the quality of the +output distribution, including unlikelihood training, exponential maximizing +average treatment effect (ExMATE), and direct preference optimization (DPO). To +systematically compare these methods and further provide a unified recipe for +LM optimization, in this paper, we present a unique angle of gradient analysis +of loss functions that simultaneously reward good examples and penalize bad +ones in LMs. Through both mathematical results and experiments on +CausalDialogue and Anthropic HH-RLHF datasets, we identify distinct functional +characteristics among these methods. We find that ExMATE serves as a superior +surrogate for MLE, and that combining DPO with ExMATE instead of MLE further +enhances both the statistical (5-7%) and generative (+18% win rate) +performance. +
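For orientation, the gradients of the two most widely known objectives in this family can be written side by side (these are the standard textbook forms, not expressions taken from the paper, and the ExMATE gradient is omitted because its exact form is specific to that work). For a prompt x with a good example y^+ and a bad example y^-:

\nabla_\theta \mathcal{L}_{\mathrm{MLE}} = -\,\nabla_\theta \log \pi_\theta(y^{+}\mid x),
\qquad
\nabla_\theta \mathcal{L}_{\mathrm{DPO}} = -\,\beta\,\sigma\!\left(\hat r_\theta(x,y^{-}) - \hat r_\theta(x,y^{+})\right)
\left[\nabla_\theta \log \pi_\theta(y^{+}\mid x) - \nabla_\theta \log \pi_\theta(y^{-}\mid x)\right],
\quad \text{where } \hat r_\theta(x,y) = \beta \log \frac{\pi_\theta(y\mid x)}{\pi_{\mathrm{ref}}(y\mid x)}.

MLE only pushes probability mass toward the good example, whereas DPO simultaneously pulls mass away from the bad one, with a weight that grows when the implicit reward ranks the pair incorrectly; differences of exactly this kind are what a gradient-level comparison makes visible.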
+
+
+
+
+ + ☆ Assessing Large Language Models for Online Extremism Research: + Identification, Explanation, and New Knowledge + + +
+ The United States has experienced a significant increase in violent +extremism, prompting the need for automated tools to detect and limit the +spread of extremist ideology online. This study evaluates the performance of +Bidirectional Encoder Representations from Transformers (BERT) and Generative +Pre-Trained Transformers (GPT) in detecting and classifying online domestic +extremist posts. We collected social media posts containing "far-right" and +"far-left" ideological keywords and manually labeled them as extremist or +non-extremist. Extremist posts were further classified into one or more of five +contributing elements of extremism based on a working definitional framework. +The BERT model's performance was evaluated based on training data size and +knowledge transfer between categories. We also compared the performance of GPT +3.5 and GPT 4 models using different prompts: na\"ive, layperson-definition, +role-playing, and professional-definition. Results showed that the best +performing GPT models outperformed the best performing BERT models, with more +detailed prompts generally yielding better results. However, overly complex +prompts may impair performance. Different versions of GPT have unique +sensitivities to what they consider extremist. GPT 3.5 performed better at +classifying far-left extremist posts, while GPT 4 performed better at +classifying far-right extremist posts. Large language models, represented by +GPT models, hold significant potential for online extremism classification +tasks, surpassing traditional BERT models in a zero-shot setting. Future +research should explore human-computer interactions in optimizing GPT models +for extremist detection and classification tasks to develop more efficient +(e.g., quicker, less effort) and effective (e.g., fewer errors or mistakes) +methods for identifying extremist content. +
+
+
+
+
+ + ☆ Theoretical and Methodological Framework for Studying Texts Produced by + Large Language Models + + +
+ This paper addresses the conceptual, methodological and technical challenges +in studying large language models (LLMs) and the texts they produce from a +quantitative linguistics perspective. It builds on a theoretical framework that +distinguishes between the LLM as a substrate and the entities the model +simulates. The paper advocates for a strictly non-anthropomorphic approach to +models while cautiously applying methodologies used in studying human +linguistic behavior to the simulated entities. While natural language +processing researchers focus on the models themselves, their architecture, +evaluation, and methods for improving performance, we as quantitative linguists +should strive to build a robust theory concerning the characteristics of texts +produced by LLMs, how they differ from human-produced texts, and the properties +of simulated entities. Additionally, we should explore the potential of LLMs as +an instrument for studying human culture, of which language is an integral +part. + +
+
+
+
+
+ + ☆ Smaller, Weaker, Yet Better: Training LLM Reasoners via Compute-Optimal + Sampling + + +
+ Training on high-quality synthetic data from strong language models (LMs) is +a common strategy to improve the reasoning performance of LMs. In this work, we +revisit whether this strategy is compute-optimal under a fixed inference budget +(e.g., FLOPs). To do so, we investigate the trade-offs between generating +synthetic data using a stronger but more expensive (SE) model versus a weaker +but cheaper (WC) model. We evaluate the generated data across three key +metrics: coverage, diversity, and false positive rate, and show that the data +from WC models may have higher coverage and diversity, but also exhibit higher +false positive rates. We then finetune LMs on data from SE and WC models in +different settings: knowledge distillation, self-improvement, and a novel +weak-to-strong improvement setup where a weaker LM teaches reasoning to a +stronger LM. Our findings reveal that models finetuned on WC-generated data +consistently outperform those trained on SE-generated data across multiple +benchmarks and multiple choices of WC and SE models. These results challenge +the prevailing practice of relying on SE models for synthetic data generation, +suggesting that WC may be the compute-optimal approach for training advanced LM +reasoners. + +
+
+
+
+
+ + ☆ Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming + + +
+ Recent advances in language models have achieved significant progress. +GPT-4o, as a new milestone, has enabled real-time conversations with humans, +demonstrating near-human natural fluency. Such human-computer interaction +necessitates models with the capability to perform reasoning directly with the +audio modality and generate output in streaming. However, this remains beyond +the reach of current academic models, as they typically depend on extra TTS +systems for speech synthesis, resulting in undesirable latency. This paper +introduces the Mini-Omni, an audio-based end-to-end conversational model, +capable of real-time speech interaction. To achieve this capability, we propose +a text-instructed speech generation method, along with batch-parallel +strategies during inference to further boost the performance. Our method also +helps to retain the original model's language capabilities with minimal +degradation, enabling other works to establish real-time interaction +capabilities. We call this training method "Any Model Can Talk". We also +introduce the VoiceAssistant-400K dataset to fine-tune models optimized for +speech output. To our best knowledge, Mini-Omni is the first fully end-to-end, +open-source model for real-time speech interaction, offering valuable potential +for future research. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in +information retrieval. ColBERT's late interaction scoring approximates the +joint query-document attention seen in cross-encoders while maintaining +inference efficiency closer to traditional dense retrieval models, thanks to +its bi-encoder architecture and recent optimizations in indexing and search. In +this paper, we introduce several improvements to the ColBERT model architecture +and training pipeline, leveraging techniques successful in the more established +single-vector embedding model paradigm, particularly those suited for +heterogeneous multilingual data. Our new model, Jina-ColBERT-v2, demonstrates +strong performance across a range of English and multilingual retrieval tasks, +while also cutting storage requirements by up to 50% compared to previous +models. + +
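The late interaction scoring referred to here is the standard ColBERT MaxSim operator; a minimal Python sketch with random stand-in embeddings follows (the real per-token vectors would come from the trained bi-encoder, and the token counts and dimension below are arbitrary assumptions):

import torch
import torch.nn.functional as F

def maxsim_score(q_emb: torch.Tensor, d_emb: torch.Tensor) -> torch.Tensor:
    # Late interaction: for each query token take its best-matching document
    # token, then sum those maxima over the query tokens.
    sim = q_emb @ d_emb.T                  # (num_q_tokens, num_d_tokens)
    return sim.max(dim=1).values.sum()

# Random stand-ins for per-token embeddings; a real system would take these
# from the trained encoder and L2-normalize them, as done here.
q = F.normalize(torch.randn(8, 128), dim=-1)     # 8 query tokens, dim 128
d = F.normalize(torch.randn(200, 128), dim=-1)   # 200 document tokens
print(float(maxsim_score(q, d)))

Because documents are encoded independently of queries, the per-token document vectors can be pre-computed and indexed, which is what keeps inference close to ordinary dense retrieval despite the richer scoring.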
+
+
+
+
+ + ☆ Iterative Graph Alignment + + +
+ By compressing diverse narratives, LLMs go beyond memorization, achieving +intelligence by capturing generalizable causal relationships. However, they +suffer from local 'representation gaps' due to insufficient training data +diversity, limiting their real-world utility, especially in tasks requiring +strict alignment to rules. Traditional alignment methods relying on heavy human +annotations are inefficient and unscalable. Recent self-alignment techniques +also fall short, as they often depend on self-selection based prompting and +memorization-based learning. To address these issues, we introduce Iterative +Graph Alignment (IGA), an annotation-free rule-based alignment algorithm. A +teacher model (VLM) employs Iterative Graph Prompting (IGP) to create logical +graphs and reference answers. The student model (LLM) identifies local +knowledge gaps by attempting to align its responses with these references, +collaborating with helper models to generate diverse answers. These aligned +responses are then used for iterative supervised fine-tuning (SFT). Our +evaluations across five rule-based scenarios demonstrate IGP's effectiveness, +with a 73.12\% alignment improvement in Claude Sonnet 3.5, and +Llama3-8B-Instruct achieving an 86.20\% improvement, outperforming Claude +Sonnet 3.5 in rule-based alignment. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Enhancing Dialogue Generation in Werewolf Game Through Situation + Analysis and Persuasion Strategies + + +
+ Recent advancements in natural language processing, particularly with large +language models (LLMs) like GPT-4, have significantly enhanced dialogue +systems, enabling them to generate more natural and fluent conversations. +Despite these improvements, challenges persist, such as managing continuous +dialogues, memory retention, and minimizing hallucinations. The AIWolfDial2024 +addresses these challenges by employing the Werewolf Game, an incomplete +information game, to test the capabilities of LLMs in complex interactive +environments. This paper introduces an LLM-based Werewolf Game AI, where each +role is supported by situation analysis to aid response generation. +Additionally, for the werewolf role, various persuasion strategies, including +logical appeal, credibility appeal, and emotional appeal, are employed to +effectively persuade other players to align with its actions. +
+
+ comment: Accepted to the AIWolfDial2024 workshop at INLG 2024 +
+
+
+
+
+ + ☆ Predictability maximization and the origins of word order harmony + + +
+ We address the linguistic problem of the sequential arrangement of a head and +its dependents from an information theoretic perspective. In particular, we +consider the optimal placement of a head that maximizes the predictability of +the sequence. We assume that dependents are statistically independent given a +head, in line with the open-choice principle and the core assumptions of +dependency grammar. We demonstrate the optimality of harmonic order, i.e., +placing the head last maximizes the predictability of the head whereas placing +the head first maximizes the predictability of dependents. We also show that +postponing the head is the optimal strategy to maximize its predictability +while bringing it forward is the optimal strategy to maximize the +predictability of dependents. We unravel the advantages of the strategy of +maximizing the predictability of the head over maximizing the predictability of +dependents. Our findings shed light on the placements of the head adopted by +real languages or emerging in different kinds of experiments. + +
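The information-theoretic intuition can be restated schematically with conditional entropies (a sketch in our own notation, not the paper's derivation), writing h for the head and d_1, ..., d_n for dependents assumed conditionally independent given h, and recalling that lower entropy means higher predictability:

H(h \mid d_1, \dots, d_n) \;\le\; H(h \mid d_{i_1}, \dots, d_{i_k}) \quad (k < n),
\qquad
H(d_i \mid h, d_{j_1}, \dots, d_{j_m}) \;=\; H(d_i \mid h).

The inequality (conditioning never increases entropy) says the head is most predictable when it appears after all its dependents, i.e. under head-last order; the equality, which is exactly the independence assumption, says a dependent's uncertainty is fully reduced as soon as the head has been seen, and the head precedes every dependent only under head-first order.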
+
+
+
+
+ + ☆ SALSA: Speedy ASR-LLM Synchronous Aggregation INTERSPEECH 2024 + + +
+ Harnessing pre-trained LLMs to improve ASR systems, particularly for +low-resource languages, is now an emerging area of research. Existing methods +range from using LLMs for ASR error correction to tightly coupled systems that +replace the ASR decoder with the LLM. These approaches either increase decoding +time or require expensive training of the cross-attention layers. We propose +SALSA, which couples the decoder layers of the ASR to the LLM decoder, while +synchronously advancing both decoders. Such coupling is performed with a simple +projection of the last decoder state, and is thus significantly more training +efficient than earlier approaches. A challenge of our proposed coupling is +handling the mismatch between the tokenizers of the LLM and ASR systems. We +handle this mismatch using cascading tokenization with respect to the LLM and +ASR vocabularies. We evaluate SALSA on 8 low-resource languages in the FLEURS +benchmark, yielding substantial WER reductions of up to 38%. + +
+
+ comment: Accepted to INTERSPEECH 2024 +
+
+
+
+
+ + ☆ CNIMA: A Universal Evaluation Framework and Automated Approach for + Assessing Second Language Dialogues + + +
+ We develop CNIMA (Chinese Non-Native Interactivity Measurement and +Automation), a Chinese-as-a-second-language labelled dataset with 10K +dialogues. We annotate CNIMA using an evaluation framework -- originally +introduced for English-as-a-second-language dialogues -- that assesses +micro-level features (e.g.\ backchannels) and macro-level interactivity labels +(e.g.\ topic management) and test the framework's transferability from English +to Chinese. We found the framework robust across languages and revealed +universal and language-specific relationships between micro-level and +macro-level features. Next, we propose an approach to automate the evaluation +and find strong performance, creating a new tool for automated second language +assessment. Our system can be adapted to other languages easily as it uses +large language models and as such does not require large-scale annotated +training data. + +
+
+
+
+
+ + ☆ LLMs vs Established Text Augmentation Techniques for Classification: + When do the Benefits Outweight the Costs? + + +
+ Generative large language models (LLMs) are increasingly being used for +data augmentation tasks, where text samples are LLM-paraphrased and then used +for classifier fine-tuning. However, research that would confirm a clear +cost-benefit advantage of LLMs over more established augmentation methods is +largely missing. To study if (and when) LLM-based augmentation is +advantageous, we compared the effects of recent LLM augmentation methods with +established ones on 6 datasets, 3 classifiers and 2 fine-tuning methods. We +also varied the number of seeds and collected samples to better explore the +downstream model accuracy space. Finally, we performed a cost-benefit analysis +and show that LLM-based methods are worthy of deployment only when a very small +number of seeds is used. Moreover, in many cases, established methods lead to +similar or better model accuracies. +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Learning from Negative Samples in Generative Biomedical Entity Linking + + +
+ Generative models have become widely used in biomedical entity linking +(BioEL) due to their excellent performance and efficient memory usage. However, +these models are usually trained only with positive samples--entities that +match the input mention's identifier--and do not explicitly learn from hard +negative samples, which are entities that look similar but have different +meanings. To address this limitation, we introduce ANGEL (Learning from +Negative Samples in Generative Biomedical Entity Linking), the first framework +that trains generative BioEL models using negative samples. Specifically, a +generative model is initially trained to generate positive samples from the +knowledge base for given input entities. Subsequently, both correct and +incorrect outputs are gathered from the model's top-k predictions. The model is +then updated to prioritize the correct predictions through direct preference +optimization. Our models fine-tuned with ANGEL outperform the previous best +baseline models by up to an average top-1 accuracy of 1.4% on five benchmarks. +When incorporating our framework into pre-training, the performance improvement +further increases to 1.7%, demonstrating its effectiveness in both the +pre-training and fine-tuning stages. Our code is available at +https://github.com/dmis-lab/ANGEL. + +
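The preference-update step described above (prioritizing correct over incorrect top-k generations) is direct preference optimization; a minimal Python/PyTorch sketch of that loss is given below, assuming per-candidate sequence log-probabilities have already been computed under the current model and a frozen reference model. The tensor values and beta are placeholders, not the paper's settings.

import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    # Prefer the correct entity (chosen) over the hard negative (rejected),
    # measured relative to a frozen reference model.
    margin = beta * ((logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected))
    return -F.logsigmoid(margin).mean()

# Stand-in sequence log-probabilities for four mention/candidate pairs.
logp_chosen = torch.tensor([-12.1, -9.8, -15.0, -11.2], requires_grad=True)
logp_rejected = torch.tensor([-11.5, -10.2, -14.1, -13.0], requires_grad=True)
ref_chosen = torch.tensor([-12.0, -10.0, -15.2, -11.0])
ref_rejected = torch.tensor([-11.4, -10.1, -14.0, -12.8])

loss = dpo_loss(logp_chosen, logp_rejected, ref_chosen, ref_rejected)
loss.backward()
print(float(loss))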
+
+
+
+
+ + ☆ Self-Alignment: Improving Alignment of Cultural Values in LLMs via + In-Context Learning + + +
+ Improving the alignment of Large Language Models (LLMs) with respect to the +cultural values that they encode has become an increasingly important topic. In +this work, we study whether we can exploit existing knowledge about cultural +values at inference time to adjust model responses to cultural value probes. We +present a simple and inexpensive method that uses a combination of in-context +learning (ICL) and human survey data, and show that we can improve the +alignment to cultural values across 5 models that include both English-centric +and multilingual LLMs. Importantly, we show that our method could prove useful +in test languages other than English and can improve alignment to the cultural +values that correspond to a range of culturally diverse countries. + +
+
+
+
+
+ + ☆ Is text normalization relevant for classifying medieval charters? + + +
+ This study examines the impact of historical text normalization on the +classification of medieval charters, specifically focusing on document dating +and locating. Using a data set of Middle High German charters from a digital +archive, we evaluate various classifiers, including traditional and +transformer-based models, with and without normalization. Our results indicate +that the given normalization minimally improves locating tasks but reduces +accuracy for dating, implying that original texts contain crucial features that +normalization may obscure. We find that support vector machines and gradient +boosting outperform other models, questioning the efficiency of transformers +for this use case. Results suggest a selective approach to historical text +normalization, emphasizing the significance of preserving some textual +characteristics that are critical for classification tasks in document +analysis. + +
+
+ comment: This preprint has not undergone peer review or any post-submission + improvements or corrections +
+
+
+
+
+ + ☆ SurveySum: A Dataset for Summarizing Multiple Scientific Articles into a + Survey Section + + +
+ Document summarization is a task to shorten texts into concise and +informative summaries. This paper introduces a novel dataset designed for +summarizing multiple scientific articles into a section of a survey. Our +contributions are: (1) SurveySum, a new dataset addressing the gap in +domain-specific summarization tools; (2) two specific pipelines to summarize +scientific articles into a section of a survey; and (3) the evaluation of these +pipelines using multiple metrics to compare their performance. Our results +highlight the importance of high-quality retrieval stages and the impact of +different configurations on the quality of generated summaries. + +
+
+ comment: 15 pages, 6 figures, 1 table. Submitted to BRACIS 2024 +
+
+
+
+
+ + ☆ Instruction-tuned Large Language Models for Machine Translation in the + Medical Domain + + +
+ Large Language Models (LLMs) have shown promising results on machine +translation for high resource language pairs and domains. However, in +specialised domains (e.g. medical) LLMs have shown lower performance compared +to standard neural machine translation models. The consistency in the machine +translation of terminology is crucial for users, researchers, and translators +in specialised domains. In this study, we compare the performance between +baseline LLMs and instruction-tuned LLMs in the medical domain. In addition, we +introduce terminology from specialised medical dictionaries into the +instruction formatted datasets for fine-tuning LLMs. The instruction-tuned LLMs +significantly outperform the baseline models with automatic metrics. + +
+
+
+
+
+ + ☆ MQM-Chat: Multidimensional Quality Metrics for Chat Translation + + +
+ The complexities of chats pose significant challenges for machine translation +models. Recognizing the need for a precise evaluation metric to address the +issues of chat translation, this study introduces Multidimensional Quality +Metrics for Chat Translation (MQM-Chat). Through the experiments of five models +using MQM-Chat, we observed that all models generated certain fundamental +errors, while each of them has different shortcomings, such as omission, overly +correcting ambiguous source content, and buzzword issues, resulting in the loss +of stylized information. Our findings underscore the effectiveness of MQM-Chat +in evaluating chat translation, emphasizing the importance of stylized content +and dialogue consistency for future studies. + +
+
+
+
+
+ + ☆ The Unreasonable Ineffectiveness of Nucleus Sampling on Mitigating Text + Memorization + + +
+ This work analyses the text memorization behavior of large language models +(LLMs) when subjected to nucleus sampling. Stochastic decoding methods like +nucleus sampling are typically applied to overcome issues such as monotonous +and repetitive text generation, which are often observed with +maximization-based decoding techniques. We hypothesize that nucleus sampling +might also reduce the occurrence of memorization patterns, because it could +lead to the selection of tokens outside the memorized sequence. To test this +hypothesis we create a diagnostic dataset with a known distribution of +duplicates that gives us some control over the likelihood of memorization of +certain parts of the training data. Our analysis of two GPT-Neo models +fine-tuned on this dataset interestingly shows that (i) an increase of the +nucleus size reduces memorization only modestly, and (ii) even when models do +not engage in "hard" memorization -- a verbatim reproduction of training +samples -- they may still display "soft" memorization whereby they generate +outputs that echo the training data but without a complete one-by-one +resemblance. + +
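For reference, nucleus (top-p) sampling keeps only the smallest set of highest-probability tokens whose cumulative mass exceeds p and renormalizes before sampling; a compact Python sketch follows (the vocabulary size and p are arbitrary assumptions, not values from the paper):

import torch

def nucleus_sample(logits: torch.Tensor, p: float = 0.9) -> int:
    # Keep the smallest set of highest-probability tokens whose cumulative
    # mass exceeds p, renormalize, and sample one token id from that set.
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, sorted_ids = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    cutoff = int(torch.searchsorted(cumulative, p)) + 1
    kept = sorted_probs[:cutoff] / sorted_probs[:cutoff].sum()
    return int(sorted_ids[torch.multinomial(kept, 1)])

print(nucleus_sample(torch.randn(50_000), p=0.95))

The hypothesis under study is that, whenever the memorized continuation does not dominate the nucleus, the sampler has a chance to step off the memorized sequence; the paper's finding is that in practice this effect is modest.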
+
+ comment: 9 pages, Accepted at INLG 2024 (International Natural Language + Generation Conference) +
+
+
+
+
+ + ☆ Critic-CoT: Boosting the reasoning abilities of large language model via + Chain-of-thoughts Critic + + +
+ Self-criticism has become an important mechanism for enhancing the reasoning +performance of LLMs. However, current approaches mainly involve basic prompts +without further training, which tend to be over-simplified, leading to limited +accuracy. Moreover, there is a lack of in-depth investigation of the +relationship between an LLM's ability to critique and its task-solving +performance. To address these issues, we propose Critic-CoT, a novel framework +that pushes LLMs toward System-2-like critic capability, via step-wise CoT +reasoning format and distant-supervision data construction, without the need +for human annotation. Experiments on GSM8K and MATH show that via filtering out +invalid solutions or iterative refinement, our enhanced model boosts +task-solving performance, which demonstrates the effectiveness of our method. +Further, we find that training on critique and refinement alone improves +generation. We hope our work could shed light on future research on improving +the reasoning and critic ability of LLMs. +
+
+
+
+
+ + ☆ Physics of Language Models: Part 2.2, How to Learn From Mistakes on + Grade-School Math Problems + + +
+ Language models have demonstrated remarkable performance in solving reasoning +tasks; however, even the strongest models still occasionally make reasoning +mistakes. Recently, there has been active research aimed at improving reasoning +accuracy, particularly by using pretrained language models to "self-correct" +their mistakes via multi-round prompting. In this paper, we follow this line of +work but focus on understanding the usefulness of incorporating +"error-correction" data directly into the pretraining stage. This data consists +of erroneous solution steps immediately followed by their corrections. Using a +synthetic math dataset, we show promising results: this type of pretrain data +can help language models achieve higher reasoning accuracy directly (i.e., +through simple auto-regression, without multi-round prompting) compared to +pretraining on the same amount of error-free data. We also delve into many +details, such as (1) how this approach differs from beam search, (2) how such +data can be prepared, (3) whether masking is needed on the erroneous tokens, +(4) the amount of error required, (5) whether such data can be deferred to the +fine-tuning stage, and many others. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.20311 +
+
+
+
+
+ + ☆ Measuring the Accuracy of Automatic Speech Recognition Solutions + + +
+ For d/Deaf and hard of hearing (DHH) people, captioning is an essential +accessibility tool. Significant developments in artificial intelligence (AI) +mean that Automatic Speech Recognition (ASR) is now a part of many popular +applications. This makes creating captions easy and broadly available - but +transcription needs high levels of accuracy to be accessible. Scientific +publications and industry report very low error rates, claiming AI has reached +human parity or even outperforms manual transcription. At the same time the DHH +community reports serious issues with the accuracy and reliability of ASR. +There seems to be a mismatch between technical innovations and the real-life +experience for people who depend on transcription. Independent and +comprehensive data is needed to capture the state of ASR. We measured the +performance of eleven common ASR services with recordings of Higher Education +lectures. We evaluated the influence of technical conditions like streaming, +the use of vocabularies, and differences between languages. Our results show +that accuracy ranges widely between vendors and for the individual audio +samples. We also measured significantly lower quality for streaming ASR, which +is used for live events. Our study shows that despite recent improvements +in ASR, common services lack reliability in accuracy. +
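Accuracy in this kind of study is typically reported as word error rate (WER); a self-contained Python reference implementation of the underlying word-level edit distance is sketched below. The abstract does not specify the authors' own evaluation tooling, so this is only the standard definition with a made-up example sentence.

def word_error_rate(reference: str, hypothesis: str) -> float:
    # WER = (substitutions + deletions + insertions) / number of reference words,
    # computed with word-level Levenshtein dynamic programming.
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + cost)   # substitution or match
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

print(word_error_rate("the lecture starts at nine", "the lecture start at nine am"))  # 0.4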
+
+
+
+
+ + ☆ Enhancing AI-Driven Psychological Consultation: Layered Prompts with + Large Language Models + + +
+ Psychological consultation is essential for improving mental health and +well-being, yet challenges such as the shortage of qualified professionals and +scalability issues limit its accessibility. To address these challenges, we +explore the use of large language models (LLMs) like GPT-4 to augment +psychological consultation services. Our approach introduces a novel layered +prompting system that dynamically adapts to user input, enabling comprehensive +and relevant information gathering. We also develop empathy-driven and +scenario-based prompts to enhance the LLM's emotional intelligence and +contextual understanding in therapeutic settings. We validated our approach +through experiments using a newly collected dataset of psychological +consultation dialogues, demonstrating significant improvements in response +quality. The results highlight the potential of our prompt engineering +techniques to enhance AI-driven psychological consultation, offering a scalable +and accessible solution to meet the growing demand for mental health support. + +
+
+
+
+
+ + ☆ LoraMap: Harnessing the Power of LoRA Connections + + +
+ Large Language Models (LLMs) can benefit from mitigating hallucinations +through fact-checking and overcoming substantial computational overhead with +parameter-efficient techniques such as Low-Rank Adaptation (LoRA). While some +studies have explored the parallel integration of multiple LoRAs, these +approaches have paid little attention to the connections between them. This paper +investigates methods to establish connections among multiple LoRAs. We create +three reasoning datasets tailored to fact-checking and fine-tune individual +LoRAs, allowing them to view and reason from diverse perspectives. Then, we +explore strategies for allocating these reasoning LoRAs and introduce LoraMap, +an approach to map connections between them. The results on the fact-checking +task demonstrate that the performance of LoraMap is superior to LoraHub, an +existing LoRA composition method. LoraMap also outperforms LoraConcat, which +concatenates LoRAs and further fine-tunes them, while using significantly +fewer parameters. +
+
+ comment: 13 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ Making the Most of your Model: Methods for Finetuning and Applying + Pretrained Transformers + + +
+ This thesis provides methods and analysis of models which make progress on +this goal. The techniques outlined are task agnostic, and should provide +benefit when used with nearly any transformer LM. We introduce two new +finetuning methods which add new capabilities to the models they are used on. +The first adds a recurrence mechanism, which removes the fixed-window sized +constraint and improves the efficiency of a transformer decoder. The second +allows masked language models (MLMs) to be used for initialization of both the +encoder and decoder of a non-autoregressive sequence-to-sequence transformer, +opening up generative applications of models which were previously only used +for natural language understanding tasks. + We also introduce two new techniques for improving the quality of predictions +of any transformer decoder without additional finetuning. One, hidden state +optimization, can be applied to any transformer decoder to improve the quality +of predictions at inference time, especially for few-shot classification. The +other, conditional beam search, allows practitioners to search for natural +language generation (NLG) model outputs with high likelihood while conditioning +on the event that the output is not degenerate (e.g. empty, repetitive, etc.). + Finally, we provide theoretical and empirical insights on the divergence of +model-likelihood and output quality which has widely been observed in prior +work. These insights apply to any model which represents a distribution over +text, and apply to language models which are not transformers or even +autoregressive. We argue that the NLP community has, to some extent, +misunderstood the implications of these findings, and encourage a point of view +which has more nuance. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ SSDM: Scalable Speech Dysfluency Modeling + + +
+ Speech dysfluency modeling is the core module for spoken language learning, +and speech therapy. However, there are three challenges. First, current +state-of-the-art solutions suffer from poor scalability. Second, there is a +lack of a large-scale dysfluency corpus. Third, there is not an effective +learning framework. In this paper, we propose \textit{SSDM: Scalable Speech +Dysfluency Modeling}, which (1) adopts articulatory gestures as scalable forced +alignment; (2) introduces connectionist subsequence aligner (CSA) to achieve +dysfluency alignment; (3) introduces a large-scale simulated dysfluency corpus +called Libri-Dys; and (4) develops an end-to-end system by leveraging the power +of large language models (LLMs). We expect SSDM to serve as a standard in the +area of dysfluency modeling. Demo is available at +\url{https://eureka235.github.io}. + +
+
+
+
+
+ + ☆ M4CXR: Exploring Multi-task Potentials of Multi-modal Large Language + Models for Chest X-ray Interpretation + + +
+ The rapid evolution of artificial intelligence, especially in large language +models (LLMs), has significantly impacted various domains, including +healthcare. In chest X-ray (CXR) analysis, previous studies have employed LLMs, +but with limitations: either underutilizing the multi-tasking capabilities of +LLMs or lacking clinical accuracy. This paper presents M4CXR, a multi-modal LLM +designed to enhance CXR interpretation. The model is trained on a visual +instruction-following dataset that integrates various task-specific datasets in +a conversational format. As a result, the model supports multiple tasks such as +medical report generation (MRG), visual grounding, and visual question +answering (VQA). M4CXR achieves state-of-the-art clinical accuracy in MRG by +employing a chain-of-thought prompting strategy, in which it identifies +findings in CXR images and subsequently generates corresponding reports. The +model is adaptable to various MRG scenarios depending on the available inputs, +such as single-image, multi-image, and multi-study contexts. In addition to +MRG, M4CXR performs visual grounding at a level comparable to specialized +models and also demonstrates outstanding performance in VQA. Both quantitative +and qualitative assessments reveal M4CXR's versatility in MRG, visual +grounding, and VQA, while consistently maintaining clinical accuracy. + +
+
+
+
+
+ + ☆ From cart to truck: meaning shift through words in English in the last + two centuries + + +
+ This onomasiological study uses diachronic word embeddings to explore how +different words represented the same concepts over time, using historical word +data from 1800 to 2000. We identify shifts in energy, transport, entertainment, +and computing domains, revealing connections between language and societal +changes. + Our approach consists of training diachronic word embeddings with +word2vec (skip-gram) and aligning them using orthogonal Procrustes. We +discuss possible difficulties linked to the relationships the method +identifies. Moreover, we look at the ethical aspects of interpreting results, +highlighting the need for expert insights to understand the method's +significance. +
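The alignment step mentioned here, orthogonal Procrustes between embedding spaces trained on different periods, has a closed-form SVD solution; a minimal numpy sketch with random stand-in matrices follows (the vocabulary size, dimensionality, and period labels are arbitrary assumptions):

import numpy as np

def procrustes_align(source: np.ndarray, target: np.ndarray) -> np.ndarray:
    # Closed-form orthogonal Procrustes: find the rotation R minimizing
    # ||source @ R - target||_F and return the rotated source embeddings.
    u, _, vt = np.linalg.svd(source.T @ target)
    return source @ (u @ vt)

# Random stand-ins for embedding matrices over a shared vocabulary.
rng = np.random.default_rng(0)
emb_1850 = rng.normal(size=(5000, 100))
emb_1950 = rng.normal(size=(5000, 100))
emb_1850_aligned = procrustes_align(emb_1850, emb_1950)
print(emb_1850_aligned.shape)

Restricting the map to a rotation preserves cosine similarities within each period, so after alignment nearest-neighbour comparisons across periods become meaningful without distorting either space.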
+
+ comment: 7 pages, 1 figure +
+
+
+
+
+ + ☆ ReXamine-Global: A Framework for Uncovering Inconsistencies in Radiology + Report Generation Metrics + + +
+ Given the rapidly expanding capabilities of generative AI models for +radiology, there is a need for robust metrics that can accurately measure the +quality of AI-generated radiology reports across diverse hospitals. We develop +ReXamine-Global, a LLM-powered, multi-site framework that tests metrics across +different writing styles and patient populations, exposing gaps in their +generalization. First, our method tests whether a metric is undesirably +sensitive to reporting style, providing different scores depending on whether +AI-generated reports are stylistically similar to ground-truth reports or not. +Second, our method measures whether a metric reliably agrees with experts, or +whether metric and expert scores of AI-generated report quality diverge for +some sites. Using 240 reports from 6 hospitals around the world, we apply +ReXamine-Global to 7 established report evaluation metrics and uncover serious +gaps in their generalizability. Developers can apply ReXamine-Global when +designing new report evaluation metrics, ensuring their robustness across +sites. Additionally, our analysis of existing metrics can guide users of those +metrics towards evaluation procedures that work reliably at their sites of +interest. + +
+
+
+
+
+ + ☆ Benchmarking Japanese Speech Recognition on ASR-LLM Setups with + Multi-Pass Augmented Generative Error Correction + + +
+ With the strong representational power of large language models (LLMs), +generative error correction (GER) for automatic speech recognition (ASR) aims +to provide semantic and phonetic refinements to address ASR errors. This work +explores how LLM-based GER can enhance and expand the capabilities of Japanese +language processing, presenting the first GER benchmark for Japanese ASR with +0.9-2.6k text utterances. We also introduce a new multi-pass augmented +generative error correction (MPA GER) by integrating multiple system hypotheses +on the input side with corrections from multiple LLMs on the output side and +then merging them. To the best of our knowledge, this is the first +investigation of the use of LLMs for Japanese GER, which involves second-pass +language modeling on the output transcriptions generated by the ASR system +(e.g., N-best hypotheses). Our experiments demonstrate that the proposed +methods improve ASR quality and generalization on both the SPREDS-U1-ja +and CSJ data. +
+
+ comment: submitted to SLT2024 +
+
+
+
+
+ + ☆ A longitudinal sentiment analysis of Sinophobia during COVID-19 using + large language models + + +
+ The COVID-19 pandemic has exacerbated xenophobia, particularly Sinophobia, +leading to widespread discrimination against individuals of Chinese descent. +Large language models (LLMs) are pre-trained deep learning models used for +natural language processing (NLP) tasks. The ability of LLMs to understand and +generate human-like text makes them particularly useful for analysing social +media data to detect and evaluate sentiments. We present a sentiment analysis +framework utilising LLMs for longitudinal sentiment analysis of the Sinophobic +sentiments expressed in X (Twitter) during the COVID-19 pandemic. The results +show a significant correlation between the spikes in Sinophobic tweets, +Sinophobic sentiments and surges in COVID-19 cases, revealing that the +evolution of the pandemic influenced public sentiment and the prevalence of +Sinophobic discourse. Furthermore, the sentiment analysis revealed a +predominant presence of negative sentiments, such as annoyance and denial, +which underscores the impact of political narratives and misinformation in shaping +public opinion. The absence of the empathetic sentiment observed in previous +COVID-19 studies highlights how political narratives in the media framed the +pandemic and blamed the Chinese community. Our study +highlights the importance of transparent communication in mitigating xenophobic +sentiments during global crises. +
+
+
+
+
+ + ☆ Plausible-Parrots @ MSP2023: Enhancing Semantic Plausibility Modeling + using Entity and Event Knowledge + + +
+ In this work, we investigate the effectiveness of injecting external +knowledge into a large language model (LLM) to identify the semantic plausibility of +simple events. Specifically, we enhance the LLM with fine-grained entity types, +event types and their definitions extracted from an external knowledge base. +This knowledge is injected into our system via designed templates. We also +augment the data to balance the label distribution and adapt the task setting +to real-world scenarios in which event mentions are expressed as natural +language sentences. The experimental results show the effectiveness of the +injected knowledge on modeling the semantic plausibility of events. An error +analysis further emphasizes the importance of identifying non-trivial entity +and event types. +
+
+ comment: 10 pages, 5 figures, 5 tables +
+
+
+
+
+ + ☆ Event Extraction for Portuguese: A QA-driven Approach using ACE-2005 + + +
+ Event extraction is an Information Retrieval task that commonly consists of +identifying the central word for the event (trigger) and the event's arguments. +This task has been extensively studied for English but lags behind for +Portuguese, partly due to the lack of task-specific annotated corpora. This +paper proposes a framework in which two separate BERT-based models are +fine-tuned to identify and classify events in Portuguese documents. We +decompose this task into two sub-tasks. Firstly, we use a token classification +model to detect event triggers. To extract event arguments, we train a Question +Answering model that queries the triggers about their corresponding event +argument roles. Given the lack of event annotated corpora in Portuguese, we +translated the original version of the ACE-2005 dataset (a reference in the +field) into Portuguese, producing a new corpus for Portuguese event extraction. +To accomplish this, we developed an automatic translation pipeline. Our +framework obtains F1 scores of 64.4 for trigger classification and 46.7 for +argument classification, thus setting a new state-of-the-art reference for +these tasks in Portuguese. +
+
+
+
+
+ + ☆ ACE-2005-PT: Corpus for Event Extraction in Portuguese + + +
+ Event extraction is an NLP task that commonly involves identifying the +central word (trigger) for an event and its associated arguments in text. +ACE-2005 is widely recognised as the standard corpus in this field. While other +corpora, like PropBank, primarily focus on annotating predicate-argument +structure, ACE-2005 provides comprehensive information about the overall event +structure and semantics. However, its limited language coverage restricts its +usability. This paper introduces ACE-2005-PT, a corpus created by translating +ACE-2005 into Portuguese, with European and Brazilian variants. To speed up the +process of obtaining ACE-2005-PT, we rely on automatic translators. This, +however, poses some challenges related to automatically identifying the correct +alignments between multi-word annotations in the original text and in the +corresponding translated sentence. To achieve this, we developed an alignment +pipeline that incorporates several alignment techniques: lemmatization, fuzzy +matching, synonym matching, multiple translations and a BERT-based word +aligner. To measure the alignment effectiveness, a subset of annotations from +the ACE-2005-PT corpus was manually aligned by a linguist expert. This subset +was then compared against our pipeline results which achieved exact and relaxed +match scores of 70.55\% and 87.55\% respectively. As a result, we successfully +generated a Portuguese version of the ACE-2005 corpus, which has been accepted +for publication by LDC. + +
+
+
+
+
+ + ☆ Exploring Multiple Strategies to Improve Multilingual Coreference + Resolution in CorefUD + + +
+ Coreference resolution, the task of identifying expressions in text that +refer to the same entity, is a critical component in various natural language +processing (NLP) applications. This paper presents our end-to-end neural +coreference resolution system, utilizing the CorefUD 1.1 dataset, which spans +17 datasets across 12 languages. We first establish strong baseline models, +including monolingual and cross-lingual variations, and then propose several +extensions to enhance performance across diverse linguistic contexts. These +extensions include cross-lingual training, incorporation of syntactic +information, a Span2Head model for optimized headword prediction, and advanced +singleton modeling. We also experiment with headword span representation and +long-document modeling through overlapping segments. The proposed extensions, +particularly the heads-only approach, singleton modeling, and long-document +prediction, significantly improve performance across most datasets. We also +perform zero-shot cross-lingual experiments, highlighting the potential and +limitations of cross-lingual transfer in coreference resolution. Our findings +contribute to the development of robust and scalable coreference systems for +multilingual coreference resolution. Finally, we evaluate our model on the CorefUD +1.1 test set and surpass the best model of comparable size from the CRAC 2023 +shared task by a large margin. Our model is available on GitHub: +\url{https://github.com/ondfa/coref-multiling} +
+
+
+
+
+ + ☆ LLaVA-Chef: A Multi-modal Generative Model for Food Recipes + + +
+ In the rapidly evolving landscape of online recipe sharing within a +globalized context, there has been a notable surge in research towards +comprehending and generating food recipes. Recent advancements in large +language models (LLMs) like GPT-2 and LLaVA have paved the way for Natural +Language Processing (NLP) approaches to delve deeper into various facets of +food-related tasks, encompassing ingredient recognition and comprehensive +recipe generation. Despite impressive performance and multi-modal adaptability +of LLMs, domain-specific training remains paramount for their effective +application. This work evaluates existing LLMs for recipe generation and +proposes LLaVA-Chef, a novel model trained on a curated dataset of diverse +recipe prompts in a multi-stage approach. First, we refine the mapping of +visual food image embeddings to the language space. Second, we adapt LLaVA to +the food domain by fine-tuning it on relevant recipe data. Third, we utilize +diverse prompts to enhance the model's recipe comprehension. Finally, we +improve the linguistic quality of generated recipes by penalizing the model +with a custom loss function. LLaVA-Chef demonstrates impressive improvements +over pretrained LLMs and prior works. A detailed qualitative analysis reveals +that LLaVA-Chef generates more detailed recipes with precise ingredient +mentions, compared to existing approaches. + +
+
+
+
+
+ + ☆ Modeling offensive content detection for TikTok + + +
+ The advent of social media transformed interpersonal communication and +information consumption processes. This digital landscape accommodates user +intentions, also resulting in an increase in offensive language and harmful +behavior. Concurrently, social media platforms collect vast datasets comprising +user-generated content and behavioral information. These datasets are +instrumental for platforms deploying machine learning and data-driven +strategies, facilitating customer insights and countermeasures against social +manipulation mechanisms like disinformation and offensive content. +Nevertheless, the availability of such datasets, along with the application of +various machine learning techniques, to researchers and practitioners, for +specific social media platforms regarding particular events, is limited. In +particular for TikTok, which offers unique tools for personalized content +creation and sharing, the existing body of knowledge would benefit from having +diverse comprehensive datasets and associated data analytics solutions on +offensive content. While efforts from social media platforms, research, and +practitioner communities are seen in this regard, such content continues to +proliferate. This translates to an essential need to make datasets publicly +available and build corresponding intelligent solutions. Accordingly, this +research undertakes the collection and analysis of TikTok data containing +offensive content, building a series of machine learning and deep learning +models for offensive content detection. This is done aiming at answering the +following research question: "How to develop a series of computational models +to detect offensive content on TikTok?". To this end, a Data Science +methodological approach is considered, 120,423 TikTok comments are collected, +and, using a balanced binary classification approach, an F1 score of 0.863 +is obtained. +
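As a point of reference for the kind of binary offensive-content classifier described, the following Python/scikit-learn sketch trains a TF-IDF plus logistic-regression baseline and reports F1; the tiny hand-made comments are hypothetical stand-ins, and the resulting score has no relation to the 0.863 reported above.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Hypothetical stand-in for the labelled comment corpus (1 = offensive).
comments = ["great video!", "you are an idiot", "love this song",
            "nobody wants you here", "so funny", "get lost, loser"] * 20
labels = [0, 1, 0, 1, 0, 1] * 20

x_train, x_test, y_train, y_test = train_test_split(
    comments, labels, test_size=0.25, stratify=labels, random_state=42)

clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)),
                    LogisticRegression(max_iter=1000))
clf.fit(x_train, y_train)
print("F1:", f1_score(y_test, clf.predict(x_test)))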
+
+ comment: Accepted as a conference paper at DPSH 2024, 8 pages +
+
+
+
+
+ + ☆ See or Guess: Counterfactually Regularized Image Captioning ACM MM 2024 + + +
+ Image captioning, which generates natural language descriptions of the visual +information in an image, is a crucial task in vision-language research. +Previous models have typically addressed this task by aligning the generative +capabilities of machines with human intelligence through statistical fitting of +existing datasets. While effective for normal images, they may struggle to +accurately describe those where certain parts of the image are obscured or +edited, unlike humans who excel in such cases. These weaknesses they exhibit, +including hallucinations and limited interpretability, often hinder performance +in scenarios with shifted association patterns. In this paper, we present a +generic image captioning framework that employs causal inference to make +existing models more capable of interventional tasks, and counterfactually +explainable. Our approach includes two variants leveraging either total effect +or natural direct effect. Integrating them into the training process enables +models to handle counterfactual scenarios, increasing their generalizability. +Extensive experiments on various datasets show that our method effectively +reduces hallucinations and improves the model's faithfulness to images, +demonstrating high portability across both small-scale and large-scale +image-to-text models. The code is available at +https://github.com/Aman-4-Real/See-or-Guess. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Awes, Laws, and Flaws From Today's LLM Research + + +
+ We perform a critical examination of the scientific methodology behind +contemporary large language model (LLM) research. For this we assess over 2,000 +research works based on criteria typical of what is considered good research +(e.g. presence of statistical tests and reproducibility) and cross-validate it +with arguments that are at the centre of controversy (e.g., claims of emergent +behaviour, the use of LLMs as evaluators). We find multiple trends, such as +declines in claims of emergent behaviour and ethics disclaimers; the rise of +LLMs as evaluators in spite of a lack of consensus from the community about +their useability; and an increase of claims of LLM reasoning abilities, +typically without leveraging human evaluation. This paper underscores the need +for more scrutiny and rigour by and from this field to live up to the +fundamentals of a responsible scientific method that is ethical, reproducible, +systematic, and open to criticism. + +
+
+ comment: Under review -- v1 was an old draft with an unrevised abstract (oops) +
+
+
+
+
+ + ♻ ☆ CC-GPX: Extracting High-Quality Annotated Geospatial Data from Common + Crawl SP + + +
+ The Common Crawl (CC) corpus is the largest open web crawl dataset containing +9.5+ petabytes of data captured since 2008. The dataset is instrumental in +training large language models, and as such it has been studied for +(un)desirable content, and distilled for smaller, domain-specific datasets. +However, to our knowledge, no research has been dedicated to using CC as a +source of annotated geospatial data. In this paper, we introduce an efficient +pipeline to extract annotated user-generated tracks from GPX files found in CC, +and the resulting multimodal dataset with 1,416 pairings of human-written +descriptions and MultiLineString vector data from the 6 most recent CC +releases. The dataset can be used to study people's outdoor activity patterns, +the way people talk about their outdoor experiences, as well as for developing +trajectory generation or track annotation models, or for various other problems +in place of synthetically generated routes. Our reproducible code is available +on GitHub: https://github.com/ilyankou/cc-gpx + +
+
+ comment: Accepted as a poster to ACM SIGSPATIAL 2024 +
+
+
+
+
+ + ♻ ☆ Quantifying Geospatial in the Common Crawl Corpus SP + + +
+ Large language models (LLMs) exhibit emerging geospatial capabilities,
+stemming from their pre-training on vast unlabelled text datasets that are
+often derived from the Common Crawl (CC) corpus. However, the geospatial
+content within CC remains largely unexplored, impacting our understanding of
+LLMs' spatial reasoning. This paper investigates the prevalence of geospatial
+data in recent Common Crawl releases using Gemini 1.5, a powerful language
+model. By analyzing a sample of documents and manually revising the results, we
+estimate that 18.7% of web documents in CC contain geospatial information such
+as coordinates and addresses. We find little difference in prevalence between
+English- and non-English-language documents. Our findings provide quantitative
+insights into the nature and extent of geospatial data in CC, and lay the
+groundwork for future studies of geospatial biases of LLMs.
+
+
+
+ comment: Accepted as a poster to ACM SIGSPATIAL 2024 +
+
+
+
+
+ + ♻ ☆ GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless + Generative Inference of LLM + + +
+ Key-value (KV) caching has become the de facto technique for accelerating
+generation speed in large language model (LLM) inference. However, the growing
+cache demand with increasing sequence length has transformed LLM inference into
+a memory-bound problem, significantly constraining the system throughput.
+Existing methods rely on dropping unimportant tokens or quantizing all entries
+uniformly. Such methods, however, often incur high approximation errors when
+representing the compressed matrices. The autoregressive decoding process
+further compounds the error at each step, resulting in critical deviation in
+model generation and deterioration of performance. To tackle this challenge, we
+propose GEAR, an efficient KV cache compression framework that achieves
+near-lossless high-ratio compression. GEAR first applies quantization to the
+majority of entries of similar magnitudes at ultra-low precision. It then
+employs a low-rank matrix to approximate the quantization error, and a sparse
+matrix to remedy individual errors from outlier entries. By adeptly integrating
+the three techniques, GEAR is able to fully exploit their synergistic
+potentials. Our experiments demonstrate that compared to alternatives, GEAR
+achieves near-lossless 4-bit KV cache compression with up to 2.38x throughput
+improvement, while reducing peak-memory size by up to 2.29x. Our code is
+publicly available at https://github.com/HaoKang-Timmy/GEAR.
+
+
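A small numerical sketch of the three-part decomposition described above: quantize entries to low precision, approximate the quantization error with a low-rank factor, and keep a sparse correction for outliers. The bit width, rank, and outlier fraction are illustrative assumptions, and for simplicity all entries are quantized rather than only the majority of similar magnitude.

```python
# Sketch of a GEAR-style KV-cache compression step. Bit width, rank, and the
# outlier fraction below are illustrative assumptions, not the paper's values.
import numpy as np

def compress_kv(kv, bits=4, rank=2, outlier_frac=0.01):
    # 1) Uniform quantization of entries to `bits` precision.
    lo, hi = kv.min(), kv.max()
    levels = 2 ** bits - 1
    q = np.round((kv - lo) / (hi - lo) * levels)
    dequant = q / levels * (hi - lo) + lo

    # 2) Low-rank approximation of the quantization error via truncated SVD.
    resid = kv - dequant
    u, s, vt = np.linalg.svd(resid, full_matrices=False)
    low_rank = u[:, :rank] @ np.diag(s[:rank]) @ vt[:rank, :]

    # 3) Sparse matrix correcting the largest remaining (outlier) errors.
    remaining = resid - low_rank
    k = max(1, int(outlier_frac * remaining.size))
    idx = np.unravel_index(np.argsort(-np.abs(remaining), axis=None)[:k], remaining.shape)
    sparse = np.zeros_like(remaining)
    sparse[idx] = remaining[idx]

    return dequant + low_rank + sparse

if __name__ == "__main__":
    kv = np.random.randn(64, 128).astype(np.float32)
    approx = compress_kv(kv)
    print("relative error:", np.linalg.norm(kv - approx) / np.linalg.norm(kv))
```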
+
+
+
+
+ + ♻ ☆ Not (yet) the whole story: Evaluating Visual Storytelling Requires More + than Measuring Coherence, Grounding, and Repetition + + +
+ Visual storytelling consists in generating a natural language story given a +temporally ordered sequence of images. This task is not only challenging for +models, but also very difficult to evaluate with automatic metrics since there +is no consensus about what makes a story 'good'. In this paper, we introduce a +novel method that measures story quality in terms of human likeness regarding +three key aspects highlighted in previous work: visual grounding, coherence, +and repetitiveness. We then use this method to evaluate the stories generated +by several models, showing that the foundation model LLaVA obtains the best +result, but only slightly so compared to TAPM, a 50-times smaller visual +storytelling model. Upgrading the visual and language components of TAPM +results in a model that yields competitive performance with a relatively low +number of parameters. Finally, we carry out a human evaluation study, whose +results suggest that a 'good' story may require more than a human-like level of +visual grounding, coherence, and repetition. + +
+
+
+
+
+ + ♻ ☆ Smart Multi-Modal Search: Contextual Sparse and Dense Embedding + Integration in Adobe Express CIKM 2024 + + +
+ As user content and queries become increasingly multi-modal, the need for +effective multi-modal search systems has grown. Traditional search systems +often rely on textual and metadata annotations for indexed images, while +multi-modal embeddings like CLIP enable direct search using text and image +embeddings. However, embedding-based approaches face challenges in integrating +contextual features such as user locale and recency. Building a scalable +multi-modal search system requires fine-tuning several components. This paper +presents a multi-modal search architecture and a series of AB tests that +optimize embeddings and multi-modal technologies in Adobe Express template +search. We address considerations such as embedding model selection, the roles +of embeddings in matching and ranking, and the balance between dense and sparse +embeddings. Our iterative approach demonstrates how utilizing sparse, dense, +and contextual features enhances short and long query search, significantly +reduces null rates (over 70\%), and increases click-through rates (CTR). Our +findings provide insights into developing robust multi-modal search systems, +thereby enhancing relevance for complex queries. + +
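Balancing dense and sparse signals, as discussed above, is often realized as a weighted combination of the two scores. The sketch below interpolates a cosine-similarity dense score with a bag-of-words sparse score; the 0.7/0.3 weighting and the toy features are assumptions for illustration, not the production configuration described in the paper.

```python
# Illustrative hybrid retrieval scoring: interpolate a dense (embedding) score
# with a sparse (lexical) score. Weights and features are toy assumptions.
import numpy as np
from collections import Counter

def dense_score(q_vec, d_vec):
    return float(np.dot(q_vec, d_vec) / (np.linalg.norm(q_vec) * np.linalg.norm(d_vec)))

def sparse_score(q_tokens, d_tokens):
    q, d = Counter(q_tokens), Counter(d_tokens)
    return float(sum(q[t] * d[t] for t in q))

def hybrid_score(q_vec, d_vec, q_tokens, d_tokens, alpha=0.7):
    return alpha * dense_score(q_vec, d_vec) + (1 - alpha) * sparse_score(q_tokens, d_tokens)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    qv, dv = rng.normal(size=16), rng.normal(size=16)
    print(hybrid_score(qv, dv, ["birthday", "card"], ["birthday", "party", "card"]))
```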
+
+ comment: CIKM 2024 (International Conference on Information and Knowledge + Management), Multimodal Search and Recommendations Workshop +
+
+
+
+
+ + ♻ ☆ Mitigating Exaggerated Safety in Large Language Models + + +
+ As the popularity of Large Language Models (LLMs) grows, combining model
+safety with utility becomes increasingly important. The challenge is making
+sure that LLMs can recognize and decline dangerous prompts without sacrificing
+their ability to be helpful. The problem of "exaggerated safety" demonstrates
+how difficult this can be. To reduce excessive safety behaviours -- where 26.1%
+of safe prompts were found to be misclassified as dangerous and refused -- we
+use a combination of XSTest dataset prompts as well as interactive, contextual,
+and few-shot prompting to examine the decision bounds of LLMs such as Llama2,
+Gemma, Command R+, and Phi-3. We find that few-shot prompting works best for
+Llama2, interactive prompting works best for Gemma, and contextual prompting
+works best for Command R+ and Phi-3. Using a combination of these prompting
+strategies, we are able to mitigate exaggerated safety behaviors by an overall
+92.9% across all LLMs. Our work presents multiple prompting strategies to
+jailbreak LLMs' decision-making processes, allowing them to navigate the fine
+line between refusing unsafe prompts and remaining helpful.
+
+
+
+ comment: 17 pages, 8 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Adaptive Reinforcement Learning Planning: Harnessing Large Language + Models for Complex Information Extraction + + +
+ Existing research on large language models (LLMs) shows that they can solve
+information extraction tasks through multi-step planning. However, their
+extraction behavior on complex sentences and tasks is unstable, giving rise to
+issues such as false positives and missing elements. We observe that
+decomposing complex extraction tasks and extracting them step by step can
+effectively improve LLMs' performance, and that the extraction order of
+entities significantly affects the final results of LLMs. This paper proposes a
+two-stage multi-step method for LLM-based information extraction and adopts the
+RL framework to execute the multi-step planning. We regard sequential
+extraction as a Markov decision process, build an LLM-based extraction
+environment, design a decision module to adaptively provide the optimal order
+for sequential entity extraction on different sentences, and utilize the DDQN
+algorithm to train the decision model. We also design the rewards and
+evaluation metrics suitable for the extraction results of LLMs. We conduct
+extensive experiments on multiple public datasets to demonstrate the
+effectiveness of our method in improving the information extraction
+capabilities of LLMs.
+
+
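The decision module above is trained with DDQN; as a much smaller stand-in, the sketch below uses tabular double Q-learning to pick which entity type to extract next. The state space, toy reward, and hyper-parameters are illustrative assumptions, not the paper's LLM-based environment.

```python
# Simplified stand-in for the decision module: tabular double Q-learning over
# "which entity type to extract next", with a toy reward that prefers one fixed
# order. Everything below is assumed for illustration only.
import random
from collections import defaultdict

ACTIONS = ["PERSON", "ORG", "LOCATION"]

def toy_reward(extracted, action):
    # Pretend extracting PERSON first, then ORG, then LOCATION works best.
    preferred = ["PERSON", "ORG", "LOCATION"]
    return 1.0 if action == preferred[len(extracted)] else 0.0

def train(episodes=2000, alpha=0.1, gamma=0.9, eps=0.1):
    qa, qb = defaultdict(float), defaultdict(float)
    for _ in range(episodes):
        extracted = ()
        while len(extracted) < len(ACTIONS):
            remaining = [a for a in ACTIONS if a not in extracted]
            if random.random() < eps:
                action = random.choice(remaining)
            else:
                action = max(remaining, key=lambda a: qa[(extracted, a)] + qb[(extracted, a)])
            reward = toy_reward(extracted, action)
            nxt = extracted + (action,)
            nxt_remaining = [a for a in ACTIONS if a not in nxt]
            # Double Q-learning: select with one table, evaluate with the other.
            q1, q2 = (qa, qb) if random.random() < 0.5 else (qb, qa)
            if nxt_remaining:
                best = max(nxt_remaining, key=lambda a: q1[(nxt, a)])
                target = reward + gamma * q2[(nxt, best)]
            else:
                target = reward
            q1[(extracted, action)] += alpha * (target - q1[(extracted, action)])
            extracted = nxt
    return qa, qb

if __name__ == "__main__":
    qa, qb = train()
    first = max(ACTIONS, key=lambda a: qa[((), a)] + qb[((), a)])
    print("learned first extraction:", first)
```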
+
+
+
+
+ + ♻ ☆ Conan-embedding: General Text Embedding with More and Better Negative + Samples + + +
+ With the growing popularity of RAG, the capabilities of embedding models are
+gaining increasing attention. Embedding models are primarily trained through
+contrastive loss learning, with negative examples being a key component.
+Previous work has proposed various hard negative mining strategies, but these
+strategies are typically employed as preprocessing steps. In this paper, we
+propose the conan-embedding model, which maximizes the utilization of more and
+higher-quality negative examples. Specifically, since the model's ability to
+handle preprocessed negative examples evolves during training, we propose a
+dynamic hard negative mining method to expose the model to more challenging
+negative examples throughout the training process. Secondly, contrastive
+learning requires as many negative examples as possible but is limited by GPU
+memory constraints. Therefore, we use a Cross-GPU balancing Loss to provide
+more negative examples for embedding training and balance the batch size across
+multiple tasks. Moreover, we also discovered that the prompt-response pairs
+from LLMs can be used for embedding training. Our approach effectively enhances
+the capabilities of embedding models, currently ranking first on the Chinese
+leaderboard of the Massive Text Embedding Benchmark (MTEB).
+
+
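A minimal sketch of dynamic hard negative mining in contrastive embedding training: at every step, the current model re-scores a candidate pool and the hardest candidates are used as negatives in an InfoNCE-style loss. The pool size, temperature, and random stand-in embeddings are illustrative assumptions, not the paper's recipe, and the cross-GPU balancing loss is omitted.

```python
# Sketch of dynamic hard negative mining for contrastive embedding training.
# Pool size, temperature, and the random embeddings are toy assumptions.
import torch
import torch.nn.functional as F

def mine_hard_negatives(query_emb, pool_emb, k=4):
    """Pick the k most similar pool items under the *current* model as negatives."""
    sims = F.normalize(query_emb, dim=-1) @ F.normalize(pool_emb, dim=-1).T  # [B, P]
    return sims.topk(k, dim=-1).indices                                      # [B, k]

def contrastive_loss(query_emb, pos_emb, pool_emb, temperature=0.05):
    neg_idx = mine_hard_negatives(query_emb, pool_emb)          # re-mined every step
    neg_emb = pool_emb[neg_idx]                                 # [B, k, D]
    q = F.normalize(query_emb, dim=-1)
    pos = F.normalize(pos_emb, dim=-1)
    neg = F.normalize(neg_emb, dim=-1)
    pos_score = (q * pos).sum(-1, keepdim=True)                 # [B, 1]
    neg_score = torch.einsum("bd,bkd->bk", q, neg)              # [B, k]
    logits = torch.cat([pos_score, neg_score], dim=-1) / temperature
    labels = torch.zeros(logits.size(0), dtype=torch.long)      # positive is index 0
    return F.cross_entropy(logits, labels)

if __name__ == "__main__":
    B, P, D = 8, 256, 32
    loss = contrastive_loss(torch.randn(B, D), torch.randn(B, D), torch.randn(P, D))
    print(float(loss))
```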
+
+
+
+
+ + ♻ ☆ Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease + Classification: A Systematic Review + + +
+ Parkinson's disease (PD), the second most prevalent neurodegenerative +disorder worldwide, frequently presents with early-stage speech impairments. +Recent advancements in Artificial Intelligence (AI), particularly deep learning +(DL), have significantly enhanced PD diagnosis through the analysis of speech +data. Nevertheless, the progress of research is restricted by the limited +availability of publicly accessible speech-based PD datasets, primarily due to +privacy concerns. The goal of this systematic review is to explore the current +landscape of speech-based DL approaches for PD classification, based on 33 +scientific works published between 2020 and March 2024. We discuss their +available resources, capabilities, potential limitations, and issues related to +bias, explainability, and privacy. Furthermore, this review provides an +overview of publicly accessible speech-based datasets and open-source material +for PD. The DL approaches are categorized into end-to-end (E2E) learning, +transfer learning (TL) and deep acoustic features extraction (DAFE) approaches. +Among E2E approaches, Convolutional Neural Networks (CNNs) are prevalent, +though Transformers are increasingly popular. E2E approaches face challenges +such as limited data and computational resources, especially with Transformers. +TL addresses these issues by providing more robust PD diagnosis and better +generalizability across languages. DAFE aims to improve the explainability and +interpretability of results by examining the specific effects of deep features +on both other DL approaches and more traditional machine learning (ML) methods. +However, it often underperforms compared to E2E and TL approaches. + +
+
+ comment: Submitted in Applied Sciences - peer reviewed Open Access journal. + This research was funded by the NWO research programme AiNed Fellowship + Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant + number NGF.1607.22.013 +
+
+
+
+
+ + ♻ ☆ Can LLMs perform structured graph reasoning? ICPR + + +
+ Pretrained Large Language Models (LLMs) have demonstrated various reasoning +capabilities through language-based prompts alone, particularly in unstructured +task settings (tasks purely based on language semantics). However, LLMs often +struggle with structured tasks, because of the inherent incompatibility of +input representation. Reducing structured tasks to uni-dimensional language +semantics often renders the problem trivial. Keeping the trade-off between LLM +compatibility and structure complexity in mind, we design various graph +reasoning tasks as a proxy to semi-structured tasks in this paper, in order to +test the ability to navigate through representations beyond plain text in +various LLMs. Particularly, we design 10 distinct problems of graph traversal, +each representing increasing levels of complexity, and benchmark 5 different +instruct-finetuned LLMs (GPT-4, GPT-3.5, Claude-2, Llama-2 and Palm-2) on the +aforementioned tasks. Further, we analyse the performance of models across +various settings such as varying sizes of graphs as well as different forms of +k-shot prompting. We highlight various limitations, biases and properties of +LLMs through this benchmarking process, such as an inverse relation to the +average degrees of freedom of traversal per node in graphs, the overall +negative impact of k-shot prompting on graph reasoning tasks, and a positive +response bias which prevents LLMs from identifying the absence of a valid +solution. Finally, we introduce a new prompting technique specially designed +for graph traversal tasks (PathCompare), which demonstrates a notable increase +in the performance of LLMs in comparison to standard prompting techniques such +as Chain-of-Thought (CoT). + +
+
+ comment: International Conference on Pattern Recognition (ICPR), 2024 +
+
+
+
+
+ + ♻ ☆ The Odyssey of Commonsense Causality: From Foundational Benchmarks to + Cutting-Edge Reasoning + + +
+ Understanding commonsense causality is a unique mark of intelligence for +humans. It helps people understand the principles of the real world better and +benefits the decision-making process related to causation. For instance, +commonsense causality is crucial in judging whether a defendant's action causes +the plaintiff's loss in determining legal liability. Despite its significance, +a systematic exploration of this topic is notably lacking. Our comprehensive +survey bridges this gap by focusing on taxonomies, benchmarks, acquisition +methods, qualitative reasoning, and quantitative measurements in commonsense +causality, synthesizing insights from over 200 representative articles. Our +work aims to provide a systematic overview, update scholars on recent +advancements, provide a pragmatic guide for beginners, and highlight promising +future research directions in this vital field. + +
+
+ comment: 42 pages +
+
+
+
+
+ + ♻ ☆ Inverse-Q*: Token Level Reinforcement Learning for Aligning Large + Language Models Without Preference Data + + +
+ Reinforcement Learning from Human Feedback (RLHF) has proven effective in +aligning large language models with human intentions, yet it often relies on +complex methodologies like Proximal Policy Optimization (PPO) that require +extensive hyper-parameter tuning and present challenges in sample efficiency +and stability. In this paper, we introduce Inverse-Q*, an innovative framework +that transcends traditional RL methods by optimizing token-level reinforcement +learning without the need for additional reward or value models. Inverse-Q* +leverages direct preference optimization techniques but extends them by +estimating the conditionally optimal policy directly from the model's +responses, facilitating more granular and flexible policy shaping. Our approach +reduces reliance on human annotation and external supervision, making it +especially suitable for low-resource settings. We present extensive +experimental results demonstrating that Inverse-Q* not only matches but +potentially exceeds the effectiveness of PPO in terms of convergence speed and +the alignment of model responses with human preferences. Our findings suggest +that Inverse-Q* offers a practical and robust alternative to conventional RLHF +approaches, paving the way for more efficient and adaptable model training +approaches. + +
+
+
+
+
+ + ♻ ☆ IKUN for WMT24 General MT Task: LLMs Are here for Multilingual Machine + Translation + + +
+ This paper introduces two multilingual systems, IKUN and IKUN-C, developed +for the general machine translation task in WMT24. IKUN and IKUN-C represent an +open system and a constrained system, respectively, built on Llama-3-8b and +Mistral-7B-v0.3. Both systems are designed to handle all 11 language directions +using a single model. According to automatic evaluation metrics, IKUN-C +achieved 6 first-place and 3 second-place finishes among all constrained +systems, while IKUN secured 1 first-place and 2 second-place finishes across +both open and constrained systems. These encouraging results suggest that large +language models (LLMs) are nearing the level of proficiency required for +effective multilingual machine translation. The systems are based on a +two-stage approach: first, continuous pre-training on monolingual data in 10 +languages, followed by fine-tuning on high-quality parallel data for 11 +language directions. The primary difference between IKUN and IKUN-C lies in +their monolingual pre-training strategy. IKUN-C is pre-trained using +constrained monolingual data, whereas IKUN leverages monolingual data from the +OSCAR dataset. In the second phase, both systems are fine-tuned on parallel +data sourced from NTREX, Flores, and WMT16-23 for all 11 language pairs. + +
+
+ comment: typo: 120K -> 12K vocabulary size +
+
+
+
+
+ + ♻ ☆ ReMamba: Equip Mamba with Effective Long-Sequence Modeling + + +
+ While the Mamba architecture demonstrates superior inference efficiency and
+competitive performance on short-context natural language processing (NLP)
+tasks, empirical evidence suggests its capacity to comprehend long contexts is
+limited compared to transformer-based models. In this study, we investigate the
+long-context efficiency issues of the Mamba models and propose ReMamba, which
+enhances Mamba's ability to comprehend long contexts. ReMamba incorporates
+selective compression and adaptation techniques within a two-stage re-forward
+process, incurring minimal additional inference overhead. Experimental results
+on the LongBench and L-Eval benchmarks demonstrate ReMamba's efficacy, improving
+over the baselines by 3.2 and 1.6 points, respectively, and attaining
+performance almost on par with same-size transformer models.
+
+
+
+
+
+
+ + ♻ ☆ A Preference-driven Paradigm for Enhanced Translation with Large + Language Models NAACL 2024 + + +
+ Recent research has shown that large language models (LLMs) can achieve +remarkable translation performance through supervised fine-tuning (SFT) using +only a small amount of parallel data. However, SFT simply instructs the model +to imitate the reference translations at the token level, making it vulnerable +to the noise present in the references. Hence, the assistance from SFT often +reaches a plateau once the LLMs have achieved a certain level of translation +capability, and further increasing the size of parallel data does not provide +additional benefits. To overcome this plateau associated with imitation-based +SFT, we propose a preference-based approach built upon the Plackett-Luce model. +The objective is to steer LLMs towards a more nuanced understanding of +translation preferences from a holistic view, while also being more resilient +in the absence of gold translations. We further build a dataset named MAPLE to +verify the effectiveness of our approach, which includes multiple translations +of varying quality for each source sentence. Extensive experiments demonstrate +the superiority of our approach in "breaking the plateau" across diverse LLMs +and test settings. Our in-depth analysis underscores the pivotal role of +diverse translations and accurate preference scores in the success of our +approach. + +
+
+ comment: Accepted to NAACL 2024 (long, main) +
+
+
+
+
+ + ♻ ☆ TEncDM: Understanding the Properties of Diffusion Model in the Space of + Language Model Encodings + + +
+ This paper presents the Text Encoding Diffusion Model (TEncDM), a novel +approach to diffusion modeling that operates in the space of pre-trained +language model encodings. In contrast to traditionally used embeddings, +encodings integrate contextual information. In our approach, we also employ a +transformer-based decoder, specifically designed to incorporate context in the +token prediction process. We conduct a comprehensive examination of the +influence of the encoder, decoder, noise scheduler, and self-conditioning on +zero-shot generation. Furthermore, we compare TEncDM with previous approaches +on three conditional text generation tasks: QQP, XSum, and Wiki-Auto. The +results show that TEncDM exhibits superior performance compared to existing +non-autoregressive diffusion models. + +
+
+ comment: 14 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Helmsman of the Masses? Evaluate the Opinion Leadership of Large + Language Models in the Werewolf Game + + +
+ Large language models (LLMs) have exhibited memorable strategic behaviors in +social deductive games. However, the significance of opinion leadership +exhibited by LLM-based agents has been largely overlooked, which is crucial for +practical applications in multi-agent and human-AI interaction settings. +Opinion leaders are individuals who have a noticeable impact on the beliefs and +behaviors of others within a social group. In this work, we employ the Werewolf +game as a simulation platform to assess the opinion leadership of LLMs. The +game includes the role of the Sheriff, tasked with summarizing arguments and +recommending decision options, and therefore serves as a credible proxy for an +opinion leader. We develop a framework integrating the Sheriff role and devise +two novel metrics based on the critical characteristics of opinion leaders. The +first metric measures the reliability of the opinion leader, and the second +assesses the influence of the opinion leader on other players' decisions. We +conduct extensive experiments to evaluate LLMs of different scales. In +addition, we collect a Werewolf question-answering dataset (WWQA) to assess and +enhance LLM's grasp of the game rules, and we also incorporate human +participants for further analysis. The results suggest that the Werewolf game +is a suitable test bed to evaluate the opinion leadership of LLMs, and few LLMs +possess the capacity for opinion leadership. + +
+
+ comment: Published as a conference paper at COLM 2024. 37 pages, 6 figures, 27 + tables +
+
+
+
+
+ + ♻ ☆ MaskMoE: Boosting Token-Level Learning via Routing Mask in + Mixture-of-Experts + + +
+ Scaling the size of a model enhances its capabilities but significantly +increases computation complexity. Mixture-of-Experts models (MoE) address the +issue by allowing model size to scale up without substantially increasing +training or inference costs. In MoE, there is an important module called the +router, which is used to distribute each token to the experts. Currently, the +mainstream routing methods include dynamic routing and fixed routing. Despite +their promising results, MoE models encounter several challenges. Primarily, +for dynamic routing methods, the dispersion of training tokens across multiple +experts can lead to underfitting, particularly for infrequent tokens. +Additionally, though fixed routing methods can mitigate that issue, they +compromise on the diversity of representations. In this paper, we propose +\textbf{MaskMoE}, a method designed to enhance token-level learning by +employing a routing \textbf{mask}ing technique within the +\textbf{M}ixture-\textbf{o}f-\textbf{E}xperts model. MaskMoE is capable of +maintaining representation diversity while achieving more comprehensive +training. Experimental results demonstrate that our method outperforms previous +dominant Mixture-of-Experts models in terms of both perplexity (PPL) and +downstream task performance. + +
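One way to picture the routing mask described above: give each vocabulary item a fixed binary mask over experts, with infrequent tokens restricted to fewer experts so their training signal is not dispersed. The masking rule, thresholds, and sizes below are illustrative assumptions rather than the exact MaskMoE design.

```python
# Sketch of routing masking in a Mixture-of-Experts router. The masking rule
# and sizes here are illustrative assumptions, not the exact MaskMoE design.
import torch
import torch.nn.functional as F

def build_routing_mask(token_freq, n_experts, rare_threshold=100, rare_experts=2):
    """Frequent tokens may route to all experts; rare tokens only to a fixed few."""
    vocab = token_freq.numel()
    mask = torch.ones(vocab, n_experts)
    rare = token_freq < rare_threshold
    for tok in torch.nonzero(rare).flatten().tolist():
        allowed = torch.randperm(n_experts)[:rare_experts]   # fixed experts per rare token
        row = torch.zeros(n_experts)
        row[allowed] = 1.0
        mask[tok] = row
    return mask

def masked_route(hidden, token_ids, router_weight, routing_mask, top_k=2):
    logits = hidden @ router_weight                                  # [T, E]
    logits = logits.masked_fill(routing_mask[token_ids] == 0, float("-inf"))
    probs = F.softmax(logits, dim=-1)
    return probs.topk(top_k, dim=-1)                                 # chosen experts per token

if __name__ == "__main__":
    vocab, n_experts, d = 50, 8, 16
    freq = torch.randint(0, 500, (vocab,))
    mask = build_routing_mask(freq, n_experts)
    hidden = torch.randn(10, d)
    token_ids = torch.randint(0, vocab, (10,))
    router = torch.randn(d, n_experts)
    print(masked_route(hidden, token_ids, router, mask).indices)
```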
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ LRP4RAG: Detecting Hallucinations in Retrieval-Augmented Generation via + Layer-wise Relevance Propagation + + +
+ Retrieval-Augmented Generation (RAG) has become a primary technique for +mitigating hallucinations in large language models (LLMs). However, incomplete +knowledge extraction and insufficient understanding can still mislead LLMs to +produce irrelevant or even contradictory responses, which means hallucinations +persist in RAG. In this paper, we propose LRP4RAG, a method based on the +Layer-wise Relevance Propagation (LRP) algorithm for detecting hallucinations +in RAG. Specifically, we first utilize LRP to compute the relevance between the +input and output of the RAG generator. We then apply further extraction and +resampling to the relevance matrix. The processed relevance data are input into +multiple classifiers to determine whether the output contains hallucinations. +To the best of our knowledge, this is the first time that LRP has been used for +detecting RAG hallucinations, and extensive experiments demonstrate that +LRP4RAG outperforms existing baselines. + +
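The downstream part of the pipeline above (resampling the relevance matrix and feeding it to classifiers) can be sketched as follows; the LRP computation itself is replaced by synthetic relevance matrices, and the pooling size and classifier choice are illustrative assumptions.

```python
# Sketch of the classification step: a token-level relevance matrix (random here,
# standing in for the LRP output) is pooled to a fixed size and classified.
import numpy as np
from sklearn.linear_model import LogisticRegression

def resample_relevance(rel, out_len=8, in_len=8):
    """Average-pool an arbitrary [out_tokens, in_tokens] relevance matrix to a fixed grid."""
    rows = np.array_split(rel, out_len, axis=0)
    pooled = np.stack([
        np.array([chunk.mean() for chunk in np.array_split(r, in_len, axis=1)])
        for r in rows
    ])
    return pooled.flatten()

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X, y = [], []
    for _ in range(200):
        label = int(rng.integers(0, 2))          # 1 = hallucination (toy: weaker relevance)
        scale = 0.5 if label else 1.0
        rel = rng.random((rng.integers(10, 40), rng.integers(10, 40))) * scale
        X.append(resample_relevance(rel))
        y.append(label)
    clf = LogisticRegression(max_iter=1000).fit(X[:150], y[:150])
    print("toy accuracy:", clf.score(X[150:], y[150:]))
```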
+
+
+
+
+ + ♻ ☆ PsychoGAT: A Novel Psychological Measurement Paradigm through + Interactive Fiction Games with LLM Agents ACL 2024 + + +
+ Psychological measurement is essential for mental health, self-understanding, +and personal development. Traditional methods, such as self-report scales and +psychologist interviews, often face challenges with engagement and +accessibility. While game-based and LLM-based tools have been explored to +improve user interest and automate assessment, they struggle to balance +engagement with generalizability. In this work, we propose PsychoGAT +(Psychological Game AgenTs) to achieve a generic gamification of psychological +assessment. The main insight is that powerful LLMs can function both as adept +psychologists and innovative game designers. By incorporating LLM agents into +designated roles and carefully managing their interactions, PsychoGAT can +transform any standardized scales into personalized and engaging interactive +fiction games. To validate the proposed method, we conduct psychometric +evaluations to assess its effectiveness and employ human evaluators to examine +the generated content across various psychological constructs, including +depression, cognitive distortions, and personality traits. Results demonstrate +that PsychoGAT serves as an effective assessment tool, achieving statistically +significant excellence in psychometric metrics such as reliability, convergent +validity, and discriminant validity. Moreover, human evaluations confirm +PsychoGAT's enhancements in content coherence, interactivity, interest, +immersion, and satisfaction. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Internal Consistency and Self-Feedback in Large Language Models: A + Survey + + +
+ Large language models (LLMs) often exhibit deficient reasoning or generate +hallucinations. To address these, studies prefixed with "Self-" such as +Self-Consistency, Self-Improve, and Self-Refine have been initiated. They share +a commonality: involving LLMs evaluating and updating themselves. Nonetheless, +these efforts lack a unified perspective on summarization, as existing surveys +predominantly focus on categorization. + In this paper, we summarize a theoretical framework, Internal Consistency, +offering explanations for reasoning deficiencies and hallucinations. Internal +Consistency refers to the consistency in expressions among LLMs' latent, +decoding, or response layers based on sampling methodologies. Then, we +introduce another effective theoretical framework capable of mining Internal +Consistency, named Self-Feedback. This framework consists of two modules: +Self-Evaluation and Self-Update. The former captures Internal Consistency +Signals, while the latter leverages the signals to enhance either the model's +response or the model itself. This framework has been employed in numerous +studies. + We systematically classify these studies by tasks and lines of work; +summarize relevant evaluation methods and benchmarks; and delve into the +concern, "Does Self-Feedback Really Work?" We also propose several critical +viewpoints, including the "Hourglass Evolution of Internal Consistency", +"Consistency Is (Almost) Correctness" hypothesis, and "The Paradox of Latent +and Explicit Reasoning". The relevant resources are open-sourced at +https://github.com/IAAR-Shanghai/ICSFSurvey. + +
+
+ comment: 24 pages, 9 figures, 7 tables, 14 equations +
+
+
+
+
+ + ♻ ☆ InstructERC: Reforming Emotion Recognition in Conversation with + Multi-task Retrieval-Augmented Large Language Models + + +
+ The field of emotion recognition in conversation (ERC) has focused on
+separating sentence feature encoding and context modeling, lacking exploration
+of generative paradigms based on unified designs. In this study, we propose a
+novel approach, InstructERC, to reformulate the ERC task from a discriminative
+framework to a generative framework based on Large Language Models (LLMs).
+InstructERC makes three significant contributions: (1) it introduces a simple
+yet effective retrieval template module, which helps the model explicitly
+integrate multi-granularity dialogue supervision information. (2) We introduce
+two additional emotion alignment tasks, namely speaker identification and
+emotion prediction tasks, to implicitly model the dialogue role relationships
+and future emotional tendencies in conversations. (3) Pioneeringly, we unify
+emotion labels across benchmarks through the feeling wheel to fit real
+application scenarios. InstructERC still performs impressively on this unified
+dataset. Our LLM-based plugin framework significantly outperforms all previous
+models and achieves comprehensive SOTA on three commonly used ERC datasets.
+Extensive analysis of parameter-efficient and data-scaling experiments provides
+empirical guidance for applying it in practical scenarios.
+
+
+
+
+
+
+ + ♻ ☆ TF-Attack: Transferable and Fast Adversarial Attacks on Large Language + Models + + +
+ With the great advancements in large language models (LLMs), adversarial
+attacks against LLMs have recently attracted increasing attention. We found
+that pre-existing adversarial attack methodologies exhibit limited
+transferability and are notably inefficient, particularly when applied to LLMs.
+In this paper, we analyze the core mechanisms of previous predominant
+adversarial attack methods, revealing that 1) the distributions of importance
+scores differ markedly among victim models, restricting the transferability;
+2) the sequential attack process induces substantial time overheads. Based on
+the above two insights, we introduce a new scheme, named TF-Attack, for
+Transferable and Fast adversarial attacks on LLMs. TF-Attack employs an
+external LLM as a third-party overseer rather than the victim model to identify
+critical units within sentences. Moreover, TF-Attack introduces the concept of
+Importance Level, which allows for parallel substitutions of attacks. We
+conduct extensive experiments on 6 widely adopted benchmarks, evaluating the
+proposed method through both automatic and human metrics. Results show that our
+method consistently surpasses previous methods in transferability and delivers
+significant speed improvements, up to 20 times faster than earlier attack
+strategies.
+
+
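A rough illustration of the Importance Level idea described above: an external scorer (a stub here) buckets word positions by importance, and all positions in the same bucket are substituted in one parallel pass rather than one at a time. The scorer, synonym table, and number of levels are assumptions for illustration, not the paper's attack.

```python
# Illustrative grouping of word positions into importance levels, with all words
# in a level substituted together. Scorer and synonym table are toy assumptions.

SYNONYMS = {"good": "decent", "movie": "film", "really": "truly", "bad": "poor"}

def importance_levels(words, scorer, n_levels=2):
    """Bucket word positions into levels by descending importance score."""
    ranked = sorted(range(len(words)), key=lambda i: -scorer(words[i]))
    size = max(1, len(ranked) // n_levels)
    return [ranked[i:i + size] for i in range(0, len(ranked), size)]

def parallel_substitute(words, scorer):
    words = list(words)
    for level in importance_levels(words, scorer):
        # All positions in one level are replaced together (a single parallel step).
        for i in level:
            words[i] = SYNONYMS.get(words[i], words[i])
    return " ".join(words)

if __name__ == "__main__":
    toy_scorer = lambda w: len(w)   # stand-in for an external LLM's importance score
    print(parallel_substitute("this movie is really good".split(), toy_scorer))
```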
+
+ comment: 14 pages, 6 figures. arXiv admin note: text overlap with + arXiv:2305.17440 by other authors +
+
+
+
+
+ + ♻ ☆ BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General + Role-Playing Language Model + + +
+ The rapid advancement of large language models (LLMs) has revolutionized +role-playing, enabling the development of general role-playing models. However, +current role-playing training has two significant issues: (I) Using a +predefined role profile to prompt dialogue training for specific scenarios +usually leads to inconsistencies and even conflicts between the dialogue and +the profile, resulting in training biases. (II) The model learns to imitate the +role based solely on the profile, neglecting profile-dialogue alignment at the +sentence level. In this work, we propose a simple yet effective framework +called BEYOND DIALOGUE, designed to overcome these hurdles. This framework +innovatively introduces "beyond dialogue" tasks to align dialogue with profile +traits based on each specific scenario, thereby eliminating biases during +training. Furthermore, by adopting an innovative prompting mechanism that +generates reasoning outcomes for training, the framework allows the model to +achieve fine-grained alignment between profile and dialogue at the sentence +level. The aforementioned methods are fully automated and low-cost. +Additionally, the integration of automated dialogue and objective evaluation +methods forms a comprehensive framework, paving the way for general +role-playing. Experimental results demonstrate that our model excels in +adhering to and reflecting various dimensions of role profiles, outperforming +most proprietary general and specialized role-playing baselines. All code and +datasets are available at https://github.com/yuyouyu32/BeyondDialogue. + +
+
+
+
+
+ + ♻ ☆ GenRec: Generative Sequential Recommendation with Large Language Models + + +
+ Sequential recommendation is the task of capturing hidden user preferences
+from historical user-item interaction data and recommending the next items for
+the user. Significant progress has been made in this domain by leveraging
+classification-based learning methods. Inspired by the recent paradigm of
+'pretrain, prompt and predict' in NLP, we consider sequential recommendation as
+a sequence-to-sequence generation task and propose a novel model named
+Generative Recommendation (GenRec). Unlike classification-based models that
+learn explicit user and item representations, GenRec utilizes the sequence
+modeling capability of Transformer and adopts the masked item prediction
+objective to effectively learn the hidden bidirectional sequential patterns.
+Different from existing generative sequential recommendation models, GenRec
+does not rely on manually designed hard prompts. The input to GenRec is a
+textual user-item sequence and the output is the top-ranked next items.
+Moreover, GenRec is lightweight and requires only a few hours to train
+effectively in low-resource settings, making it highly applicable to real-world
+scenarios and helping to democratize large language models in the sequential
+recommendation domain. Our extensive experiments have demonstrated that GenRec
+generalizes well on various public real-world datasets and achieves
+state-of-the-art results. Our experiments also validate the effectiveness of
+the proposed masked item prediction objective, which improves the model
+performance by a large margin.
+
+
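A toy sketch of the masked item prediction objective described above, using item IDs and a small Transformer encoder rather than the paper's textual sequences; the model size, mask rate, and data are assumptions for illustration.

```python
# Minimal masked item prediction: randomly mask items in an interaction
# sequence and train the model to recover them. All sizes are toy assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyMaskedItemModel(nn.Module):
    def __init__(self, n_items, d=32):
        super().__init__()
        self.mask_id = n_items                      # reserve one extra id for [MASK]
        self.emb = nn.Embedding(n_items + 1, d)
        layer = nn.TransformerEncoderLayer(d_model=d, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.out = nn.Linear(d, n_items)

    def forward(self, seq):
        return self.out(self.encoder(self.emb(seq)))

def masked_item_step(model, seq, mask_rate=0.2):
    mask = torch.rand(seq.shape) < mask_rate
    inputs = seq.masked_fill(mask, model.mask_id)
    logits = model(inputs)
    return F.cross_entropy(logits[mask], seq[mask])

if __name__ == "__main__":
    n_items = 100
    model = TinyMaskedItemModel(n_items)
    seq = torch.randint(0, n_items, (4, 12))        # 4 users, 12 interactions each
    loss = masked_item_step(model, seq)
    loss.backward()
    print("masked-item loss:", float(loss))
```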
+
+
+
+
+ + ♻ ☆ CReMa: Crisis Response through Computational Identification and Matching + of Cross-Lingual Requests and Offers Shared on Social Media + + +
+ During times of crisis, social media platforms play a crucial role in +facilitating communication and coordinating resources. In the midst of chaos +and uncertainty, communities often rely on these platforms to share urgent +pleas for help, extend support, and organize relief efforts. However, the +overwhelming volume of conversations during such periods can escalate to +unprecedented levels, necessitating the automated identification and matching +of requests and offers to streamline relief operations. Additionally, there is +a notable absence of studies conducted in multi-lingual settings, despite the +fact that any geographical area can have a diverse linguistic population. +Therefore, we propose CReMa (Crisis Response Matcher), a systematic approach +that integrates textual, temporal, and spatial features to address the +challenges of effectively identifying and matching requests and offers on +social media platforms during emergencies. Our approach utilizes a +crisis-specific pre-trained model and a multi-lingual embedding space. We +emulate human decision-making to compute temporal and spatial features and +non-linearly weigh the textual features. The results from our experiments are +promising, outperforming strong baselines. Additionally, we introduce a novel +multi-lingual dataset simulating help-seeking and offering assistance on social +media in 16 languages and conduct comprehensive cross-lingual experiments. +Furthermore, we analyze a million-scale geotagged global dataset to understand +patterns in seeking help and offering assistance on social media. Overall, +these contributions advance the field of crisis informatics and provide +benchmarks for future research in the area. + +
+
+ comment: \copyright 2024 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ ECC Analyzer: Extract Trading Signal from Earnings Conference Calls + using Large Language Model for Stock Performance Prediction + + +
+ In the realm of financial analytics, leveraging unstructured data, such as
+earnings conference calls (ECCs), to forecast stock volatility is a critical
+challenge that has attracted both academics and investors. While previous
+studies have used multimodal deep learning-based models to obtain a general
+view of ECCs for volatility prediction, they often fail to capture detailed,
+complex information. Our research introduces a novel framework: \textbf{ECC
+Analyzer}, which utilizes large language models (LLMs) to extract richer, more
+predictive content from ECCs to improve the model's prediction performance. We
+use pre-trained large models to extract textual and audio features from ECCs
+and implement a hierarchical information extraction strategy to extract more
+fine-grained information. This strategy first extracts paragraph-level general
+information by summarizing the text and then extracts fine-grained focus
+sentences using Retrieval-Augmented Generation (RAG). These features are then
+fused through multimodal feature fusion to perform volatility prediction.
+Experimental results demonstrate that our model outperforms traditional
+analytical benchmarks, confirming the effectiveness of advanced LLM techniques
+in financial analysis.
+
+
+
+ comment: 9 pages, 1 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Large Language Multimodal Models for 5-Year Chronic Disease Cohort + Prediction Using EHR Data + + +
+ Chronic diseases such as diabetes are the leading causes of morbidity and
+mortality worldwide. Numerous studies have attempted diagnosis with various
+deep learning models. However, most previous studies had certain limitations,
+including using publicly available datasets (e.g. MIMIC) and imbalanced data.
+In this study, we collected five-year electronic health records (EHRs) from the
+Taiwan hospital database, including 1,420,596 clinical notes, 387,392
+laboratory test results, and more than 1,505 laboratory test items, with a
+focus on pre-training large language models. We proposed a novel Large Language
+Multimodal Models (LLMMs) framework incorporating multimodal data from clinical
+notes and laboratory test results for the prediction of chronic disease risk.
+Our method combined a text embedding encoder and a multi-head attention layer
+to learn laboratory test values, utilizing a deep neural network (DNN) module
+to merge blood features with chronic disease semantics into a latent space. In
+our experiments, we observe that clinicalBERT and PubMed-BERT, when combined
+with attention fusion, can achieve an accuracy of 73% in multiclass chronic
+disease and diabetes prediction. By transforming laboratory test values into
+textual descriptions and employing the Flan-T5 model, we achieved a 76% Area
+Under the ROC Curve (AUROC), demonstrating the effectiveness of leveraging
+numerical text data for training and inference in language models. This
+approach significantly improves the accuracy of early-stage diabetes
+prediction.
+
+
+
+
+
+
+ + ♻ ☆ Use of a Structured Knowledge Base Enhances Metadata Curation by Large + Language Models + + +
+ Metadata play a crucial role in ensuring the findability, accessibility,
+interoperability, and reusability of datasets. This paper investigates the
+potential of large language models (LLMs), specifically GPT-4, to improve
+adherence to metadata standards. We conducted experiments on 200 random data
+records describing human samples relating to lung cancer from the NCBI
+BioSample repository, evaluating GPT-4's ability to suggest edits for adherence
+to metadata standards. We computed the adherence accuracy of field name-field
+value pairs through a peer review process, and we observed a marginal average
+improvement in adherence to the standard data dictionary from 79% to 80%
+(p<0.5). We then prompted GPT-4 with domain information in the form of the
+textual descriptions of CEDAR templates and recorded a significant improvement
+to 97% from 79% (p<0.01). These results indicate that, while LLMs may not be
+able to correct legacy metadata to ensure satisfactory adherence to standards
+when unaided, they do show promise for use in automated metadata curation when
+integrated with a structured knowledge base.
+
+
+
+
+
+
+ + ♻ ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
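For context on the contrastive alignment objectives and preference-pair data the abstract refers to, the sketch below computes the widely used DPO loss over preference pairs. It is a reference baseline shown for illustration only, not the Anchored Preference Optimization objective proposed in the paper.

```python
# Reference sketch of a standard preference-pair alignment loss (DPO-style).
# NOT the APO loss proposed above. Inputs are summed log-probabilities of the
# chosen / rejected responses under the policy and a frozen reference model.
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logp, policy_rejected_logp,
             ref_chosen_logp, ref_rejected_logp, beta=0.1):
    chosen_reward = beta * (policy_chosen_logp - ref_chosen_logp)
    rejected_reward = beta * (policy_rejected_logp - ref_rejected_logp)
    return -F.logsigmoid(chosen_reward - rejected_reward).mean()

if __name__ == "__main__":
    # Toy log-probabilities for a batch of two preference pairs.
    loss = dpo_loss(torch.tensor([-12.0, -9.5]), torch.tensor([-14.0, -11.0]),
                    torch.tensor([-12.5, -10.0]), torch.tensor([-13.5, -10.5]))
    print(float(loss))
```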
+
+
+
+
+ + ♻ ☆ Text Generation: A Systematic Literature Review of Tasks, Evaluation, + and Challenges + + +
+ Text generation has become more accessible than ever, and the increasing +interest in these systems, especially those using large language models, has +spurred an increasing number of related publications. We provide a systematic +literature review comprising 244 selected papers between 2017 and 2024. This +review categorizes works in text generation into five main tasks: open-ended +text generation, summarization, translation, paraphrasing, and question +answering. For each task, we review their relevant characteristics, sub-tasks, +and specific challenges (e.g., missing datasets for multi-document +summarization, coherence in story generation, and complex reasoning for +question answering). Additionally, we assess current approaches for evaluating +text generation systems and ascertain problems with current metrics. Our +investigation shows nine prominent challenges common to all tasks and sub-tasks +in recent text generation publications: bias, reasoning, hallucinations, +misuse, privacy, interpretability, transparency, datasets, and computing. We +provide a detailed analysis of these challenges, their potential solutions, and +which gaps still require further engagement from the community. This systematic +literature review targets two main audiences: early career researchers in +natural language processing looking for an overview of the field and promising +research directions, as well as experienced researchers seeking a detailed view +of tasks, evaluation methodologies, open challenges, and recent mitigation +strategies. + +
+
+ comment: 35 pages, 2 figures, 2 tables, Under review +
+
+
+
+
+ + ♻ ☆ A global AI community requires language-diverse publishing ICLR + + +
+ In this provocation, we discuss the English dominance of the AI research +community, arguing that the requirement for English language publishing upholds +and reinforces broader regimes of extraction in AI. While large language models +and machine translation have been celebrated as a way to break down barriers, +we regard their use as a symptom of linguistic exclusion of scientists and +potential readers. We propose alternative futures for a healthier publishing +culture, organized around three themes: administering conferences in the +languages of the country in which they are held, instructing peer reviewers not +to adjudicate the language appropriateness of papers, and offering +opportunities to publish and present in multiple languages. We welcome new +translations of this piece. Please contact the authors if you would like to +contribute one. + +
+
+ comment: Translations by Tianyu M. Fang (Mandarin Chinese), Michael Hardy + (Guarani), Vandana Sarin and Vivek Sarin (Hindi), Roshna Omer Abdulrahman + (Soran\^i Kurdish), Gabriel Poesia (Portuguese), and Mat\'ias Grinberg + (Spanish). In the proceedings of the Global AI Cultures Workshop at the + Twelfth International Conference on Learning Representations (ICLR) 2024, + Vienna, Austria, May 7-11, 2024 +
+
+
+
+
+ + ♻ ☆ MelHuBERT: A simplified HuBERT on Mel spectrograms + + +
+ Self-supervised models have had great success in learning speech +representations that can generalize to various downstream tasks. However, most +self-supervised models require a large amount of compute and multiple GPUs to +train, significantly hampering the development of self-supervised learning. In +an attempt to reduce the computation of training, we revisit the training of +HuBERT, a highly successful self-supervised model. We improve and simplify +several key components, including the loss function, input representation, and +training in multiple stages. Our model, MelHuBERT, is able to achieve favorable +performance on phone recognition, speaker identification, and automatic speech +recognition against HuBERT, while saving 31.2% of the pre-training time, or +equivalently 33.5% MACs per one second speech. The code and pre-trained models +are available in https://github.com/nervjack2/MelHuBERT. + +
+
+ comment: ASRU 2023 +
+
+
+
+
+ + ♻ ☆ Are Small Language Models Ready to Compete with Large Language Models + for Practical Applications? + + +
+ The rapid rise of Language Models (LMs) has expanded their use in several
+applications. Yet, due to constraints of model size, associated cost, or
+proprietary restrictions, utilizing state-of-the-art (SOTA) LLMs is not always
+feasible. With open, smaller LMs emerging, more applications can leverage their
+capabilities, but selecting the right LM can be challenging as smaller LMs
+don't perform well universally. This work tries to bridge this gap by proposing
+a framework to experimentally evaluate small, open LMs in practical settings
+through measuring the semantic correctness of outputs across three practical
+aspects: task types, application domains, and reasoning types, using diverse
+prompt styles. It also conducts an in-depth comparison of 10 small, open LMs to
+identify the best LM and prompt style depending on specific application
+requirements using the proposed framework. We also show that if selected
+appropriately, they can outperform SOTA LLMs like DeepSeek-v2, GPT-4o-mini,
+Gemini-1.5-Pro, and even compete with GPT-4o.
+
+
+
+ comment: Submitted to ARR +
+
+
+
+
+ + ♻ ☆ Loop Copilot: Conducting AI Ensembles for Music Generation and Iterative + Editing + + +
+ Creating music is iterative, requiring varied methods at each stage. However, +existing AI music systems fall short in orchestrating multiple subsystems for +diverse needs. To address this gap, we introduce Loop Copilot, a novel system +that enables users to generate and iteratively refine music through an +interactive, multi-round dialogue interface. The system uses a large language +model to interpret user intentions and select appropriate AI models for task +execution. Each backend model is specialized for a specific task, and their +outputs are aggregated to meet the user's requirements. To ensure musical +coherence, essential attributes are maintained in a centralized table. We +evaluate the effectiveness of the proposed system through semi-structured +interviews and questionnaires, highlighting its utility not only in +facilitating music creation but also its potential for broader applications. + +
+
+ comment: Source code and demo video are available at + \url{https://sites.google.com/view/loop-copilot} +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 145 + +
+
+
+ + ☆ 3D Whole-body Grasp Synthesis with Directional Controllability + + +
+ Synthesizing 3D whole-bodies that realistically grasp objects is useful for +animation, mixed reality, and robotics. This is challenging, because the hands +and body need to look natural w.r.t. each other, the grasped object, as well as +the local scene (i.e., a receptacle supporting the object). Only recent work +tackles this, with a divide-and-conquer approach; it first generates a +"guiding" right-hand grasp, and then searches for bodies that match this. +However, the guiding-hand synthesis lacks controllability and receptacle +awareness, so it likely has an implausible direction (i.e., a body can't match +this without penetrating the receptacle) and needs corrections through major +post-processing. Moreover, the body search needs exhaustive sampling and is +expensive. These are strong limitations. We tackle these with a novel method +called CWGrasp. Our key idea is that performing geometry-based reasoning "early +on," instead of "too late," provides rich "control" signals for inference. To +this end, CWGrasp first samples a plausible reaching-direction vector (used +later for both the arm and hand) from a probabilistic model built via +raycasting from the object and collision checking. Then, it generates a +reaching body with a desired arm direction, as well as a "guiding" grasping +hand with a desired palm direction that complies with the arm's one. +Eventually, CWGrasp refines the body to match the "guiding" hand, while +plausibly contacting the scene. Notably, generating already-compatible "parts" +greatly simplifies the "whole." Moreover, CWGrasp uniquely tackles both right- +and left-hand grasps. We evaluate on the GRAB and ReplicaGrasp datasets. +CWGrasp outperforms baselines, at lower runtime and budget, while all +components help performance. Code and models will be released. + +
+
+
+
+
+ + ☆ SAM2Point: Segment Any 3D as Videos in Zero-shot and Promptable Manners + + +
+ We introduce SAM2Point, a preliminary exploration adapting Segment Anything +Model 2 (SAM 2) for zero-shot and promptable 3D segmentation. SAM2Point +interprets any 3D data as a series of multi-directional videos, and leverages +SAM 2 for 3D-space segmentation, without further training or 2D-3D projection. +Our framework supports various prompt types, including 3D points, boxes, and +masks, and can generalize across diverse scenarios, such as 3D objects, indoor +scenes, outdoor environments, and raw sparse LiDAR. Demonstrations on multiple +3D datasets, e.g., Objaverse, S3DIS, ScanNet, Semantic3D, and KITTI, highlight +the robust generalization capabilities of SAM2Point. To our best knowledge, we +present the most faithful implementation of SAM in 3D, which may serve as a +starting point for future research in promptable 3D segmentation. Online Demo: +https://huggingface.co/spaces/ZiyuG/SAM2Point . Code: +https://github.com/ZiyuGuo99/SAM2Point . + +
+
+ comment: Work in progress. Online Demo: + https://huggingface.co/spaces/ZiyuG/SAM2Point . Code: + https://github.com/ZiyuGuo99/SAM2Point +
+
+
+
+
+ + ☆ PromptSmooth: Certifying Robustness of Medical Vision-Language Models + via Prompt Learning MICCAI 2024 + + +
+ Medical vision-language models (Med-VLMs) trained on large datasets of +medical image-text pairs and later fine-tuned for specific tasks have emerged +as a mainstream paradigm in medical image analysis. However, recent studies +have highlighted the susceptibility of these Med-VLMs to adversarial attacks, +raising concerns about their safety and robustness. Randomized smoothing is a +well-known technique for turning any classifier into a model that is +certifiably robust to adversarial perturbations. However, this approach +requires retraining the Med-VLM-based classifier so that it classifies well +under Gaussian noise, which is often infeasible in practice. In this paper, we +propose a novel framework called PromptSmooth to achieve efficient certified +robustness of Med-VLMs by leveraging the concept of prompt learning. Given any +pre-trained Med-VLM, PromptSmooth adapts it to handle Gaussian noise by +learning textual prompts in a zero-shot or few-shot manner, achieving a +delicate balance between accuracy and robustness, while minimizing the +computational overhead. Moreover, PromptSmooth requires only a single model to +handle multiple noise levels, which substantially reduces the computational +cost compared to traditional methods that rely on training a separate model for +each noise level. Comprehensive experiments based on three Med-VLMs and across +six downstream datasets of various imaging modalities demonstrate the efficacy +of PromptSmooth. Our code and models are available at +https://github.com/nhussein/promptsmooth. + +
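For readers unfamiliar with randomized smoothing, the prediction step it relies on can be sketched as a Monte Carlo majority vote under Gaussian noise. The snippet below shows only that generic step for an arbitrary classifier callable; it omits PromptSmooth's prompt learning and the statistical certification procedure, and all names here are assumptions.

import torch

@torch.no_grad()
def smoothed_predict(classifier, x, num_classes, sigma=0.25, n_samples=100):
    """Majority-vote prediction of a randomized-smoothing classifier.
    classifier: callable mapping a (B, C, H, W) batch to (B, num_classes) logits.
    x: a single image tensor of shape (C, H, W)."""
    counts = torch.zeros(num_classes, dtype=torch.long)
    for _ in range(n_samples):
        noisy = x.unsqueeze(0) + sigma * torch.randn_like(x).unsqueeze(0)
        counts[classifier(noisy).argmax(dim=-1).item()] += 1
    return counts.argmax().item()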
+
+ comment: Accepted to MICCAI 2024 +
+
+
+
+
+ + ☆ ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion + Model + + +
+ Advancements in 3D scene reconstruction have transformed 2D images from the +real world into 3D models, producing realistic 3D results from hundreds of +input photos. Despite great success in dense-view reconstruction scenarios, +rendering a detailed scene from insufficient captured views is still an +ill-posed optimization problem, often resulting in artifacts and distortions in +unseen areas. In this paper, we propose ReconX, a novel 3D scene reconstruction +paradigm that reframes the ambiguous reconstruction challenge as a temporal +generation task. The key insight is to unleash the strong generative prior of +large pre-trained video diffusion models for sparse-view reconstruction. +However, 3D view consistency struggles to be accurately preserved in directly +generated video frames from pre-trained models. To address this, given limited +input views, the proposed ReconX first constructs a global point cloud and +encodes it into a contextual space as the 3D structure condition. Guided by the +condition, the video diffusion model then synthesizes video frames that are +both detail-preserved and exhibit a high degree of 3D consistency, ensuring the +coherence of the scene from various perspectives. Finally, we recover the 3D +scene from the generated video through a confidence-aware 3D Gaussian Splatting +optimization scheme. Extensive experiments on various real-world datasets show +the superiority of our ReconX over state-of-the-art methods in terms of quality +and generalizability. + +
+
+ comment: Project page: https://liuff19.github.io/ReconX +
+
+
+
+
+ + ☆ CSGO: Content-Style Composition in Text-to-Image Generation + + +
+ The diffusion model has shown exceptional capabilities in controlled image +generation, which has further fueled interest in image style transfer. Existing +works mainly focus on training free-based methods (e.g., image inversion) due +to the scarcity of specific data. In this study, we present a data construction +pipeline for content-style-stylized image triplets that generates and +automatically cleanses stylized data triplets. Based on this pipeline, we +construct a dataset IMAGStyle, the first large-scale style transfer dataset +containing 210k image triplets, available for the community to explore and +research. Equipped with IMAGStyle, we propose CSGO, a style transfer model +based on end-to-end training, which explicitly decouples content and style +features employing independent feature injection. The unified CSGO implements +image-driven style transfer, text-driven stylized synthesis, and text +editing-driven stylized synthesis. Extensive experiments demonstrate the +effectiveness of our approach in enhancing style control capabilities in image +generation. Additional visualization and access to the source code can be +located on the project page: \url{https://csgo-gen.github.io/}. + +
+
+
+
+
+ + ☆ UV-free Texture Generation with Denoising and Geodesic Heat Diffusions + + +
+ Seams, distortions, wasted UV space, vertex duplication, and varying resolution over the surface are the most prominent issues of the standard UV-based texturing of meshes. These issues are particularly acute when automatic UV-unwrapping techniques are used. For this reason, instead of generating textures in automatically generated UV-planes like most state-of-the-art methods, we propose to represent textures as coloured point clouds whose colours are generated by a denoising diffusion probabilistic model constrained to operate on the surface of 3D objects. Our sampling- and resolution-agnostic generative model relies heavily on heat diffusion over the surface of the meshes for spatial communication between points. To enable processing of arbitrarily sampled point-cloud textures and to ensure long-distance texture consistency, we introduce a fast re-sampling of the mesh spectral properties used during heat diffusion, together with a novel heat-diffusion-based self-attention mechanism. Our code and pre-trained models are available at github.com/simofoti/UV3-TeD.
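Heat diffusion over a surface is usually carried out in the Laplacian eigenbasis. A minimal sketch is shown below, assuming a precomputed orthonormal eigenbasis and ignoring mass-matrix weighting and the paper's fast re-sampling scheme.

import numpy as np

def heat_diffuse(signal, evecs, evals, t):
    """Diffuse a per-point signal over a surface using Laplacian eigenpairs.
    signal: (N, C) values at N surface points.
    evecs:  (N, K) first K Laplacian eigenvectors (assumed orthonormal here).
    evals:  (K,)   corresponding eigenvalues.
    t:      diffusion time (larger t = smoother, longer-range mixing)."""
    coeffs = evecs.T @ signal                      # project into the spectral basis
    coeffs = np.exp(-t * evals)[:, None] * coeffs  # attenuate high frequencies
    return evecs @ coeffs                          # back to the spatial domain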
+
+
+
+
+ + ☆ OmniRe: Omni Urban Scene Reconstruction + + +
+ We introduce OmniRe, a holistic approach for efficiently reconstructing +high-fidelity dynamic urban scenes from on-device logs. Recent methods for +modeling driving sequences using neural radiance fields or Gaussian Splatting +have demonstrated the potential of reconstructing challenging dynamic scenes, +but often overlook pedestrians and other non-vehicle dynamic actors, hindering +a complete pipeline for dynamic urban scene reconstruction. To that end, we +propose a comprehensive 3DGS framework for driving scenes, named OmniRe, that +allows for accurate, full-length reconstruction of diverse dynamic objects in a +driving log. OmniRe builds dynamic neural scene graphs based on Gaussian +representations and constructs multiple local canonical spaces that model +various dynamic actors, including vehicles, pedestrians, and cyclists, among +many others. This capability is unmatched by existing methods. OmniRe allows us +to holistically reconstruct different objects present in the scene, +subsequently enabling the simulation of reconstructed scenarios with all actors +participating in real-time (~60Hz). Extensive evaluations on the Waymo dataset +show that our approach outperforms prior state-of-the-art methods +quantitatively and qualitatively by a large margin. We believe our work fills a +critical gap in driving reconstruction. + +
+
+ comment: See the project page for code, video results and demos: + https://ziyc.github.io/omnire/ +
+
+
+
+
+ + ☆ Dissecting Out-of-Distribution Detection and Open-Set Recognition: A + Critical Analysis of Methods and Benchmarks + + +
+ Detecting test-time distribution shift has emerged as a key capability for +safely deployed machine learning models, with the question being tackled under +various guises in recent years. In this paper, we aim to provide a consolidated +view of the two largest sub-fields within the community: out-of-distribution +(OOD) detection and open-set recognition (OSR). In particular, we aim to +provide rigorous empirical analysis of different methods across settings and +provide actionable takeaways for practitioners and researchers. Concretely, we +make the following contributions: (i) We perform rigorous cross-evaluation +between state-of-the-art methods in the OOD detection and OSR settings and +identify a strong correlation between the performances of methods for them; +(ii) We propose a new, large-scale benchmark setting which we suggest better +disentangles the problem tackled by OOD detection and OSR, re-evaluating +state-of-the-art OOD detection and OSR methods in this setting; (iii) We +surprisingly find that the best performing method on standard benchmarks +(Outlier Exposure) struggles when tested at scale, while scoring rules which +are sensitive to the deep feature magnitude consistently show promise; and (iv) +We conduct empirical analysis to explain these phenomena and highlight +directions for future research. Code: +\url{https://github.com/Visual-AI/Dissect-OOD-OSR} + +
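As context for the comparison between softmax-based scores and magnitude-sensitive scoring rules, the snippet below sketches three standard post-hoc OOD scores computed from classifier logits. These are common baselines in this literature, not necessarily the exact rules evaluated in the paper.

import torch
import torch.nn.functional as F

def ood_scores(logits):
    """Common post-hoc OOD scores (higher = more in-distribution).
    logits: (B, num_classes) tensor from a trained classifier."""
    msp = F.softmax(logits, dim=-1).max(dim=-1).values   # max softmax probability
    max_logit = logits.max(dim=-1).values                # sensitive to logit/feature magnitude
    energy = torch.logsumexp(logits, dim=-1)             # energy score
    return {"msp": msp, "max_logit": max_logit, "energy": energy}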
+
+ comment: Accepted to IJCV, preprint version +
+
+
+
+
+ + ☆ VideoLLM-MoD: Efficient Video-Language Streaming with Mixture-of-Depths + Vision Computation + + +
+ A well-known dilemma in large vision-language models (e.g., GPT-4, LLaVA) is that while increasing the number of vision tokens generally enhances visual understanding, it also significantly raises memory and computational costs, especially in long-term, dense video frame streaming scenarios. Although learnable approaches like Q-Former and Perceiver Resampler have been developed to reduce the vision token burden, they overlook the context causally modeled by LLMs (i.e., the key-value cache), potentially leading to missed visual cues when addressing user queries. In this paper, we introduce a novel approach to reduce vision compute by letting redundant vision tokens "skip layers" rather than decreasing the number of vision tokens. Our method, VideoLLM-MoD, is inspired by mixture-of-depths LLMs and addresses the challenge of numerous vision tokens in long-term or streaming video. Specifically, for each transformer layer, we learn to skip the computation for a high proportion (e.g., 80%) of vision tokens, passing them directly to the next layer. This approach significantly enhances model efficiency, achieving approximately 42% time and 30% memory savings over the entire training run. Moreover, our method reduces the computation in the context while avoiding a reduction in the number of vision tokens, thus preserving or even improving performance compared to the vanilla model. We conduct extensive experiments to demonstrate the effectiveness of VideoLLM-MoD, showing its state-of-the-art results on multiple benchmarks, including narration, forecasting, and summarization tasks on the COIN, Ego4D, and Ego-Exo4D datasets.
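A rough sketch of the mixture-of-depths idea applied to vision tokens is given below: a learned router keeps only a fraction of tokens for the expensive layer computation and passes the rest through unchanged. The class and parameter names are illustrative; the actual method integrates this routing inside an LLM decoder with a key-value cache.

import torch
import torch.nn as nn

class SkipVisionTokens(nn.Module):
    """Mixture-of-depths-style wrapper: process only the top-k scored vision
    tokens in this layer and pass the rest through unchanged (a rough sketch)."""

    def __init__(self, layer, dim, keep_ratio=0.2):
        super().__init__()
        self.layer = layer               # e.g., a transformer block: (B, T, D) -> (B, T, D)
        self.router = nn.Linear(dim, 1)  # learned per-token keep score
        self.keep_ratio = keep_ratio

    def forward(self, vision_tokens):
        b, t, d = vision_tokens.shape
        k = max(1, int(t * self.keep_ratio))
        scores = self.router(vision_tokens).squeeze(-1)                # (B, T)
        topk = scores.topk(k, dim=1).indices                           # tokens to process
        idx = topk.unsqueeze(-1).expand(-1, -1, d).contiguous()
        gathered = torch.gather(vision_tokens, 1, idx)                 # (B, k, D)
        processed = self.layer(gathered)
        # scale by the router probability so the routing decision receives gradient
        gate = torch.sigmoid(torch.gather(scores, 1, topk)).unsqueeze(-1)
        out = vision_tokens.clone()
        out.scatter_(1, idx, gate * processed + (1 - gate) * gathered)
        return out

With keep_ratio=0.2, roughly 80% of the vision tokens skip the layer, matching the proportion quoted in the abstract.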
+
+
+
+
+ + ☆ Prediction-Feedback DETR for Temporal Action Detection + + +
+ Temporal Action Detection (TAD) is fundamental yet challenging for real-world +video applications. Leveraging the unique benefits of transformers, various +DETR-based approaches have been adopted in TAD. However, it has recently been +identified that the attention collapse in self-attention causes the performance +degradation of DETR for TAD. Building upon previous research, this paper newly +addresses the attention collapse problem in cross-attention within DETR-based +TAD methods. Moreover, our findings reveal that cross-attention exhibits +patterns distinct from predictions, indicating a short-cut phenomenon. To +resolve this, we propose a new framework, Prediction-Feedback DETR (Pred-DETR), +which utilizes predictions to restore the collapse and align the cross- and +self-attention with predictions. Specifically, we devise novel +prediction-feedback objectives using guidance from the relations of the +predictions. As a result, Pred-DETR significantly alleviates the collapse and +achieves state-of-the-art performance among DETR-based methods on various +challenging benchmarks including THUMOS14, ActivityNet-v1.3, HACS, and +FineAction. + +
+
+
+
+
+ + ☆ H-SGANet: Hybrid Sparse Graph Attention Network for Deformable Medical + Image Registration + + +
+ The integration of Convolutional Neural Network (ConvNet) and Transformer has +emerged as a strong candidate for image registration, leveraging the strengths +of both models and a large parameter space. However, this hybrid model, +treating brain MRI volumes as grid or sequence structures, faces challenges in +accurately representing anatomical connectivity, diverse brain regions, and +vital connections contributing to the brain's internal architecture. Concerns +also arise regarding the computational expense and GPU memory usage associated +with this model. To tackle these issues, a lightweight hybrid sparse graph +attention network (H-SGANet) has been developed. This network incorporates a +central mechanism, Sparse Graph Attention (SGA), based on a Vision Graph Neural +Network (ViG) with predetermined anatomical connections. The SGA module expands +the model's receptive field and seamlessly integrates into the network. To +further amplify the advantages of the hybrid network, the Separable +Self-Attention (SSA) is employed as an enhanced token mixer, integrated with +depth-wise convolution to constitute SSAFormer. This strategic integration is +designed to more effectively extract long-range dependencies. As a hybrid +ConvNet-ViG-Transformer model, H-SGANet offers threefold benefits for +volumetric medical image registration. It optimizes fixed and moving images +concurrently through a hybrid feature fusion layer and an end-to-end learning +framework. Compared to VoxelMorph, a model with a similar parameter count, +H-SGANet demonstrates significant performance enhancements of 3.5% and 1.5% in +Dice score on the OASIS dataset and LPBA40 dataset, respectively. + +
+
+
+
+
+ + ☆ One-Shot Learning Meets Depth Diffusion in Multi-Object Videos + + +
+ Creating editable videos that depict complex interactions between multiple +objects in various artistic styles has long been a challenging task in +filmmaking. Progress is often hampered by the scarcity of data sets that +contain paired text descriptions and corresponding videos that showcase these +interactions. This paper introduces a novel depth-conditioning approach that +significantly advances this field by enabling the generation of coherent and +diverse videos from just a single text-video pair using a pre-trained +depth-aware Text-to-Image (T2I) model. Our method fine-tunes the pre-trained +model to capture continuous motion by employing custom-designed spatial and +temporal attention mechanisms. During inference, we use the DDIM inversion to +provide structural guidance for video generation. This innovative technique +allows for continuously controllable depth in videos, facilitating the +generation of multiobject interactions while maintaining the concept generation +and compositional strengths of the original T2I model across various artistic +styles, such as photorealism, animation, and impressionism. + +
+
+
+
+
+ + ☆ GradBias: Unveiling Word Influence on Bias in Text-to-Image Generative + Models + + +
+ Recent progress in Text-to-Image (T2I) generative models has enabled high-quality image generation. As performance and accessibility increase, these models are gaining significant traction and popularity: ensuring their fairness and safety is a priority to prevent the dissemination and perpetuation of biases. However, existing studies in bias detection focus on closed sets of predefined biases (e.g., gender, ethnicity). In this paper, we propose a general framework to identify, quantify, and explain biases in an open-set setting, i.e., without requiring a predefined set. This pipeline leverages a Large Language Model (LLM) to propose biases starting from a set of captions. Next, these captions are used by the target generative model to generate a set of images. Finally, Vision Question Answering (VQA) is leveraged for bias evaluation. We show two variations of this framework: OpenBias and GradBias. OpenBias detects and quantifies biases, while GradBias determines the contribution of individual prompt words to biases. OpenBias effectively detects both well-known and novel biases related to people, objects, and animals, and aligns highly with existing closed-set bias detection methods and human judgment. GradBias shows that neutral words can significantly influence biases, and it outperforms several baselines, including state-of-the-art foundation models. Code available here: https://github.com/Moreno98/GradBias.
+
+ comment: Under review. Code: https://github.com/Moreno98/GradBias +
+
+
+
+
+ + ☆ Generic Objects as Pose Probes for Few-Shot View Synthesis + + +
+ Radiance fields including NeRFs and 3D Gaussians demonstrate great potential +in high-fidelity rendering and scene reconstruction, while they require a +substantial number of posed images as inputs. COLMAP is frequently employed for +preprocessing to estimate poses, while it necessitates a large number of +feature matches to operate effectively, and it struggles with scenes +characterized by sparse features, large baselines between images, or a limited +number of input images. We aim to tackle few-view NeRF reconstruction using +only 3 to 6 unposed scene images. Traditional methods often use calibration +boards but they are not common in images. We propose a novel idea of utilizing +everyday objects, commonly found in both images and real life, as "pose +probes". The probe object is automatically segmented by SAM, whose shape is +initialized from a cube. We apply a dual-branch volume rendering optimization +(object NeRF and scene NeRF) to constrain the pose optimization and jointly +refine the geometry. Specifically, object poses of two views are first +estimated by PnP matching in an SDF representation, which serves as initial +poses. PnP matching, requiring only a few features, is suitable for +feature-sparse scenes. Additional views are incrementally incorporated to +refine poses from preceding views. In experiments, PoseProbe achieves +state-of-the-art performance in both pose estimation and novel view synthesis +across multiple datasets. We demonstrate its effectiveness, particularly in +few-view and large-baseline scenes where COLMAP struggles. In ablations, using +different objects in a scene yields comparable performance. + +
+
+
+
+
+ + ☆ PartFormer: Awakening Latent Diverse Representation from Vision + Transformer for Object Re-Identification + + +
+ Extracting robust feature representations is critical for object re-identification, which must accurately identify objects across non-overlapping cameras. Despite its strong representation ability, the Vision Transformer (ViT) tends to overfit to the most distinct regions of the training data, limiting its generalizability and attention to holistic object features. Meanwhile, due to the structural differences between CNNs and ViT, fine-grained strategies that effectively address this issue in CNNs do not transfer successfully to ViT. To address this issue, by observing the latent diverse representation hidden behind the multi-head attention, we present PartFormer, an innovative adaptation of ViT designed to overcome the granularity limitations in object Re-ID tasks. The PartFormer integrates a Head Disentangling Block (HDB) that awakens the diverse representation of multi-head self-attention without the typical loss of feature richness induced by concatenation and FFN layers post-attention. To avoid the homogenization of attention heads and promote robust part-based feature learning, two head diversity constraints are imposed: an attention diversity constraint and a correlation diversity constraint. These constraints enable the model to exploit diverse and discriminative feature representations from different attention heads. Comprehensive experiments on various object Re-ID benchmarks demonstrate the superiority of the PartFormer. Specifically, our framework significantly outperforms the state of the art by 2.4% mAP on the most challenging MSMT17 dataset.
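The abstract names an attention diversity constraint without giving its form. One common way to implement such a constraint, sketched below under that assumption, is to penalize pairwise cosine similarity between per-head features; the paper's exact formulation may differ.

import torch
import torch.nn.functional as F

def attention_diversity_loss(head_feats):
    """Encourage different attention heads to produce dissimilar part features.
    head_feats: (B, H, D) one pooled feature per head.
    Returns the mean absolute off-diagonal cosine similarity (to be minimized)."""
    f = F.normalize(head_feats, dim=-1)             # (B, H, D)
    sim = torch.einsum("bhd,bkd->bhk", f, f)        # (B, H, H) cosine similarities
    h = f.shape[1]
    off_diag = sim - torch.eye(h, device=f.device)  # zero out self-similarity
    return off_diag.abs().sum(dim=(1, 2)).mean() / (h * (h - 1))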
+
+
+
+
+ + ☆ Space3D-Bench: Spatial 3D Question Answering Benchmark + + +
+ Answering questions about the spatial properties of the environment poses +challenges for existing language and vision foundation models due to a lack of +understanding of the 3D world notably in terms of relationships between +objects. To push the field forward, multiple 3D Q&A datasets were proposed +which, overall, provide a variety of questions, but they individually focus on +particular aspects of 3D reasoning or are limited in terms of data modalities. +To address this, we present Space3D-Bench - a collection of 1000 general +spatial questions and answers related to scenes of the Replica dataset which +offers a variety of data modalities: point clouds, posed RGB-D images, +navigation meshes and 3D object detections. To ensure that the questions cover +a wide range of 3D objectives, we propose an indoor spatial questions taxonomy +inspired by geographic information systems and use it to balance the dataset +accordingly. Moreover, we provide an assessment system that grades natural +language responses based on predefined ground-truth answers by leveraging a +Vision Language Model's comprehension of both text and images to compare the +responses with ground-truth textual information or relevant visual data. +Finally, we introduce a baseline called RAG3D-Chat integrating the world +understanding of foundation models with rich context retrieval, achieving an +accuracy of 67% on the proposed dataset. + +
+
+
+
+
+ + ☆ Eigen-Cluster VIS: Improving Weakly-supervised Video Instance + Segmentation by Leveraging Spatio-temporal Consistency + + +
+ The performance of Video Instance Segmentation (VIS) methods has improved +significantly with the advent of transformer networks. However, these networks +often face challenges in training due to the high annotation cost. To address +this, unsupervised and weakly-supervised methods have been developed to reduce +the dependency on annotations. This work introduces a novel weakly-supervised +method called Eigen-cluster VIS that, without requiring any mask annotations, +achieves competitive accuracy compared to other VIS approaches. This method is +based on two key innovations: a Temporal Eigenvalue Loss (TEL) and a clip-level +Quality Cluster Coefficient (QCC). The TEL ensures temporal coherence by +leveraging the eigenvalues of the Laplacian matrix derived from graph adjacency +matrices. By minimizing the mean absolute error (MAE) between the eigenvalues +of adjacent frames, this loss function promotes smooth transitions and stable +segmentation boundaries over time, reducing temporal discontinuities and +improving overall segmentation quality. The QCC employs the K-means method to +ensure the quality of spatio-temporal clusters without relying on ground truth +masks. Using the Davies-Bouldin score, the QCC provides an unsupervised measure +of feature discrimination, allowing the model to self-evaluate and adapt to +varying object distributions, enhancing robustness during the testing phase. +These enhancements are computationally efficient and straightforward, offering +significant performance gains without additional annotated data. The proposed +Eigen-Cluster VIS method is evaluated on the YouTube-VIS 2019/2021 and OVIS +datasets, demonstrating that it effectively narrows the performance gap between +the fully-supervised and weakly-supervised VIS approaches. The code is +available on: https://github.com/farnooshar/EigenClusterVIS + +
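A minimal sketch of the described Temporal Eigenvalue Loss follows, assuming per-frame affinity matrices of equal size and comparing the smallest Laplacian eigenvalues of adjacent frames with an MAE; details such as graph construction and normalization are omitted.

import torch

def temporal_eigenvalue_loss(adj_t, adj_t1, k=16):
    """MAE between the smallest-k Laplacian eigenvalues of adjacent frames.
    adj_t, adj_t1: (N, N) symmetric, non-negative affinity matrices built from
    per-frame features (the same N is assumed for simplicity)."""
    def laplacian_eigs(adj):
        deg = torch.diag(adj.sum(dim=1))
        lap = deg - adj
        return torch.linalg.eigvalsh(lap)[:k]   # ascending eigenvalues, real for symmetric lap
    return (laplacian_eigs(adj_t) - laplacian_eigs(adj_t1)).abs().mean()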
+
+ comment: 12 pages, 6 figures, 5 tables
+
+
+
+
+ + ☆ DriveGenVLM: Real-world Video Generation for Vision Language Model based + Autonomous Driving + + +
+ The advancement of autonomous driving technologies necessitates increasingly +sophisticated methods for understanding and predicting real-world scenarios. +Vision language models (VLMs) are emerging as revolutionary tools with +significant potential to influence autonomous driving. In this paper, we +propose the DriveGenVLM framework to generate driving videos and use VLMs to +understand them. To achieve this, we employ a video generation framework +grounded in denoising diffusion probabilistic models (DDPM) aimed at predicting +real-world video sequences. We then explore the adequacy of our generated +videos for use in VLMs by employing a pre-trained model known as Efficient +In-context Learning on Egocentric Videos (EILEV). The diffusion model is +trained with the Waymo open dataset and evaluated using the Fr\'echet Video +Distance (FVD) score to ensure the quality and realism of the generated videos. +Corresponding narrations are provided by EILEV for these generated videos, +which may be beneficial in the autonomous driving domain. These narrations can +enhance traffic scene understanding, aid in navigation, and improve planning +capabilities. The integration of video generation with VLMs in the DriveGenVLM +framework represents a significant step forward in leveraging advanced AI +models to address complex challenges in autonomous driving. + +
+
+
+
+
+ + ☆ SODAWideNet++: Combining Attention and Convolutions for Salient Object + Detection ICPR 2024 + + +
+ Salient Object Detection (SOD) has traditionally relied on feature refinement modules that utilize the features of an ImageNet pre-trained backbone. However, this approach limits the possibility of pre-training the entire network because of the distinct nature of SOD and image classification. Additionally, the architecture of these backbones, originally built for image classification, is sub-optimal for a dense prediction task like SOD. To address these issues, we propose a novel encoder-decoder-style neural network called SODAWideNet++ that is designed explicitly for SOD. Inspired by the ability of vision transformers to attain a global receptive field from the initial stages, we introduce the Attention Guided Long Range Feature Extraction (AGLRFE) module, which combines large dilated convolutions and self-attention. Specifically, we use attention features to guide long-range information extracted by multiple dilated convolutions, thus taking advantage of the inductive biases of a convolution operation and the input dependency brought by self-attention. In contrast to the current paradigm of ImageNet pre-training, we modify 118K annotated images from the COCO semantic segmentation dataset by binarizing the annotations to pre-train the proposed model end-to-end. Further, we supervise the background predictions along with the foreground to push our model to generate accurate saliency predictions. SODAWideNet++ performs competitively on five different datasets while containing only 35% of the trainable parameters of state-of-the-art models. The code and pre-computed saliency maps are provided at https://github.com/VimsLab/SODAWideNetPlusPlus.
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ☆ 3D Pose-Based Temporal Action Segmentation for Figure Skating: A + Fine-Grained and Jump Procedure-Aware Annotation Approach + + +
+ Understanding human actions from videos is essential in many domains, including sports. In figure skating, technical judgments are made by watching skaters' 3D movements, and part of this judging procedure can be regarded as a Temporal Action Segmentation (TAS) task. TAS tasks in figure skating, which automatically assign temporal semantics to video, are actively researched. However, there is a lack of datasets and effective methods for TAS tasks requiring 3D pose data. In this study, we first created the FS-Jump3D dataset of complex and dynamic figure skating jumps using optical markerless motion capture. We also propose a new fine-grained figure skating jump TAS dataset annotation method with which TAS models can learn jump procedures. The experimental results validate the usefulness of 3D pose features as input and of the fine-grained dataset for TAS models in figure skating. The FS-Jump3D dataset is available at https://github.com/ryota-skating/FS-Jump3D.
+
+ comment: 10 pages, 7th ACM International Workshop on Multimedia Content + Analysis in Sports +
+
+
+
+
+ + ☆ Turbulence Strength $C_n^2$ Estimation from Video using Physics-based + Deep Learning + + +
+ Images captured from a long distance suffer from dynamic image distortion due to turbulent flow of air cells with random temperatures, and thus refractive indices. This phenomenon, known as image dancing, is commonly characterized by its refractive-index structure constant $C_n^2$ as a measure of the turbulence strength. For many applications, such as atmospheric forecast models, long-range and astronomical imaging, aviation safety, and optical communication technology, $C_n^2$ estimation is critical for accurately sensing the turbulent environment. Previous methods for $C_n^2$ estimation include estimation from meteorological data (temperature, relative humidity, wind shear, etc.) for single-point measurements, two-ended path-length measurements from optical scintillometers for path-averaged $C_n^2$, and, more recently, estimation from passive video cameras for low cost and hardware complexity. In this paper, we present a comparative analysis of classical image gradient methods for $C_n^2$ estimation and modern deep learning-based methods leveraging convolutional neural networks. To enable this, we collect a dataset of video captures along with reference scintillometer measurements for ground truth, and we release this unique dataset to the scientific community. We observe that deep learning methods can achieve higher accuracy when trained on similar data, but suffer from generalization errors on other, unseen imagery compared to classical methods. To overcome this trade-off, we present a novel physics-based network architecture that combines learned convolutional layers with a differentiable image gradient method, maintaining high accuracy while being generalizable across image datasets.
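The hybrid architecture is described only at a high level; the toy sketch below illustrates the general pattern of a fixed, differentiable gradient front end (here a Sobel operator, which is an assumption) feeding a small learned CNN that regresses a scalar turbulence strength. All layer sizes are placeholders.

import torch
import torch.nn as nn
import torch.nn.functional as F

class GradientHybridCn2(nn.Module):
    """Toy hybrid estimator: fixed image-gradient features + a small learned CNN."""

    def __init__(self):
        super().__init__()
        sobel_x = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]])
        self.register_buffer("kx", sobel_x.view(1, 1, 3, 3))
        self.register_buffer("ky", sobel_x.t().contiguous().view(1, 1, 3, 3))
        self.cnn = nn.Sequential(
            nn.Conv2d(2, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, 1))

    def forward(self, gray_frames):           # (B, 1, H, W) grayscale frames
        gx = F.conv2d(gray_frames, self.kx, padding=1)   # fixed, differentiable gradients
        gy = F.conv2d(gray_frames, self.ky, padding=1)
        return self.cnn(torch.cat([gx, gy], dim=1)).squeeze(-1)  # predicted strength (arbitrary units)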
+
+ comment: Code Available: https://github.com/Riponcs/Cn2Estimation +
+
+
+
+
+ + ☆ Sparse Signal Reconstruction for Overdispersed Low-photon Count + Biomedical Imaging Using $\ell_p$ Total Variation + + +
+ The negative binomial model, which generalizes the Poisson distribution +model, can be found in applications involving low-photon signal recovery, +including medical imaging. Recent studies have explored several regularization +terms for the negative binomial model, such as the $\ell_p$ quasi-norm with $0 +< p < 1$, $\ell_1$ norm, and the total variation (TV) quasi-seminorm for +promoting sparsity in signal recovery. These penalty terms have been shown to +improve image reconstruction outcomes. In this paper, we investigate the +$\ell_p$ quasi-seminorm, both isotropic and anisotropic $\ell_p$ TV +quasi-seminorms, within the framework of the negative binomial statistical +model. This problem can be formulated as an optimization problem, which we +solve using a gradient-based approach. We present comparisons between the +negative binomial and Poisson statistical models using the $\ell_p$ TV +quasi-seminorm as well as common penalty terms. Our experimental results +highlight the efficacy of the proposed method. + +
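For concreteness, a small sketch of the $\ell_p$ TV quasi-seminorm (raised to the power $p$) of a 2-D image is given below, using forward differences with replicated edges, so the boundary contributes zero gradient. The paper's exact discretization and the negative binomial data-fidelity term are not reproduced here.

import numpy as np

def lp_tv(img, p=0.5, isotropic=True, eps=1e-8):
    """(l_p TV)^p quasi-seminorm of a 2-D image, with 0 < p < 1.
    Forward differences; the last row/column is replicated (zero boundary gradient)."""
    dh = np.diff(img, axis=1, append=img[:, -1:])   # horizontal differences
    dv = np.diff(img, axis=0, append=img[-1:, :])   # vertical differences
    if isotropic:
        return np.sum((dh**2 + dv**2 + eps) ** (p / 2))
    return np.sum(np.abs(dh) ** p + np.abs(dv) ** p)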
+
+ comment: 5 pages, Accepted by the IEEE International Symposium on Biomedical + Imaging (ISBI) +
+
+
+
+
+ + ☆ Towards Infusing Auxiliary Knowledge for Distracted Driver Detection KDD + + +
+ Distracted driving is a leading cause of road accidents globally. Identification of distracted driving involves reliably detecting and classifying various forms of driver distraction (e.g., texting, eating, or using in-car devices) from in-vehicle camera feeds to enhance road safety. This task is challenging due to the need for robust models that can generalize to a diverse set of driver behaviors without requiring extensive annotated datasets. In this paper, we propose KiD3, a novel method for distracted driver detection (DDD) that infuses auxiliary knowledge about semantic relations between entities in a scene and the structural configuration of the driver's pose. Specifically, we construct a unified framework that integrates scene graphs and driver pose information with the visual cues in video frames to create a holistic representation of the driver's actions. Our results indicate that KiD3 achieves a 13.64% accuracy improvement over the vision-only baseline by incorporating such auxiliary knowledge with visual information.
+
+ comment: Accepted at KiL 2024: Workshop on Knowledge-infused Learning + co-located with 30th ACM KDD Conference +
+
+
+
+
+ + ☆ FastForensics: Efficient Two-Stream Design for Real-Time Image + Manipulation Detection BMVC 2024 + + +
+ With the rise in popularity of portable devices, the spread of falsified +media on social platforms has become rampant. This necessitates the timely +identification of authentic content. However, most advanced detection methods +are computationally heavy, hindering their real-time application. In this +paper, we describe an efficient two-stream architecture for real-time image +manipulation detection. Our method consists of two-stream branches targeting +the cognitive and inspective perspectives. In the cognitive branch, we propose +efficient wavelet-guided Transformer blocks to capture the global manipulation +traces related to frequency. This block contains an interactive wavelet-guided +self-attention module that integrates wavelet transformation with efficient +attention design, interacting with the knowledge from the inspective branch. +The inspective branch consists of simple convolutions that capture fine-grained +traces and interact bidirectionally with Transformer blocks to provide mutual +support. Our method is lightweight ($\sim$ 8M) but achieves competitive +performance compared to many other counterparts, demonstrating its efficacy in +image manipulation detection and its potential for portable integration. + +
+
+ comment: BMVC 2024 +
+
+
+
+
+ + ☆ MST-KD: Multiple Specialized Teachers Knowledge Distillation for Fair + Face Recognition ECCV 2024 + + +
+ As in school, a single teacher covering all subjects cannot distill equally robust information to a student. Hence, each subject is taught by a highly specialized teacher. Following a similar philosophy, we propose a multiple specialized teacher framework to distill knowledge to a student network. In our approach, directed at face recognition use cases, we train four teachers, each on one specific ethnicity, leading to four highly specialized and biased teachers. Our strategy learns a projection of these four teachers into a common space and distills that information to a student network. Our results show increased performance and reduced bias in all our experiments. In addition, we further show that having biased/specialized teachers is crucial by showing that our approach achieves better results than when knowledge is distilled from four teachers trained on balanced datasets. Our approach represents a step forward in understanding the importance of ethnicity-specific features.
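A minimal sketch of the projection-and-distillation idea is given below, assuming frozen teacher embeddings that are linearly projected into a common space, a simple mean fusion, and a cosine distillation loss. The class names and the fusion choice are illustrative, not the paper's exact design.

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiTeacherDistiller(nn.Module):
    """Project embeddings from several specialized teachers into one common
    space and distill that fused representation to a student (a rough sketch)."""

    def __init__(self, teacher_dims, common_dim, student_dim):
        super().__init__()
        self.projs = nn.ModuleList([nn.Linear(d, common_dim) for d in teacher_dims])
        self.student_head = nn.Linear(student_dim, common_dim)

    def forward(self, teacher_embs, student_emb):
        # teacher_embs: list of (B, d_i) embeddings, one per specialized teacher
        fused = torch.stack([p(e) for p, e in zip(self.projs, teacher_embs)], dim=0).mean(dim=0)
        target = F.normalize(fused, dim=-1).detach()        # teachers act as frozen targets
        pred = F.normalize(self.student_head(student_emb), dim=-1)
        return 1.0 - (pred * target).sum(dim=-1).mean()     # cosine distillation loss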
+
+ comment: Accepted at ECCV 2024 ABAW +
+
+
+
+
+ + ☆ OP-Align: Object-level and Part-level Alignment for Self-supervised + Category-level Articulated Object Pose Estimation ECCV2024 + + +
+ Category-level articulated object pose estimation focuses on the pose +estimation of unknown articulated objects within known categories. Despite its +significance, this task remains challenging due to the varying shapes and poses +of objects, expensive dataset annotation costs, and complex real-world +environments. In this paper, we propose a novel self-supervised approach that +leverages a single-frame point cloud to solve this task. Our model consistently +generates reconstruction with a canonical pose and joint state for the entire +input object, and it estimates object-level poses that reduce overall pose +variance and part-level poses that align each part of the input with its +corresponding part of the reconstruction. Experimental results demonstrate that +our approach significantly outperforms previous self-supervised methods and is +comparable to the state-of-the-art supervised methods. To assess the +performance of our model in real-world scenarios, we also introduce a new +real-world articulated object benchmark dataset. + +
+
+ comment: to be published in ECCV2024 +
+
+
+
+
+ + ☆ Spurfies: Sparse Surface Reconstruction using Local Geometry Priors + + +
+ We introduce Spurfies, a novel method for sparse-view surface reconstruction +that disentangles appearance and geometry information to utilize local geometry +priors trained on synthetic data. Recent research heavily focuses on 3D +reconstruction using dense multi-view setups, typically requiring hundreds of +images. However, these methods often struggle with few-view scenarios. Existing +sparse-view reconstruction techniques often rely on multi-view stereo networks +that need to learn joint priors for geometry and appearance from a large amount +of data. In contrast, we introduce a neural point representation that +disentangles geometry and appearance to train a local geometry prior using a +subset of the synthetic ShapeNet dataset only. During inference, we utilize +this surface prior as additional constraint for surface and appearance +reconstruction from sparse input views via differentiable volume rendering, +restricting the space of possible solutions. We validate the effectiveness of +our method on the DTU dataset and demonstrate that it outperforms previous +state of the art by 35% in surface quality while achieving competitive novel +view synthesis quality. Moreover, in contrast to previous works, our method can +be applied to larger, unbounded scenes, such as Mip-NeRF 360. + +
+
+ comment: https://geometric-rl.mpi-inf.mpg.de/spurfies/ +
+
+
+
+
+ + ☆ GRPose: Learning Graph Relations for Human Image Generation with Pose + Priors + + +
+ Recent methods using diffusion models have made significant progress in human +image generation with various additional controls such as pose priors. However, +existing approaches still struggle to generate high-quality images with +consistent pose alignment, resulting in unsatisfactory outputs. In this paper, +we propose a framework delving into the graph relations of pose priors to +provide control information for human image generation. The main idea is to +establish a graph topological structure between the pose priors and latent +representation of diffusion models to capture the intrinsic associations +between different pose parts. A Progressive Graph Integrator (PGI) is designed +to learn the spatial relationships of the pose priors with the graph structure, +adopting a hierarchical strategy within an Adapter to gradually propagate +information across different pose parts. A pose perception loss is further +introduced based on a pretrained pose estimation network to minimize the pose +differences. Extensive qualitative and quantitative experiments conducted on +the Human-Art and LAION-Human datasets demonstrate that our model achieves +superior performance, with a 9.98% increase in pose average precision compared +to the latest benchmark model. The code is released on *******. + +
+
+ comment: The code will be released at https://github.com/XiangchenYin/GRPose +
+
+
+
+
+ + ☆ Towards Modality-agnostic Label-efficient Segmentation with + Entropy-Regularized Distribution Alignment + + +
+ Label-efficient segmentation aims to perform effective segmentation on input data using only sparse and limited ground-truth labels for training. This topic is widely studied in 3D point cloud segmentation due to the difficulty of annotating point clouds densely, while it is also essential for cost-effective segmentation on 2D images. Until recently, pseudo-labels have been widely employed to facilitate training with limited ground-truth labels, and promising progress has been witnessed in both 2D and 3D segmentation. However, existing pseudo-labeling approaches can suffer heavily from the noise and variation in unlabeled data, which results in significant discrepancies between generated pseudo-labels and current model predictions during training. Our analysis shows that this can further confuse and destabilize the model learning process, a problem shared by label-efficient learning across both 2D and 3D modalities. To address this issue, we propose a novel learning strategy to regularize the pseudo-labels generated for training, thus effectively narrowing the gaps between pseudo-labels and model predictions. More specifically, our method introduces an Entropy Regularization loss and a Distribution Alignment loss for label-efficient learning, resulting in an ERDA learning strategy. Interestingly, by using the KL distance to formulate the distribution alignment loss, ERDA reduces to a deceptively simple cross-entropy-based loss that optimizes both the pseudo-label generation module and the segmentation model simultaneously. In addition, we innovate in the pseudo-label generation to make ERDA consistently effective across both 2D and 3D data modalities for segmentation. Enjoying simplicity and more modality-agnostic pseudo-label generation, our method has shown outstanding performance in fully utilizing all unlabeled data points for training across ...
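The claimed reduction is easy to verify: for a pseudo-label distribution p and model prediction q, H(p) + KL(p || q) = -sum(p * log q), i.e., a soft-target cross-entropy. The sketch below states just that identity in code; how p is produced and scheduled in the actual method is not shown.

import torch
import torch.nn.functional as F

def erda_loss(pseudo_logits, model_logits):
    """Entropy regularization + KL alignment between the pseudo-label
    distribution p and the model prediction q. With this KL form,
    H(p) + KL(p || q) collapses to a cross-entropy with soft targets."""
    p = F.softmax(pseudo_logits, dim=-1)
    log_q = F.log_softmax(model_logits, dim=-1)
    # gradients flow to both the pseudo-label generator (via p) and the model (via q)
    return -(p * log_q).sum(dim=-1).mean()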
+
+ comment: Extended version of arXiv:2305.15832; Code at + https://github.com/LiyaoTang/ERDA +
+
+
+
+
+ + ☆ Alignment is All You Need: A Training-free Augmentation Strategy for + Pose-guided Video Generation ICML 2024 + + +
+ Character animation is a transformative field in computer graphics and +vision, enabling dynamic and realistic video animations from static images. +Despite advancements, maintaining appearance consistency in animations remains +a challenge. Our approach addresses this by introducing a training-free +framework that ensures the generated video sequence preserves the reference +image's subtleties, such as physique and proportions, through a dual alignment +strategy. We decouple skeletal and motion priors from pose information, +enabling precise control over animation generation. Our method also improves +pixel-level alignment for conditional control from the reference character, +enhancing the temporal consistency and visual cohesion of animations. Our +method significantly enhances the quality of video generation without the need +for large datasets or expensive computational resources. + +
+
+ comment: CVG@ICML 2024 +
+
+
+
+
+ + ☆ A Simple and Generalist Approach for Panoptic Segmentation + + +
+ Generalist vision models aim for one and the same architecture for a variety of vision tasks. While such a shared architecture may seem attractive, generalist models tend to be outperformed by their bespoke counterparts, especially in the case of panoptic segmentation. We address this problem by introducing two key contributions, without compromising the desirable properties of generalist models. These contributions are: (i) a positional-embedding (PE) based loss for improved centroid regression; (ii) Edge Distance Sampling (EDS) for the better separation of instance boundaries. The PE-based loss facilitates a better per-pixel regression of the associated instance's centroid, whereas EDS contributes by carefully handling the void regions (caused by missing labels) and smaller instances. These two simple yet effective modifications significantly improve established baselines, while achieving state-of-the-art results among all generalist solutions. More specifically, our method achieves a panoptic quality (PQ) of 52.5 on the COCO dataset, which is an improvement of 10 points over the best model with a similar approach (Painter), and is 2 points higher than the best-performing diffusion-based method, Pix2Seq-$\mathcal{D}$. Furthermore, we provide insights into and an in-depth analysis of our contributions through exhaustive experiments. Our source code and model weights will be made publicly available.
+
+
+
+
+ + ☆ Locally Grouped and Scale-Guided Attention for Dense Pest Counting + + +
+ This study introduces a new dense pest counting problem to predict densely +distributed pests captured by digital traps. Unlike traditional detection-based +counting models for sparsely distributed objects, trap-based pest counting must +deal with dense pest distributions that pose challenges such as severe +occlusion, wide pose variation, and similar appearances in colors and textures. +To address these problems, it is essential to incorporate the local attention +mechanism, which identifies locally important and unimportant areas to learn +locally grouped features, thereby enhancing discriminative performance. +Accordingly, this study presents a novel design that integrates locally grouped +and scale-guided attention into a multiscale CenterNet framework. To group +local features with similar attributes, a straightforward method is introduced +using the heatmap predicted by the first hourglass containing pest centroid +information, which eliminates the need for complex clustering models. To +enhance attentiveness, the pixel attention module transforms the heatmap into a +learnable map. Subsequently, scale-guided attention is deployed to make the +object and background features more discriminative, achieving multiscale +feature fusion. Through experiments, the proposed model is verified to enhance +object features based on local grouping and discriminative feature attention +learning. Additionally, the proposed model is highly effective in overcoming +occlusion and pose variation problems, making it more suitable for dense pest +counting. In particular, the proposed model outperforms state-of-the-art models +by a large margin, with a remarkable contribution to dense pest counting. + +
+
+
+
+
+ + ☆ UAV-Based Human Body Detector Selection and Fusion for Geolocated + Saliency Map Generation + + +
+ The problem of reliably detecting and geolocating objects of different +classes in soft real-time is essential in many application areas, such as +Search and Rescue performed using Unmanned Aerial Vehicles (UAVs). This +research addresses the complementary problems of system contextual vision-based +detector selection, allocation, and execution, in addition to the fusion of +detection results from teams of UAVs for the purpose of accurately and reliably +geolocating objects of interest in a timely manner. In an offline step, an +application-independent evaluation of vision-based detectors from a system +perspective is first performed. Based on this evaluation, the most appropriate +algorithms for online object detection for each platform are selected +automatically before a mission, taking into account a number of practical +system considerations, such as the available communication links, video +compression used, and the available computational resources. The detection +results are fused using a method for building maps of salient locations which +takes advantage of a novel sensor model for vision-based detections for both +positive and negative observations. A number of simulated and real flight +experiments are also presented, validating the proposed method. + +
+
+ comment: 42 pages, 19 figures +
+
+
+
+
+ + ☆ CogVLM2: Visual Language Models for Image and Video Understanding + + +
+ Beginning with VisualGLM and CogVLM, we are continuously exploring VLMs in +pursuit of enhanced vision-language fusion, efficient higher-resolution +architecture, and broader modalities and applications. Here we propose the +CogVLM2 family, a new generation of visual language models for image and video +understanding including CogVLM2, CogVLM2-Video and GLM-4V. As an image +understanding model, CogVLM2 inherits the visual expert architecture with +improved training recipes in both pre-training and post-training stages, +supporting input resolution up to $1344 \times 1344$ pixels. As a video +understanding model, CogVLM2-Video integrates multi-frame input with timestamps +and proposes automated temporal grounding data construction. Notably, CogVLM2 +family has achieved state-of-the-art results on benchmarks like MMBench, +MM-Vet, TextVQA, MVBench and VCGBench. All models are open-sourced in +https://github.com/THUDM/CogVLM2 and https://github.com/THUDM/GLM-4, +contributing to the advancement of the field. + +
+
+
+
+
+ + ☆ Adapting Vision-Language Models to Open Classes via Test-Time Prompt + Tuning + + +
+ Adapting pre-trained models to open classes is a challenging problem in machine learning. Vision-language models fully explore the knowledge of the text modality, demonstrating strong zero-shot recognition performance, which is naturally suited to various open-set problems. More recently, some research has focused on fine-tuning such models for downstream tasks. Prompt tuning methods achieved huge improvements by learning context vectors on few-shot data. However, when evaluating under an open-set adaptation setting where the test data include new classes, we find a dilemma: learned prompts have worse generalization abilities than hand-crafted prompts. In this paper, we consider combining the advantages of both and propose a test-time prompt tuning approach, which leverages the maximum concept matching (MCM) scores as dynamic weights to generate an input-conditioned prompt for each image at test time. Through extensive experiments on 11 different datasets, we show that our proposed method outperforms all comparison methods on average, considering both base and new classes. The code is available at https://github.com/gaozhengqing/TTPT
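The abstract describes MCM scores being used as dynamic weights. The sketch below shows one plausible reading, where the maximum softmax-normalized image-text similarity gates a blend of logits from hand-crafted and learned prompts; the function names and the exact blending rule are assumptions.

import torch
import torch.nn.functional as F

def mcm_score(image_feat, text_feats, tau=0.01):
    """Maximum concept matching score: the highest softmax-normalized cosine
    similarity between an image feature (B, D) and class text features (C, D)."""
    sims = F.normalize(image_feat, dim=-1) @ F.normalize(text_feats, dim=-1).T
    return F.softmax(sims / tau, dim=-1).max(dim=-1).values   # (B,)

def combine_logits(image_feat, handcrafted_txt, learned_txt, tau=0.01):
    """Blend logits from hand-crafted and learned prompts, weighting the learned
    branch by how confidently the image matches known concepts (a rough sketch)."""
    w = mcm_score(image_feat, handcrafted_txt, tau).unsqueeze(-1)   # (B, 1)
    logits_hc = F.normalize(image_feat, dim=-1) @ F.normalize(handcrafted_txt, dim=-1).T
    logits_lr = F.normalize(image_feat, dim=-1) @ F.normalize(learned_txt, dim=-1).T
    return w * logits_lr + (1 - w) * logits_hc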
+
+ comment: PRCV 2024 +
+
+
+
+
+ ☆ A Deep-Learning-Based Label-free No-Reference Image Quality Assessment Metric: Application in Sodium MRI Denoising
+ New multinuclear MRI techniques, such as sodium MRI, generally suffer from low image quality due to an inherently low signal. Postprocessing methods, such as image denoising, have been developed for image enhancement. However, assessing these enhanced images is challenging, especially when high-resolution, high-signal reference images are lacking, as in sodium MRI. No-reference Image Quality Assessment (NR-IQA) metrics are approaches to solve this problem. Existing learning-based NR-IQA metrics rely on labels derived from subjective human opinions or metrics like the Signal-to-Noise Ratio (SNR), which are either time-consuming or lack accurate ground truths, resulting in unreliable assessment. We note that deep learning (DL) models have a unique characteristic in that they are specialized to a characteristic training set, meaning that deviations of the input test data from the training data will reduce prediction accuracy. Therefore, we propose a novel DL-based NR-IQA metric, the Model Specialization Metric (MSM), which does not depend on ground-truth images or labels. MSM measures the difference between the input image and the model's prediction to evaluate the quality of the input image. Experiments conducted on both simulated distorted proton T1-weighted MR images and denoised sodium MR images demonstrate that MSM exhibits superior evaluation performance on various simulated noises and distortions. MSM also shows substantial agreement with expert evaluations, achieving an average Cohen's Kappa coefficient of 0.6528, outperforming existing NR-IQA metrics.
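At its simplest, the described metric can be read as "how far the input deviates from what a model specialized to high-quality data predicts from it". A minimal sketch under that reading follows, using MSE as the discrepancy; the paper's exact discrepancy measure and sign convention may differ.

import torch

@torch.no_grad()
def model_specialization_metric(model, image):
    """Score image quality by the discrepancy between the input and the output
    of a model specialized to high-quality training data (lower = closer to the
    training distribution, under the convention sketched here)."""
    prediction = model(image.unsqueeze(0)).squeeze(0)   # e.g., a reconstruction/denoising model
    return torch.mean((image - prediction) ** 2).item()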
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ MICDrop: Masking Image and Depth Features via Complementary Dropout for + Domain-Adaptive Semantic Segmentation + + +
+ Unsupervised Domain Adaptation (UDA) is the task of bridging the domain gap +between a labeled source domain, e.g., synthetic data, and an unlabeled target +domain. We observe that current UDA methods show inferior results on fine +structures and tend to oversegment objects with ambiguous appearance. To +address these shortcomings, we propose to leverage geometric information, i.e., +depth predictions, as depth discontinuities often coincide with segmentation +boundaries. We show that naively incorporating depth into current UDA methods +does not fully exploit the potential of this complementary information. To this +end, we present MICDrop, which learns a joint feature representation by masking +image encoder features while inversely masking depth encoder features. With +this simple yet effective complementary masking strategy, we enforce the use of +both modalities when learning the joint feature representation. To aid this +process, we propose a feature fusion module to improve both global as well as +local information sharing while being robust to errors in the depth +predictions. We show that our method can be plugged into various recent UDA +methods and consistently improve results across standard UDA benchmarks, +obtaining new state-of-the-art performances. + +
+
+
+
+
+ + ☆ Creating a Segmented Pointcloud of Grapevines by Combining Multiple + Viewpoints Through Visual Odometry + + +
+ Grapevine winter pruning is a labor-intensive and repetitive process that significantly influences the quality and quantity of the grape harvest and the wine produced in the following season. It requires careful and expert detection of the points to be cut. Because of its complexity, repetitive nature, and time constraints, the task requires skilled labor that needs to be trained. This extended abstract presents the computer vision pipeline employed in project Vinum, using detectron2 as a segmentation network and keypoint visual odometry to merge different observations into a single pointcloud used to make informed pruning decisions.
+
+
+
+
+ + ☆ Improving 3D deep learning segmentation with biophysically motivated + cell synthesis + + +
+ Biomedical research increasingly relies on 3D cell culture models, and AI-based analysis can potentially facilitate detailed and accurate feature extraction at the single-cell level. However, this requires precise segmentation of 3D cell datasets, which in turn demands high-quality ground truth for training. Manual annotation, the gold standard for ground-truth data, is too time-consuming and thus not feasible for generating large 3D training datasets. To address this, we present a novel framework for generating 3D training data that integrates biophysical modeling for realistic cell shapes and alignment. Our approach allows the in silico generation of coherent membrane and nuclei signals, which enable the training of segmentation models utilizing both channels for improved performance. Furthermore, we present a new GAN training scheme that generates not only image data but also matching labels. Quantitative evaluation shows superior performance of biophysically motivated synthetic training data, even outperforming manual annotation and pretrained models. This underscores the potential of incorporating biophysical modeling to enhance the quality of synthetic training data.
+
+
+
+
+ + ☆ Multi-source Domain Adaptation for Panoramic Semantic Segmentation + + +
+ Panoramic semantic segmentation has received widespread attention recently due to its comprehensive 360° field of view. However, labeling such images demands greater resources compared to pinhole images. As a result, many unsupervised domain adaptation methods for panoramic semantic segmentation have emerged, utilizing real pinhole images or low-cost synthetic panoramic images. However, the segmentation model lacks understanding of the panoramic structure when only utilizing real pinhole images, and it lacks perception of real-world scenes when only adopting synthetic panoramic images. Therefore, in this paper, we propose a new task of multi-source domain adaptation for panoramic semantic segmentation, aiming to utilize both real pinhole and synthetic panoramic images in the source domains, enabling the segmentation model to perform well on unlabeled real panoramic images in the target domain. Further, we propose the Deformation Transform Aligner for Panoramic Semantic Segmentation (DTA4PASS), which converts all pinhole images in the source domains into panoramic-like images and then aligns the converted source domains with the target domain. Specifically, DTA4PASS consists of two main components: Unpaired Semantic Morphing (USM) and Distortion Gating Alignment (DGA). First, in USM, the Semantic Dual-view Discriminator (SDD) assists in training the diffeomorphic deformation network, enabling the effective transformation of pinhole images without paired panoramic views. Second, DGA assigns pinhole-like and panoramic-like features to each image by gating and aligns these two features through uncertainty estimation. DTA4PASS outperforms the previous state-of-the-art methods by 1.92% and 2.19% on the outdoor and indoor multi-source domain adaptation scenarios, respectively. The source code will be released.
+
+ comment: 9 pages, 7 figures, 5 tables +
+
+
+
+
+ + ☆ Spiking Diffusion Models + + +
+ Recent years have witnessed Spiking Neural Networks (SNNs) gaining attention +for their ultra-low energy consumption and high biological plausibility +compared with traditional Artificial Neural Networks (ANNs). Despite their +distinguished properties, the application of SNNs in the computationally +intensive field of image generation is still under exploration. In this paper, +we propose the Spiking Diffusion Models (SDMs), an innovative family of +SNN-based generative models that excel in producing high-quality samples with +significantly reduced energy consumption. In particular, we propose a +Temporal-wise Spiking Mechanism (TSM) that allows SNNs to capture more temporal +features from a bio-plasticity perspective. In addition, we propose a +threshold-guided strategy that can further improve the performances by up to +16.7% without any additional training. We also make the first attempt to use +the ANN-SNN approach for SNN-based generation tasks. Extensive experimental +results reveal that our approach not only exhibits comparable performance to +its ANN counterpart with few spiking time steps, but also outperforms previous +SNN-based generative models by a large margin. Moreover, we also demonstrate +the high-quality generation ability of SDM on large-scale datasets, e.g., LSUN +bedroom. This development marks a pivotal advancement in the capabilities of +SNN-based generation, paving the way for future research avenues to realize +low-energy and low-latency generative applications. Our code is available at +https://github.com/AndyCao1125/SDM. + +
+
+ comment: Accepted by IEEE Transactions on Artificial Intelligence +
+
+
+
+
+ + ☆ Weakly Supervised Object Detection for Automatic Tooth-marked Tongue + Recognition + + +
+ Tongue diagnosis in Traditional Chinese Medicine (TCM) is a crucial +diagnostic method that can reflect an individual's health status. Traditional +methods for identifying tooth-marked tongues are subjective and inconsistent +because they rely on practitioner experience. We propose a novel, fully +automated Weakly Supervised method using a Vision transformer and Multiple +instance learning (WSVM) for tongue extraction and tooth-marked tongue +recognition. Our approach first accurately detects and extracts the tongue +region from clinical images, removing any irrelevant background information. +Then, we implement an end-to-end weakly supervised object detection method. We +utilize a Vision Transformer (ViT) to process tongue images in patches and employ +a multiple instance loss to identify tooth-marked regions with only image-level +annotations. WSVM achieves high accuracy in tooth-marked tongue classification, +and visualization experiments demonstrate its effectiveness in pinpointing +these regions. This automated approach enhances the objectivity and accuracy of +tooth-marked tongue diagnosis. It provides significant clinical value by +assisting TCM practitioners in making precise diagnoses and treatment +recommendations. Code is available at https://github.com/yc-zh/WSVM. + +
+
+
+
+
+ + ☆ What to Preserve and What to Transfer: Faithful, Identity-Preserving + Diffusion-based Hairstyle Transfer + + +
+ Hairstyle transfer is a challenging task in the image editing field that +modifies the hairstyle of a given face image while preserving its other +appearance and background features. The existing hairstyle transfer approaches +heavily rely on StyleGAN, which is pre-trained on cropped and aligned face +images. Hence, they struggle to generalize under challenging conditions such as +extreme variations of head poses or focal lengths. To address this issue, we +propose a one-stage hairstyle transfer diffusion model, HairFusion, that +applies to real-world scenarios. Specifically, we carefully design a +hair-agnostic representation as the input of the model, where the original hair +information is thoroughly eliminated. Next, we introduce a hair align +cross-attention (Align-CA) to accurately align the reference hairstyle with the +face image while considering the difference in their face shape. To enhance the +preservation of the face image's original features, we leverage adaptive hair +blending during the inference, where the output's hair regions are estimated by +the cross-attention map in Align-CA and blended with non-hair areas of the face +image. Our experimental results show that our method achieves state-of-the-art +performance compared to the existing methods in preserving the integrity of +both the transferred hairstyle and the surrounding features. The codes are +available at https://github.com/cychungg/HairFusion. + +
+
+
+
+
+ + ☆ Enhancing Sound Source Localization via False Negative Elimination + + +
+ Sound source localization aims to localize objects emitting sound in +visual scenes. Recent works obtaining impressive results typically rely on +contrastive learning. However, the common practice of randomly sampling +negatives in prior art can lead to the false negative issue, where sounds +semantically similar to the visual instance are sampled as negatives and +incorrectly pushed away from the visual anchor/query. As a result, this +misalignment of audio and visual features could yield inferior performance. To +address this issue, we propose a novel audio-visual learning framework which is +instantiated with two individual learning schemes: self-supervised predictive +learning (SSPL) and semantic-aware contrastive learning (SACL). SSPL explores +image-audio positive pairs alone to discover semantically coherent similarities +between audio and visual features, while a predictive coding module for feature +alignment is introduced to facilitate the positive-only learning. In this +regard, SSPL acts as a negative-free method to eliminate false negatives. By +contrast, SACL is designed to compact visual features and remove false +negatives, providing reliable visual anchor and audio negatives for contrast. +Different from SSPL, SACL releases the potential of audio-visual contrastive +learning, offering an effective alternative to achieve the same goal. +Comprehensive experiments demonstrate the superiority of our approach over the +state-of-the-art methods. Furthermore, we highlight the versatility of the learned +representation by extending the approach to audio-visual event classification +and object detection tasks. Code and models are available at: +https://github.com/zjsong/SACL. + +
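To make the false-negative issue concrete, here is a minimal, hypothetical sketch of a contrastive loss that simply drops negatives whose audio embeddings are too close to the anchor's positive audio; the threshold, shapes, and masking rule are assumptions for illustration and do not reproduce the paper's SSPL/SACL objectives.

```python
import torch
import torch.nn.functional as F

def masked_infonce(visual, audio, tau=0.07, fn_thresh=0.8):
    """Toy InfoNCE that removes likely false negatives from the denominator.

    Not the paper's SACL objective: a negative is dropped when its audio
    embedding is very similar to the positive audio, which is one crude way to
    avoid pushing away semantically matching sounds. `visual` and `audio` are
    (N, D) L2-normalised embeddings; `fn_thresh` is an assumed cutoff.
    """
    logits = visual @ audio.t() / tau                    # (N, N) cross-modal similarities
    audio_sim = audio @ audio.t()                        # audio-audio semantic similarity
    fn_mask = (audio_sim > fn_thresh) & ~torch.eye(len(visual), dtype=torch.bool)
    logits = logits.masked_fill(fn_mask, float("-inf"))  # drop suspected false negatives
    labels = torch.arange(len(visual))
    return F.cross_entropy(logits, labels)

# usage with random, normalised embeddings
v = F.normalize(torch.randn(8, 128), dim=-1)
a = F.normalize(torch.randn(8, 128), dim=-1)
loss = masked_infonce(v, a)
```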
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2203.13412 +
+
+
+
+
+ + ☆ Mismatched: Evaluating the Limits of Image Matching Approaches and + Benchmarks + + +
+ Three-dimensional (3D) reconstruction from two-dimensional images is an +active research field in computer vision, with applications ranging from +navigation and object tracking to segmentation and three-dimensional modeling. +Traditionally, parametric techniques have been employed for this task. However, +recent advancements have seen a shift towards learning-based methods. Given the +rapid pace of research and the frequent introduction of new image matching +methods, it is essential to evaluate them. In this paper, we present a +comprehensive evaluation of various image matching methods using a +structure-from-motion pipeline. We assess the performance of these methods on +both in-domain and out-of-domain datasets, identifying key limitations in both +the methods and benchmarks. We also investigate the impact of edge detection as +a pre-processing step. Our analysis reveals that image matching for 3D +reconstruction remains an open challenge, necessitating careful selection and +tuning of models for specific scenarios, while also highlighting mismatches in +how metrics currently represent method performance. + +
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ Integrating Features for Recognizing Human Activities through Optimized + Parameters in Graph Convolutional Networks and Transformer Architectures + + +
+ Human activity recognition is a major field of study that employs computer +vision, machine vision, and deep learning techniques to categorize human +actions. The field of deep learning has made significant progress, with +architectures that are extremely effective at capturing human dynamics. This +study emphasizes the influence of feature fusion on the accuracy of activity +recognition. This technique addresses the limitation of conventional models, +which face difficulties in identifying activities because of their limited +capacity to understand spatial and temporal features. The technique employs +sensory data obtained from four publicly available datasets: HuGaDB, PKU-MMD, +LARa, and TUG. The accuracy and F1-score of two deep learning models, +specifically a Transformer model and a Parameter-Optimized Graph Convolutional +Network (PO-GCN), were evaluated using these datasets. The feature fusion +technique integrated the final layer features from both models and inputted +them into a classifier. Empirical evidence demonstrates that PO-GCN outperforms +standard models in activity recognition. HuGaDB demonstrated a 2.3% improvement +in accuracy and a 2.2% increase in F1-score. TUG showed a 5% increase in +accuracy and a 0.5% rise in F1-score. On the other hand, LARa and PKU-MMD +achieved lower accuracies of 64% and 69% respectively. This indicates that the +integration of features enhanced the performance of both the Transformer model +and PO-GCN. + +
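The late-fusion step described above, concatenating the final-layer features of the two models before a classifier, can be sketched as follows; the feature dimensions and class count are placeholders, not values from the study.

```python
import torch
import torch.nn as nn

class LateFusionClassifier(nn.Module):
    """Minimal late-fusion head. It assumes each backbone (here a Transformer
    and a PO-GCN) already yields a fixed-size feature vector; the dimensions
    below are invented for illustration."""
    def __init__(self, dim_transformer=256, dim_gcn=128, num_classes=10):
        super().__init__()
        self.head = nn.Linear(dim_transformer + dim_gcn, num_classes)

    def forward(self, feat_transformer, feat_gcn):
        fused = torch.cat([feat_transformer, feat_gcn], dim=-1)  # simple concatenation
        return self.head(fused)

# usage with random stand-in features
logits = LateFusionClassifier()(torch.randn(4, 256), torch.randn(4, 128))
```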
+
+ comment: 6 pages, 1 figure, conference +
+
+
+
+
+ + ☆ Discriminative Spatial-Semantic VOS Solution: 1st Place Solution for 6th + LSVOS + + +
+ Video object segmentation (VOS) is a crucial task in computer vision, but +current VOS methods struggle with complex scenes and prolonged object motions. +To address these challenges, the MOSE dataset aims to enhance object +recognition and differentiation in complex environments, while the LVOS dataset +focuses on segmenting objects exhibiting long-term, intricate movements. This +report introduces a discriminative spatial-temporal VOS model that utilizes +discriminative object features as query representations. The semantic +understanding of spatial-semantic modules enables it to recognize object parts, +while salient features highlight more distinctive object characteristics. Our +model, trained on extensive VOS datasets, achieved first place +(\textbf{80.90\%} $\mathcal{J \& F}$) on the test set of the 6th LSVOS +challenge in the VOS Track, demonstrating its effectiveness in tackling the +aforementioned challenges. The code will be available at +\href{https://github.com/yahooo-m/VOS-Solution}{code}. + +
+
+ comment: 1st Place Solution for 6th LSVOS VOS Track. arXiv admin note: + substantial text overlap with arXiv:2406.04600 +
+
+
+
+
+ + ☆ COIN: Control-Inpainting Diffusion Prior for Human and Camera Motion + Estimation ECCV 2024 + + +
+ Estimating global human motion from moving cameras is challenging due to the +entanglement of human and camera motions. To mitigate the ambiguity, existing +methods leverage learned human motion priors, which however often result in +oversmoothed motions with misaligned 2D projections. To tackle this problem, we +propose COIN, a control-inpainting motion diffusion prior that enables +fine-grained control to disentangle human and camera motions. Although +pre-trained motion diffusion models encode rich motion priors, we find it +non-trivial to leverage such knowledge to guide global motion estimation from +RGB videos. COIN introduces a novel control-inpainting score distillation +sampling method to ensure well-aligned, consistent, and high-quality motion +from the diffusion prior within a joint optimization framework. Furthermore, we +introduce a new human-scene relation loss to alleviate the scale ambiguity by +enforcing consistency among the humans, camera, and scene. Experiments on three +challenging benchmarks demonstrate the effectiveness of COIN, which outperforms +the state-of-the-art methods in terms of global human motion estimation and +camera motion estimation. As an illustrative example, COIN outperforms the +state-of-the-art method by 33% in world joint position error (W-MPJPE) on the +RICH dataset. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Text-Enhanced Zero-Shot Action Recognition: A training-free approach ICPR 2024 + + +
+ Vision-language models (VLMs) have demonstrated remarkable performance across +various visual tasks, leveraging joint learning of visual and textual +representations. While these models excel in zero-shot image tasks, their +application to zero-shot video action recognition (ZSVAR) remains challenging +due to the dynamic and temporal nature of actions. Existing methods for ZS-VAR +typically require extensive training on specific datasets, which can be +resource-intensive and may introduce domain biases. In this work, we propose +Text-Enhanced Action Recognition (TEAR), a simple approach to ZS-VAR that is +training-free and does not require the availability of training data or +extensive computational resources. Drawing inspiration from recent findings in +vision and language literature, we utilize action descriptors for decomposition +and contextual information to enhance zero-shot action recognition. Through +experiments on UCF101, HMDB51, and Kinetics-600 datasets, we showcase the +effectiveness and applicability of our proposed approach in addressing the +challenges of ZS-VAR. + +
+
+ comment: accepted to ICPR 2024 +
+
+
+
+
+ + ☆ IBO: Inpainting-Based Occlusion to Enhance Explainable Artificial + Intelligence Evaluation in Histopathology + + +
+ Histopathological image analysis is crucial for accurate cancer diagnosis and +treatment planning. While deep learning models, especially convolutional neural +networks, have advanced this field, their "black-box" nature raises concerns +about interpretability and trustworthiness. Explainable Artificial Intelligence +(XAI) techniques aim to address these concerns, but evaluating their +effectiveness remains challenging. A significant issue with current +occlusion-based XAI methods is that they often generate Out-of-Distribution +(OoD) samples, leading to inaccurate evaluations. In this paper, we introduce +Inpainting-Based Occlusion (IBO), a novel occlusion strategy that utilizes a +Denoising Diffusion Probabilistic Model to inpaint occluded regions in +histopathological images. By replacing cancerous areas with realistic, +non-cancerous tissue, IBO minimizes OoD artifacts and preserves data integrity. +We evaluate our method on the CAMELYON16 dataset through two phases: first, by +assessing perceptual similarity using the Learned Perceptual Image Patch +Similarity (LPIPS) metric, and second, by quantifying the impact on model +predictions through Area Under the Curve (AUC) analysis. Our results +demonstrate that IBO significantly improves perceptual fidelity, achieving +nearly twice the improvement in LPIPS scores compared to the best existing +occlusion strategy. Additionally, IBO increased the precision of XAI +performance prediction from 42% to 71% compared to traditional methods. These +results demonstrate IBO's potential to provide more reliable evaluations of XAI +techniques, benefiting histopathology and other applications. The source code +for this study is available at https://github.com/a-fsh-r/IBO. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Exploiting temporal information to detect conversational groups in + videos and predict the next speaker + + +
+ Studies in human-human interaction have introduced the concept of F-formation +to describe the spatial arrangement of participants during social interactions. +This paper has two objectives. It aims at detecting F-formations in video +sequences and predicting the next speaker in a group conversation. The proposed +approach exploits time information and human multimodal signals in video +sequences. In particular, we rely on measuring the engagement level of people +as a feature of group belonging. Our approach makes use of a recurrent neural +network, the Long Short-Term Memory (LSTM), to predict who will take the +speaker's turn in a conversation group. Experiments on the MatchNMingle dataset +led to 85% true positives in group detection and 98% accuracy in predicting the +next speaker. + +
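A minimal sketch of the kind of LSTM classifier described here, mapping a window of per-participant engagement features to next-speaker logits, might look like the following; the feature dimension, window length, and maximum group size are assumptions rather than values from the paper.

```python
import torch
import torch.nn as nn

class NextSpeakerLSTM(nn.Module):
    """Sketch: an LSTM summarises a window of engagement features and a linear
    head scores which participant speaks next. Hypothetical hyperparameters."""
    def __init__(self, feat_dim=16, hidden=64, max_group=6):
        super().__init__()
        self.lstm = nn.LSTM(feat_dim, hidden, batch_first=True)
        self.head = nn.Linear(hidden, max_group)

    def forward(self, x):            # x: (batch, time, feat_dim)
        _, (h, _) = self.lstm(x)     # last hidden state summarises the window
        return self.head(h[-1])      # logits over participants

# usage on a random 30-step feature window for 2 conversation groups
logits = NextSpeakerLSTM()(torch.randn(2, 30, 16))
```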
+
+ comment: Accepted to Pattern Recognition Letter, 8 pages, 10 figures +
+
+
+
+
+ + ☆ Law of Vision Representation in MLLMs + + +
+ We present the "Law of Vision Representation" in multimodal large language +models (MLLMs). It reveals a strong correlation between the combination of +cross-modal alignment and correspondence in the vision representation, and MLLM +performance. We quantify the two factors using the cross-modal Alignment and +Correspondence score (AC score). Through extensive experiments involving +thirteen different vision representation settings and evaluations across eight +benchmarks, we find that the AC score is linearly correlated with model +performance. By leveraging this relationship, we can identify and train only +the optimal vision representation, without finetuning the language model for +every candidate setting, resulting in a 99.7% reduction in computational +cost. + +
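Since the central claim is a linear relationship between the AC score and benchmark performance, a reader could probe it with an ordinary least-squares fit; the numbers below are invented stand-ins, not results from the paper.

```python
import numpy as np

# Hypothetical AC scores and accuracies for a handful of vision representation
# settings (the paper uses thirteen settings and eight benchmarks; these values
# are made up purely to show the fitting step).
ac_score = np.array([0.41, 0.48, 0.55, 0.62, 0.70])
accuracy = np.array([52.3, 55.1, 58.7, 61.0, 64.9])

# A linear fit is enough to probe the claimed linear correlation.
slope, intercept = np.polyfit(ac_score, accuracy, deg=1)
r = np.corrcoef(ac_score, accuracy)[0, 1]
print(f"predicted acc = {slope:.1f} * AC + {intercept:.1f}, Pearson r = {r:.3f}")
```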
+
+ comment: The code is available at + https://github.com/bronyayang/Law_of_Vision_Representation_in_MLLMs +
+
+
+
+
+ + ☆ NeRF-CA: Dynamic Reconstruction of X-ray Coronary Angiography with + Extremely Sparse-views + + +
+ Dynamic three-dimensional (4D) reconstruction from two-dimensional X-ray +coronary angiography (CA) remains a significant clinical problem. Challenges +include sparse-view settings, intra-scan motion, and complex vessel morphology +such as structure sparsity and background occlusion. Existing CA reconstruction +methods often require extensive user interaction or large training datasets. On +the other hand, Neural Radiance Field (NeRF), a promising deep learning +technique, has successfully reconstructed high-fidelity static scenes for +natural and medical scenes. Recent work, however, identified that sparse-views, +background occlusion, and dynamics still pose a challenge when applying NeRF in +the X-ray angiography context. Meanwhile, many successful works for natural +scenes propose regularization for sparse-view reconstruction or scene +decomposition to handle dynamics. However, these techniques do not directly +translate to the CA context, where both challenges and background occlusion are +significant. This paper introduces NeRF-CA, the first step toward a 4D CA +reconstruction method that achieves reconstructions from sparse coronary +angiograms with cardiac motion. We leverage the motion of the coronary artery +to decouple the scene into a dynamic coronary artery component and static +background. We combine this scene decomposition with tailored regularization +techniques. These techniques enforce the separation of the coronary artery from +the background by enforcing dynamic structure sparsity and scene smoothness. By +uniquely combining these approaches, we achieve 4D reconstructions from as few +as four angiogram sequences. This setting aligns with clinical workflows while +outperforming state-of-the-art X-ray sparse-view NeRF reconstruction +techniques. We validate our approach quantitatively and qualitatively using 4D +phantom datasets and ablation studies. + +
+
+
+
+
+ + ☆ Toward Robust Early Detection of Alzheimer's Disease via an Integrated + Multimodal Learning Approach + + +
+ Alzheimer's Disease (AD) is a complex neurodegenerative disorder marked by +memory loss, executive dysfunction, and personality changes. Early diagnosis is +challenging due to subtle symptoms and varied presentations, often leading to +misdiagnosis with traditional unimodal diagnostic methods due to their limited +scope. This study introduces an advanced multimodal classification model that +integrates clinical, cognitive, neuroimaging, and EEG data to enhance +diagnostic accuracy. The model incorporates a feature tagger with a tabular +data coding architecture and utilizes the TimesBlock module to capture +intricate temporal patterns in Electroencephalograms (EEG) data. By employing +Cross-modal Attention Aggregation module, the model effectively fuses Magnetic +Resonance Imaging (MRI) spatial information with EEG temporal data, +significantly improving the distinction between AD, Mild Cognitive Impairment, +and Normal Cognition. Simultaneously, we have constructed the first AD +classification dataset that includes three modalities: EEG, MRI, and tabular +data. Our innovative approach aims to facilitate early diagnosis and +intervention, potentially slowing the progression of AD. The source code and +our private ADMC dataset are available at https://github.com/JustlfC03/MSTNet. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Learned Image Transmission with Hierarchical Variational Autoencoder + + +
+ In this paper, we introduce an innovative hierarchical joint source-channel +coding (HJSCC) framework for image transmission, utilizing a hierarchical +variational autoencoder (VAE). Our approach leverages a combination of +bottom-up and top-down paths at the transmitter to autoregressively generate +multiple hierarchical representations of the original image. These +representations are then directly mapped to channel symbols for transmission by +the JSCC encoder. We extend this framework to scenarios with a feedback link, +modeling transmission over a noisy channel as a probabilistic sampling process +and deriving a novel generative formulation for JSCC with feedback. Compared +with existing approaches, our proposed HJSCC provides enhanced adaptability by +dynamically adjusting transmission bandwidth, encoding these representations +into varying amounts of channel symbols. Additionally, we introduce a rate +attention module to guide the JSCC encoder in optimizing its encoding strategy +based on prior information. Extensive experiments on images of varying +resolutions demonstrate that our proposed model outperforms existing baselines +in rate-distortion performance and maintains robustness against channel noise. + +
+
+
+
+
+ + ☆ P2P-Bridge: Diffusion Bridges for 3D Point Cloud Denoising ECCV 2024 + + +
+ In this work, we tackle the task of point cloud denoising through a novel +framework that adapts Diffusion Schrödinger bridges to point clouds. Unlike +previous approaches that predict point-wise displacements from point features +or learned noise distributions, our method learns an optimal transport plan +between paired point clouds. Experiments on object datasets like PU-Net and +real-world datasets such as ScanNet++ and ARKitScenes show that P2P-Bridge +achieves significant improvements over existing methods. While our approach +demonstrates strong results using only point coordinates, we also show that +incorporating additional features, such as color information or point-wise +DINOv2 features, further enhances performance. Code and pretrained models +are available at https://p2p-bridge.github.io. + +
+
+ comment: ECCV 2024 Project page: https://p2p-bridge.github.io +
+
+
+
+
+ + ☆ BEVal: A Cross-dataset Evaluation Study of BEV Segmentation Models for + Autonomous Driving + + +
+ Current research in semantic bird's-eye view segmentation for autonomous +driving focuses solely on optimizing neural network models using a single +dataset, typically nuScenes. This practice leads to the development of highly +specialized models that may fail when faced with different environments or +sensor setups, a problem known as domain shift. In this paper, we conduct a +comprehensive cross-dataset evaluation of state-of-the-art BEV segmentation +models to assess their performance across different training and testing +datasets and setups, as well as different semantic categories. We investigate +the influence of different sensors, such as cameras and LiDAR, on the models' +ability to generalize to diverse conditions and scenarios. Additionally, we +conduct multi-dataset training experiments that improve models' BEV +segmentation performance compared to single-dataset training. Our work +addresses the gap in evaluating BEV segmentation models under cross-dataset +validation. And our findings underscore the importance of enhancing model +generalizability and adaptability to ensure more robust and reliable BEV +segmentation approaches for autonomous driving applications. + +
+
+
+
+
+ + ☆ ResVG: Enhancing Relation and Semantic Understanding in Multiple + Instances for Visual Grounding ACM MM 2024 + + +
+ Visual grounding aims to localize the object referred to in an image based on +a natural language query. Although progress has been made recently, accurately +localizing target objects within multiple-instance distractions (multiple +objects of the same category as the target) remains a significant challenge. +Existing methods demonstrate a significant performance drop when there are +multiple distractions in an image, indicating an insufficient understanding of +the fine-grained semantics and spatial relationships between objects. In this +paper, we propose a novel approach, the Relation and Semantic-sensitive Visual +Grounding (ResVG) model, to address this issue. Firstly, we enhance the model's +understanding of fine-grained semantics by injecting semantic prior information +derived from text queries into the model. This is achieved by leveraging +text-to-image generation models to produce images representing the semantic +attributes of target objects described in queries. Secondly, we tackle the lack +of training samples with multiple distractions by introducing a +relation-sensitive data augmentation method. This method generates additional +training data by synthesizing images containing multiple objects of the same +category and pseudo queries based on their spatial relationships. The proposed +ResVG model significantly improves the model's ability to comprehend both +object semantics and spatial relations, leading to enhanced performance in +visual grounding tasks, particularly in scenarios with multiple-instance +distractions. We conduct extensive experiments to validate the effectiveness of +our methods on five datasets. Code is available at +https://github.com/minghangz/ResVG. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ FA-YOLO: Research On Efficient Feature Selection YOLO Improved Algorithm + Based On FMDS and AGMF Modules + + +
+ Over the past few years, the YOLO series of models has emerged as one of the +dominant methodologies in the realm of object detection. Many studies have +advanced these baseline models by modifying their architectures, enhancing data +quality, and developing new loss functions. However, current models still +exhibit deficiencies in processing feature maps, such as overlooking the fusion +of cross-scale features and a static fusion approach that lacks the capability +for dynamic feature adjustment. To address these issues, this paper introduces +an efficient Fine-grained Multi-scale Dynamic Selection Module (FMDS Module), +which applies a more effective dynamic feature selection and fusion method on +fine-grained multi-scale feature maps, significantly enhancing the detection +accuracy of small, medium, and large-sized targets in complex environments. +Furthermore, this paper proposes an Adaptive Gated Multi-branch Focus Fusion +Module (AGMF Module), which utilizes multiple parallel branches to perform +complementary fusion of various features captured by the gated unit branch, +FMDS Module branch, and TripletAttention branch. This approach further enhances +the comprehensiveness, diversity, and integrity of feature fusion. This paper +has integrated the FMDS Module, AGMF Module, into Yolov9 to develop a novel +object detection model named FA-YOLO. Extensive experimental results show that +under identical experimental conditions, FA-YOLO achieves an outstanding 66.1% +mean Average Precision (mAP) on the PASCAL VOC 2007 dataset, representing 1.0% +improvement over YOLOv9's 65.1%. Additionally, the detection accuracies of +FA-YOLO for small, medium, and large targets are 44.1%, 54.6%, and 70.8%, +respectively, showing improvements of 2.0%, 3.1%, and 0.9% compared to YOLOv9's +42.1%, 51.5%, and 69.9%. + +
+
+ comment: 11 pages and 4 figures +
+
+
+
+
+ + ☆ Bootstrap Segmentation Foundation Model under Distribution Shift via + Object-Centric Learning ECCV 2024 + + +
+ Foundation models have made incredible strides in achieving zero-shot or +few-shot generalization, leveraging prompt engineering to mimic the +problem-solving approach of human intelligence. However, when it comes to some +foundation models like Segment Anything, there is still a challenge in +performing well on out-of-distribution data, including camouflaged and medical +images. Inconsistent prompting strategies during fine-tuning and testing +further compound the issue, leading to decreased performance. Drawing +inspiration from how human cognition processes new environments, we introduce +SlotSAM, a method that reconstructs features from the encoder in a +self-supervised manner to create object-centric representations. These +representations are then integrated into the foundation model, bolstering its +object-level perceptual capabilities while reducing the impact of +distribution-related variables. The beauty of SlotSAM lies in its simplicity +and adaptability to various tasks, making it a versatile solution that +significantly enhances the generalization abilities of foundation models. +Through limited parameter fine-tuning in a bootstrap manner, our approach paves +the way for improved generalization in novel environments. The code is +available at github.com/lytang63/SlotSAM. + +
+
+ comment: This work is accepted by ECCV 2024 EVAL-FoMo Workshop +
+
+
+
+
+ + ☆ Semantics-Oriented Multitask Learning for DeepFake Detection: A Joint + Embedding Approach + + +
+ In recent years, the multimedia forensics and security community has seen +remarkable progress in multitask learning for DeepFake (i.e., face forgery) +detection. The prevailing strategy has been to frame DeepFake detection as a +binary classification problem augmented by manipulation-oriented auxiliary +tasks. This strategy focuses on learning features specific to face +manipulations, which exhibit limited generalizability. In this paper, we delve +deeper into semantics-oriented multitask learning for DeepFake detection, +leveraging the relationships among face semantics via joint embedding. We first +propose an automatic dataset expansion technique that broadens current face +forgery datasets to support semantics-oriented DeepFake detection tasks at both +the global face attribute and local face region levels. Furthermore, we resort +to joint embedding of face images and their corresponding labels (depicted by +textual descriptions) for prediction. This approach eliminates the need for +manually setting task-agnostic and task-specific parameters typically required +when predicting labels directly from images. In addition, we employ a bi-level +optimization strategy to dynamically balance the fidelity loss weightings of +various tasks, making the training process fully automated. Extensive +experiments on six DeepFake datasets show that our method improves the +generalizability of DeepFake detection and, meanwhile, renders some degree of +model interpretation by providing human-understandable explanations. + +
+
+
+
+
+ + ☆ Enhanced Control for Diffusion Bridge in Image Restoration + + +
+ Image restoration refers to the process of restoring a damaged low-quality +image back to its corresponding high-quality image. Typically, we use +convolutional neural networks to directly learn the mapping from low-quality +images to high-quality images, achieving image restoration. Recently, a special +type of diffusion bridge model has achieved more advanced results in image +restoration. It can transform the direct mapping from low-quality to +high-quality images into a diffusion process, restoring low-quality images +through a reverse process. However, the current diffusion bridge restoration +models do not emphasize the idea of conditional control, which may affect +performance. This paper introduces the ECDB model, enhancing the control of the +diffusion bridge with low-quality images as conditions. Moreover, in response +to the characteristic of diffusion models having a low denoising level at larger +values of t, we also propose a Conditional Fusion Schedule, which more +effectively handles the conditional feature information of various modules. +Experimental results prove that the ECDB model has achieved state-of-the-art +results in many image restoration tasks, including deraining, inpainting and +super-resolution. Code is available at https://github.com/Hammour-steak/ECDB. + +
+
+
+
+
+ + ☆ Rethinking Sparse Lexical Representations for Image Retrieval in the Age + of Rising Multi-Modal Large Language Models ECCV 2024 + + +
+ In this paper, we rethink sparse lexical representations for image retrieval. +By utilizing multi-modal large language models (M-LLMs) that support visual +prompting, we can extract image features and convert them into textual data, +enabling us to utilize efficient sparse retrieval algorithms employed in +natural language processing for image retrieval tasks. To assist the LLM in +extracting image features, we apply data augmentation techniques for key +expansion and analyze the impact with a metric for relevance between images and +textual data. We empirically show the superior precision and recall performance +of our image retrieval method compared to conventional vision-language +model-based methods on the MS-COCO, PASCAL VOC, and NUS-WIDE datasets in a +keyword-based image retrieval scenario, where keywords serve as search queries. +We also demonstrate that the retrieval performance can be improved by +iteratively incorporating keywords into search queries. + +
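Once an M-LLM has turned each image into keywords, retrieval reduces to standard sparse text matching; the sketch below substitutes a plain TF-IDF model and invented captions for the paper's pipeline, purely to show the sparse retrieval side.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Pretend an M-LLM has already converted each image into a bag of keywords
# (these strings are invented); keyword-based image retrieval then becomes
# ordinary sparse text retrieval over those descriptions.
image_keywords = [
    "dog grass frisbee park",
    "red car street night rain",
    "beach sunset surfboard wave",
]
vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(image_keywords)

query = "surfing at the beach"
scores = cosine_similarity(vectorizer.transform([query]), doc_matrix)[0]
best = scores.argmax()
print(f"best match: image {best} (score {scores[best]:.2f})")
```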
+
+ comment: Accepted to ECCV 2024 Workshops: 2nd Workshop on Traditional Computer + Vision in the Age of Deep Learning (TradiCV) +
+
+
+
+
+ + ☆ Convolutional Neural Network Compression Based on Low-Rank Decomposition + + +
+ Deep neural networks typically impose significant computational loads and +memory consumption. Moreover, the large number of parameters poses constraints on +deploying the model on edge devices such as embedded systems. Tensor +decomposition offers a clear advantage in compressing large-scale weight +tensors. Nevertheless, direct utilization of low-rank decomposition typically +leads to significant accuracy loss. This paper proposes a model compression +method that integrates Variational Bayesian Matrix Factorization (VBMF) with +orthogonal regularization. First, the model undergoes over-parameterization +and training, with orthogonal regularization applied to enhance its likelihood +of achieving the accuracy of the original model. Second, VBMF is employed to +estimate the rank of the weight tensor at each layer. Our framework is +sufficiently general to apply to other convolutional neural networks and easily +adaptable to incorporate other tensor decomposition methods. Experimental +results show that for both high and low compression ratios, our compression +method exhibits strong performance. + +
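The core factorization step can be illustrated with a truncated SVD of a fully connected layer; in the paper the per-layer rank comes from VBMF, whereas here it is passed in by hand, so this is only a sketch of the decomposition itself, not the authors' implementation.

```python
import torch
import torch.nn as nn

def low_rank_linear(layer: nn.Linear, rank: int) -> nn.Sequential:
    """Replace a Linear layer with two thinner ones via truncated SVD.
    The rank would normally be estimated (e.g. by VBMF); here it is chosen
    manually, so this only illustrates the factorisation."""
    W = layer.weight.data                                  # (out, in)
    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
    U, S, Vh = U[:, :rank], S[:rank], Vh[:rank, :]
    first = nn.Linear(layer.in_features, rank, bias=False)
    second = nn.Linear(rank, layer.out_features, bias=layer.bias is not None)
    first.weight.data = S.sqrt().unsqueeze(1) * Vh         # (rank, in)
    second.weight.data = U * S.sqrt().unsqueeze(0)         # (out, rank)
    if layer.bias is not None:
        second.bias.data = layer.bias.data.clone()
    return nn.Sequential(first, second)

# usage: compress a 512 -> 256 layer down to rank 64
compressed = low_rank_linear(nn.Linear(512, 256), rank=64)
```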
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ☆ Fine-grained Classification of Port Wine Stains Using Optical Coherence + Tomography Angiography + + +
+ Accurate classification of port wine stains (PWS, vascular malformations +present at birth), is critical for subsequent treatment planning. However, the +current method of classifying PWS based on the external skin appearance rarely +reflects the underlying angiopathological heterogeneity of PWS lesions, +resulting in inconsistent outcomes with the common vascular-targeted +photodynamic therapy (V-PDT) treatments. Conversely, optical coherence +tomography angiography (OCTA) is an ideal tool for visualizing the vascular +malformations of PWS. Previous studies have shown no significant correlation +between OCTA quantitative metrics and the PWS subtypes determined by the +current classification approach. This study proposes a new classification +approach for PWS using both OCT and OCTA. By examining the hypodermic +histopathology and vascular structure of PWS, we have devised a fine-grained +classification method that subdivides PWS into five distinct types. To assess +the angiopathological differences of various PWS subtypes, we have analyzed six +metrics related to vascular morphology and depth information of PWS lesions. +The five PWS types present significant differences across all metrics compared +to the conventional subtypes. Our findings suggest that an angiopathology-based +classification accurately reflects the heterogeneity in PWS lesions. This +research marks the first attempt to classify PWS based on angiopathology, +potentially guiding more effective subtyping and treatment strategies for PWS. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ SAU: A Dual-Branch Network to Enhance Long-Tailed Recognition via + Generative Models + + +
+ Long-tailed distributions in image recognition pose a considerable challenge +due to the severe imbalance between a few dominant classes with numerous +examples and many minority classes with few samples. Recently, the use of large +generative models to create synthetic data for image classification has been +realized, but utilizing synthetic data to address the challenge of long-tailed +recognition remains relatively unexplored. In this work, we proposed the use of +synthetic data as a complement to long-tailed datasets to eliminate the impact +of data imbalance. To tackle this real-synthetic mixed dataset, we designed a +two-branch model that contains Synthetic-Aware and Unaware branches (SAU). The +core ideas are (1) a synthetic-unaware branch for classification that mixes +real and synthetic data and treats all data equally without distinguishing +between them. (2) A synthetic-aware branch for improving the robustness of the +feature extractor by distinguishing between real and synthetic data and +learning their discrepancies. Extensive experimental results demonstrate that +our method can improve the accuracy of long-tailed image recognition. Notably, +our approach achieves state-of-the-art Top-1 accuracy and significantly +surpasses other methods on CIFAR-10-LT and CIFAR-100-LT datasets across various +imbalance factors. Our code is available at https://github.com/lgX1123/gm4lt. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Beyond Uncertainty: Evidential Deep Learning for Robust Video Temporal + Grounding + + +
+ Existing Video Temporal Grounding (VTG) models excel in accuracy but often +overlook open-world challenges posed by open-vocabulary queries and untrimmed +videos. This leads to unreliable predictions for noisy, corrupted, and +out-of-distribution data. Adapting VTG models to dynamically estimate +uncertainties based on user input can address this issue. To this end, we +introduce SRAM, a robust network module that benefits from a two-stage +cross-modal alignment task. More importantly, it integrates Deep Evidential +Regression (DER) to explicitly and thoroughly quantify uncertainty during +training, thus allowing the model to say "I do not know" in scenarios beyond +its handling capacity. However, the direct application of traditional DER +theory and its regularizer reveals structural flaws, leading to unintended +constraints in VTG tasks. In response, we develop a simple yet effective +Geom-regularizer that enhances the uncertainty learning framework from the +ground up. To the best of our knowledge, this marks the first successful +attempt of DER in VTG. Our extensive quantitative and qualitative results +affirm the effectiveness, robustness, and interpretability of our modules and +the uncertainty learning paradigm in VTG tasks. The code will be made +available. + +
+
+ comment: Ongoing work: 28 pages, 19 figures, 7 tables. Code is available at: + https://kaijing.space/SRAM/ +
+
+
+
+
+ + ☆ UDD: Dataset Distillation via Mining Underutilized Regions + + +
+ Dataset distillation synthesizes a small dataset such that a model trained on +this set approximates the performance of one trained on the original dataset. +Recent studies on dataset distillation have focused primarily on the design of +the optimization process, with methods such as gradient matching, feature +alignment, and training trajectory matching. However, little attention has been +given to the issue of underutilized regions in synthetic images. In this paper, +we propose UDD, a novel approach to identify and exploit the underutilized +regions to make them informative and discriminative, and thus improve the +utilization of the synthetic dataset. Technically, UDD involves two +underutilized-region searching policies for different conditions, i.e., a +response-based policy and a data jittering-based policy. Compared with previous +works, these two policies are utilization-sensitive, equipping the model with +the ability to dynamically adjust the underutilized regions during the training +process. Additionally, we analyze the current model optimization problem and +design a category-wise feature contrastive loss, which can enhance the +distinguishability of different categories and alleviate the shortcomings of +the existing multi-formation methods. Experimentally, our method improves the +utilization of the synthetic dataset and outperforms the state-of-the-art +methods on various datasets, such as MNIST, FashionMNIST, SVHN, CIFAR-10, and +CIFAR-100. For example, the improvements on CIFAR-10 and CIFAR-100 are 4.0% and +3.7% over the next best method with IPC=1, by mining the underutilized regions. + +
+
+ comment: PRCV2024 +
+
+
+
+
+ + ☆ Improving Diffusion-based Data Augmentation with Inversion Spherical + Interpolation + + +
+ Data Augmentation (DA), i.e., synthesizing faithful and diverse samples to +expand the original training set, is a prevalent and effective strategy to +improve various visual recognition tasks. With the powerful image generation +ability, diffusion-based DA has shown strong performance gains on different +benchmarks. In this paper, we analyze today's diffusion-based DA methods, and +argue that they cannot take account of both faithfulness and diversity, which +are two critical keys for generating high-quality samples and boosting final +classification performance. To this end, we propose a novel Diffusion-based +Inversion Interpolation DA method: Diff-II. Specifically, Diff-II consists of +three main steps: 1) Category concepts learning: Learning concept embeddings +for each category. 2) Inversion interpolation: Calculating the inversion for +each image, and conducting spherical interpolation for two randomly sampled +inversions from the same category. 3) Two-stage denoising: Using different +prompts to generate synthesized images in a coarse-to-fine manner. Extensive +experiments on multiple image classification tasks (e.g., few-shot, long-tailed, +and out-of-distribution classification) have demonstrated its effectiveness +over state-of-the-art diffusion-based DA methods. + +
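Step 2 (inversion interpolation) relies on spherical interpolation between two latent inversions of the same category; a generic slerp helper, not taken from the paper's code and with invented latent shapes, looks like this.

```python
import torch

def slerp(z1: torch.Tensor, z2: torch.Tensor, alpha: float) -> torch.Tensor:
    """Spherical interpolation between two flattened latent inversions.
    Generic helper for illustration only; the paper's exact procedure may differ."""
    z1n, z2n = z1 / z1.norm(), z2 / z2.norm()
    omega = torch.acos((z1n * z2n).sum().clamp(-1 + 1e-7, 1 - 1e-7))  # angle between latents
    so = torch.sin(omega)
    return (torch.sin((1 - alpha) * omega) / so) * z1 + (torch.sin(alpha * omega) / so) * z2

# usage on two random stand-in inversions of the same category
mixed = slerp(torch.randn(4 * 64 * 64), torch.randn(4 * 64 * 64), alpha=0.3)
```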
+
+
+
+
+ + ☆ Low Saturation Confidence Distribution-based Test-Time Adaptation for + Cross-Domain Remote Sensing Image Classification + + +
+ Although the Unsupervised Domain Adaptation (UDA) method has improved the +effect of remote sensing image classification tasks, most of them are still +limited by access to the source domain (SD) data. Designs such as Source-free +Domain Adaptation (SFDA) solve the challenge of a lack of SD data, however, +they still rely on a large amount of target domain data and thus cannot achieve +fast adaptations, which seriously hinders their further application in broader +scenarios. The real-world applications of cross-domain remote sensing image +classification require a balance of speed and accuracy at the same time. +Therefore, we propose a novel and comprehensive test time adaptation (TTA) +method -- Low Saturation Confidence Distribution Test Time Adaptation +(LSCD-TTA), which is the first attempt to solve such scenarios through the idea +of TTA. LSCD-TTA specifically considers the distribution characteristics of +remote sensing images, including three main parts that concentrate on different +optimization directions: First, low saturation distribution (LSD) considers the +dominance of low-confidence samples during the later TTA stage. Second, +weak-category cross-entropy (WCCE) increases the weight of categories that are +more difficult to classify with less prior knowledge. Finally, diverse +categories confidence (DIV) comprehensively considers the category diversity to +alleviate the deviation of the sample distribution. By weighting the +abovementioned three modules, the model can widely, quickly and accurately +adapt to the target domain without much prior target distributions, repeated +data access, and manual annotation. We evaluate LSCD-TTA on three +remote-sensing image datasets. The experimental results show that LSCD-TTA +achieves a significant gain of 4.96%-10.51% with Resnet-50 and 5.33%-12.49% +with Resnet-101 in average accuracy compared to other state-of-the-art DA and +TTA methods. + +
+
+
+
+
+ + ☆ Advancing Architectural Floorplan Design with Geometry-enhanced Graph + Diffusion + + +
+ Automating architectural floorplan design is vital for housing and interior +design, offering a faster, cost-effective alternative to manual sketches by +architects. However, existing methods, including rule-based and learning-based +approaches, face challenges with design complexity and constrained generation, +require extensive post-processing, and tend to produce obvious geometric +inconsistencies such as misalignment, overlap, and gaps. In this work, we +propose a novel generative framework for vector floorplan design via structural +graph generation, called GSDiff, focusing on wall junction generation and wall +segment prediction to capture both geometric and semantic aspects of structural +graphs. To improve the geometric rationality of generated structural graphs, we +propose two innovative geometry enhancement methods. In wall junction +generation, we propose a novel alignment loss function to improve geometric +consistency. In wall segment prediction, we propose a random self-supervision +method to enhance the model's perception of the overall geometric structure, +thereby promoting the generation of reasonable geometric structures. Employing +the diffusion model and the Transformer model, as well as the geometry +enhancement strategies, our framework can generate wall junctions, wall +segments and room polygons with structural and semantic information, resulting +in structural graphs that accurately represent floorplans. Extensive +experiments show that the proposed method surpasses existing techniques, +enabling free generation and constrained generation, marking a shift towards +structure generation in architectural design. + +
+
+
+
+
+ + ☆ EvLight++: Low-Light Video Enhancement with an Event Camera: A + Large-Scale Real-World Dataset, Novel Method, and More + + +
+ Event cameras offer significant advantages for low-light video enhancement, +primarily due to their high dynamic range. Current research, however, is +severely limited by the absence of large-scale, real-world, and +spatio-temporally aligned event-video datasets. To address this, we introduce a +large-scale dataset with over 30,000 pairs of frames and events captured under +varying illumination. This dataset was curated using a robotic arm that traces +a consistent non-linear trajectory, achieving spatial alignment precision under +0.03mm and temporal alignment with errors under 0.01s for 90% of the dataset. +Based on the dataset, we propose \textbf{EvLight++}, a novel event-guided +low-light video enhancement approach designed for robust performance in +real-world scenarios. Firstly, we design a multi-scale holistic fusion branch +to integrate structural and textural information from both images and events. +To counteract variations in regional illumination and noise, we introduce +Signal-to-Noise Ratio (SNR)-guided regional feature selection, enhancing +features from high SNR regions and augmenting those from low SNR regions by +extracting structural information from events. To incorporate temporal +information and ensure temporal coherence, we further introduce a recurrent +module and temporal loss in the whole pipeline. Extensive experiments on our +and the synthetic SDSD dataset demonstrate that EvLight++ significantly +outperforms both single image- and video-based methods by 1.37 dB and 3.71 dB, +respectively. To further explore its potential in downstream tasks like +semantic segmentation and monocular depth estimation, we extend our datasets by +adding pseudo segmentation and depth labels via meticulous annotation efforts +with foundation models. Experiments under diverse low-light scenes show that +the enhanced results achieve a 15.97% improvement in mIoU for semantic +segmentation. + +
+
+ comment: Journal extension based on EvLight (arXiv:2404.00834) +
+
+
+
+
+ + ☆ Anno-incomplete Multi-dataset Detection + + +
+ Object detectors have shown outstanding performance on various public +datasets. However, annotating a new dataset for a new task is usually +unavoidable in practice, since 1) a single existing dataset usually does not +contain all object categories needed; 2) using multiple datasets usually +suffers from incomplete annotations and heterogeneous features. We formulate a +novel problem, "Annotation-incomplete Multi-dataset Detection", and develop +an end-to-end multi-task learning architecture which can accurately detect all +the object categories with multiple partially annotated datasets. Specifically, +we propose an attention feature extractor which helps to mine the relations +among different datasets. Besides, a knowledge amalgamation training strategy +is incorporated to accommodate heterogeneous features from different sources. +Extensive experiments on different object detection datasets demonstrate the +effectiveness of our methods, and an improvement of 2.17% and 2.10% in mAP can +be achieved on COCO and VOC, respectively. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ Neural Spectral Decomposition for Dataset Distillation ECCV 2024 + + +
+ In this paper, we propose Neural Spectrum Decomposition, a generic +decomposition framework for dataset distillation. Unlike previous methods, we +consider the entire dataset as a high-dimensional observation that is low-rank +across all dimensions. We aim to discover the low-rank representation of the +entire dataset and perform distillation efficiently. Toward this end, we learn +a set of spectrum tensors and transformation matrices, which, through simple +matrix multiplication, reconstruct the data distribution. Specifically, a +spectrum tensor can be mapped back to the image space by a transformation +matrix, and efficient information sharing during the distillation learning +process is achieved through pairwise combinations of different spectrum vectors +and transformation matrices. Furthermore, we integrate a trajectory matching +optimization method guided by a real distribution. Our experimental results +demonstrate that our approach achieves state-of-the-art performance on +benchmarks, including CIFAR10, CIFAR100, Tiny ImageNet, and ImageNet Subset. +Our code is available at https://github.com/slyang2021/NSD. + +
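The reconstruction-by-matrix-multiplication idea can be sketched with invented shapes: pairwise combinations of learnable spectrum vectors and transformation matrices yield the synthetic images. This is only a toy rendering of the decomposition, not the NSD code.

```python
import torch

# Toy version: a distilled dataset is represented by a few low-rank "spectrum"
# vectors and transformation matrices; images come from their pairwise products.
# All shapes below are assumptions made for illustration.
num_spectra, num_transforms, rank, pixels = 4, 6, 32, 3 * 32 * 32
spectra = torch.randn(num_spectra, rank, requires_grad=True)            # learnable spectrum vectors
transforms = torch.randn(num_transforms, rank, pixels, requires_grad=True)

# Pairwise combinations give num_spectra * num_transforms synthetic images.
images = torch.einsum("sr,trp->stp", spectra, transforms).reshape(-1, 3, 32, 32)
print(images.shape)  # torch.Size([24, 3, 32, 32])
```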
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ LMT-GP: Combined Latent Mean-Teacher and Gaussian Process for + Semi-supervised Low-light Image Enhancement + + +
+ While recent low-light image enhancement (LLIE) methods have made significant +advancements, they still face challenges in terms of low visual quality and +weak generalization ability when applied to complex scenarios. To address these +issues, we propose a semi-supervised method based on latent mean-teacher and +Gaussian process, named LMT-GP. We first design a latent mean-teacher framework +that integrates both labeled and unlabeled data, as well as their latent +vectors, into model training. Meanwhile, we use a mean-teacher-assisted +Gaussian process learning strategy to establish a connection between the latent +and pseudo-latent vectors obtained from the labeled and unlabeled data. To +guide the learning process, we utilize an assisted Gaussian process regression +(GPR) loss function. Furthermore, we design a pseudo-label adaptation module +(PAM) to ensure the reliability of the network learning. To demonstrate our +method's generalization ability and effectiveness, we apply it to multiple LLIE +datasets and high-level vision tasks. Experiment results demonstrate that our +method achieves high generalization performance and image quality. The code is +available at https://github.com/HFUT-CV/LMT-GP. + +
+
+
+
+
+ + ☆ PSE-Net: Channel Pruning for Convolutional Neural Networks with + Parallel-subnets Estimator + + +
+ Channel pruning is one of the most widespread techniques used to compress +deep neural networks while maintaining their performance. Currently, a typical +pruning algorithm leverages neural architecture search to directly find +networks with a configurable width, the key step of which is to identify +representative subnets for various pruning ratios by training a supernet. +However, current methods mainly follow a serial training strategy to optimize +the supernet, which is very time-consuming. In this work, we introduce PSE-Net, +a novel parallel-subnets estimator for efficient channel pruning. Specifically, +we propose a parallel-subnets training algorithm that simulates the +forward-backward pass of multiple subnets by dropping extraneous features along +the batch dimension, so that various subnets can be trained in one round. Our +proposed algorithm facilitates the efficiency of supernet training and equips +the network with the ability to interpolate the accuracy of unsampled subnets, +enabling PSE-Net to effectively evaluate and rank the subnets. Over the trained +supernet, we develop a prior-distributed-based sampling algorithm to boost the +performance of classical evolutionary search. This algorithm utilizes prior +information from the supernet training phase to assist in the search for optimal +subnets while tackling the challenge of discovering samples that satisfy +resource constraints due to the long-tail distribution of network +configurations. Extensive experiments demonstrate that PSE-Net outperforms +previous state-of-the-art channel pruning methods on the ImageNet dataset while +retaining superior supernet training efficiency. For example, under a 300M FLOPs +constraint, our pruned MobileNetV2 achieves 75.2% Top-1 accuracy on the ImageNet +dataset, exceeding the original MobileNetV2 by 2.6 points while requiring only +30%/16% of the cost of BCNet/AutoAlim. + +
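A rough way to picture the parallel-subnets idea is to split each batch across several sampled widths and zero out the extra channels for each chunk, so several subnets receive gradients in one pass; the sketch below is an assumption-laden simplification, not PSE-Net's implementation.

```python
import torch
import torch.nn as nn

def parallel_width_forward(x, conv: nn.Conv2d, widths):
    """Toy illustration of training several channel widths in one pass: the
    batch is split into len(widths) chunks and, for each chunk, output channels
    beyond the sampled width are zeroed. Widths and shapes are made up."""
    outs = []
    for chunk, w in zip(x.chunk(len(widths)), widths):
        y = conv(chunk)
        mask = torch.zeros(y.shape[1], device=y.device)
        mask[:w] = 1.0
        outs.append(y * mask.view(1, -1, 1, 1))  # keep only the first w channels
    return torch.cat(outs)

# usage: four widths share one forward pass over a batch of 8 images
y = parallel_width_forward(
    torch.randn(8, 3, 32, 32),
    nn.Conv2d(3, 64, 3, padding=1),
    widths=[16, 32, 48, 64],
)
```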
+
+ comment: 10pages, Neural Networks +
+
+
+
+
+ + ☆ Enhancing Conditional Image Generation with Explainable Latent Space + Manipulation + + +
+ In the realm of image synthesis, achieving fidelity to a reference image +while adhering to conditional prompts remains a significant challenge. This +paper proposes a novel approach that integrates a diffusion model with latent +space manipulation and gradient-based selective attention mechanisms to address +this issue. Leveraging Grad-SAM (Gradient-based Selective Attention +Manipulation), we analyze the cross attention maps of the cross attention +layers and gradients for the denoised latent vector, deriving importance scores +of elements of denoised latent vector related to the subject of interest. Using +this information, we create masks at specific timesteps during denoising to +preserve subjects while seamlessly integrating the reference image features. +This approach ensures the faithful formation of subjects based on conditional +prompts, while concurrently refining the background for a more coherent +composition. Our experiments on places365 dataset demonstrate promising +results, with our proposed model achieving the lowest mean and median Frechet +Inception Distance (FID) scores compared to baseline models, indicating +superior fidelity preservation. Furthermore, our model exhibits competitive +performance in aligning the generated images with provided textual +descriptions, as evidenced by high CLIP scores. These results highlight the +effectiveness of our approach in both fidelity preservation and textual context +preservation, offering a significant advancement in text-to-image synthesis +tasks. + +
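The gradient-based selection idea, scoring latent elements by how strongly they influence a subject-relevance signal and thresholding the result into a mask, can be sketched as below; the scalar score function is a stand-in for the cross-attention-derived signal the paper uses, so this is illustrative only.

```python
import torch

def grad_importance_mask(latent, score_fn, quantile=0.8):
    """Toy gradient-based selection: score_fn maps the latent to a scalar
    relevance for the subject of interest (a placeholder for an
    attention-derived score), and elements with large |d score / d latent|
    are kept in a binary mask."""
    latent = latent.clone().requires_grad_(True)
    score = score_fn(latent)
    grad, = torch.autograd.grad(score, latent)
    importance = grad.abs()
    threshold = torch.quantile(importance.flatten(), quantile)
    return (importance >= threshold).float()   # binary mask over latent elements

# usage with a dummy latent and a dummy relevance score
mask = grad_importance_mask(torch.randn(4, 64, 64), lambda z: (z ** 2).mean())
```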
+
+ comment: 7 pages , 5 figures +
+
+
+
+
+ + ☆ Revisiting 360 Depth Estimation with PanoGabor: A New Fusion Perspective + + +
+ Depth estimation from a monocular 360 image is important to the perception of
+the entire 3D environment. However, the inherent distortion and large field of
+view (FoV) in 360 images pose great challenges for this task. To this end,
+existing mainstream solutions typically introduce additional perspective-based
+360 representations (\textit{e.g.}, Cubemap) to achieve effective feature
+extraction. Nevertheless, regardless of the introduced representations, they
+eventually need to be unified into the equirectangular projection (ERP) format
+for the subsequent depth estimation, which inevitably reintroduces the
+troublesome distortions. In this work, we propose an oriented distortion-aware
+Gabor Fusion framework (PGFuse) to address the above challenges. First, we
+introduce Gabor filters that analyze texture in the frequency domain, thereby
+extending the receptive fields and enhancing depth cues. To address the
+reintroduced distortions, we design a linear latitude-aware distortion
+representation method to generate customized, distortion-aware Gabor filters
+(PanoGabor filters). Furthermore, we design a channel-wise and spatial-wise
+unidirectional fusion module (CS-UFM) that integrates the proposed PanoGabor
+filters to unify other representations into the ERP format, delivering
+effective and distortion-free features. Considering the orientation sensitivity
+of the Gabor transform, we introduce a spherical gradient constraint to
+stabilize this sensitivity. Experimental results on three popular indoor 360
+benchmarks demonstrate the superiority of the proposed PGFuse over existing
+state-of-the-art solutions. Code will be made available upon acceptance.
+
+
+
+
+ + ☆ LLaVA-SG: Leveraging Scene Graphs as Visual Semantic Expression in + Vision-Language Models + + +
+ Recent advances in large vision-language models (VLMs) typically employ
+vision encoders based on the Vision Transformer (ViT) architecture. The
+division of images into patches by ViT results in a fragmented perception,
+thereby hindering the visual understanding capabilities of VLMs. In this paper,
+we propose an innovative enhancement to address this limitation by introducing
+a Scene Graph Expression (SGE) module in VLMs. This module extracts and
+structurally expresses the complex semantic information within images, thereby
+improving the foundational perception and understanding abilities of VLMs.
+Extensive experiments demonstrate that integrating our SGE module significantly
+enhances the VLM's performance in vision-language tasks, indicating its
+effectiveness in preserving intricate semantic details and facilitating better
+visual understanding. Code and data will be made available.
+
+
+
+
+ + ☆ Training-free Video Temporal Grounding using Large-scale Pre-trained + Models ECCV 2024 + + +
+ Video temporal grounding aims to identify video segments within untrimmed
+videos that are most relevant to a given natural language query. Existing video
+temporal localization models rely on specific datasets for training and have
+high data collection costs, yet they exhibit poor generalization capability
+under cross-dataset and out-of-distribution (OOD) settings. In this paper, we
+propose a Training-Free Video Temporal Grounding (TFVTG) approach that
+leverages the ability of pre-trained large models. A naive baseline is to
+enumerate proposals in the video and use the pre-trained visual language models
+(VLMs) to select the best proposal according to the vision-language alignment.
+However, most existing VLMs are trained on image-text pairs or trimmed video
+clip-text pairs, making them struggle to (1) grasp the relationship and
+distinguish the temporal boundaries of multiple events within the same video;
+(2) comprehend and be sensitive to the dynamic transition of events (the
+transition from one event to another) in the video. To address these issues, we
+propose leveraging large language models (LLMs) to analyze the multiple
+sub-events contained in the query text and to reason about the temporal order
+and relationships between these events. Secondly, we split a sub-event into
+dynamic transition and static status parts and propose dynamic and static
+scoring functions using VLMs to better evaluate the relevance between the event
+and the description. Finally, for each sub-event description, we use VLMs to
+locate the top-k proposals and leverage the order and relationships between
+sub-events provided by LLMs to filter and integrate these proposals. Our method
+achieves the best performance on zero-shot video temporal grounding on the
+Charades-STA and ActivityNet Captions datasets without any training and
+demonstrates better generalization capabilities in cross-dataset and OOD
+settings.
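An illustrative sketch of the dynamic and static scoring idea outlined above, assuming per-frame vision-language similarities have already been computed for one sub-event description; the margin, the weighting, and the proposal format are invented for the example.

```python
import numpy as np

def static_score(sim, start, end):
    # mean vision-language similarity inside the proposal (static status)
    return float(sim[start:end].mean())

def dynamic_score(sim, start, end, margin=5):
    # similarity should be low just before the event and high inside it (transition)
    before = sim[max(0, start - margin):start]
    inside = sim[start:end]
    if len(before) == 0:
        return 0.0
    return float(inside.mean() - before.mean())

def score_proposals(sim, proposals, alpha=0.5):
    # combine both cues; proposals are (start, end) frame indices
    return [alpha * static_score(sim, s, e) + (1 - alpha) * dynamic_score(sim, s, e)
            for s, e in proposals]

sim = np.concatenate([np.full(30, 0.2), np.full(40, 0.8), np.full(30, 0.3)])
proposals = [(0, 30), (30, 70), (60, 100)]
print(score_proposals(sim, proposals))   # the middle proposal scores highest
```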
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ M4CXR: Exploring Multi-task Potentials of Multi-modal Large Language + Models for Chest X-ray Interpretation + + +
+ The rapid evolution of artificial intelligence, especially in large language +models (LLMs), has significantly impacted various domains, including +healthcare. In chest X-ray (CXR) analysis, previous studies have employed LLMs, +but with limitations: either underutilizing the multi-tasking capabilities of +LLMs or lacking clinical accuracy. This paper presents M4CXR, a multi-modal LLM +designed to enhance CXR interpretation. The model is trained on a visual +instruction-following dataset that integrates various task-specific datasets in +a conversational format. As a result, the model supports multiple tasks such as +medical report generation (MRG), visual grounding, and visual question +answering (VQA). M4CXR achieves state-of-the-art clinical accuracy in MRG by +employing a chain-of-thought prompting strategy, in which it identifies +findings in CXR images and subsequently generates corresponding reports. The +model is adaptable to various MRG scenarios depending on the available inputs, +such as single-image, multi-image, and multi-study contexts. In addition to +MRG, M4CXR performs visual grounding at a level comparable to specialized +models and also demonstrates outstanding performance in VQA. Both quantitative +and qualitative assessments reveal M4CXR's versatility in MRG, visual +grounding, and VQA, while consistently maintaining clinical accuracy. + +
+
+
+
+
+ + ☆ Uni-3DAD: GAN-Inversion Aided Universal 3D Anomaly Detection on + Model-free Products + + +
+ Anomaly detection is a long-standing challenge in manufacturing systems.
+Traditionally, anomaly detection has relied on human inspectors. However, 3D
+point clouds have gained attention due to their robustness to environmental
+factors and their ability to represent geometric data. Existing 3D anomaly
+detection methods generally fall into two categories. One compares scanned 3D
+point clouds with design files, assuming these files are always available.
+However, such assumptions are often violated in many real-world applications
+where model-free products exist, such as fresh produce (e.g., ``Cookie",
+``Potato", etc.), dentures, bone, etc. The other category compares patches of
+scanned 3D point clouds with a library of normal patches named the memory bank.
+However, those methods usually fail to detect incomplete shapes, which is a
+fairly common defect type (e.g., missing pieces of different products). The
+main challenge is that missing areas in 3D point clouds represent the absence
+of scanned points. This makes it infeasible to compare the missing region with
+existing point cloud patches in the memory bank. To address these two
+challenges, we propose a unified, unsupervised 3D anomaly detection framework
+capable of identifying all types of defects on model-free products. Our method
+integrates two detection modules: a feature-based detection module and a
+reconstruction-based detection module. Feature-based detection covers geometric
+defects, such as dents, holes, and cracks, while the reconstruction-based
+method detects missing regions. Additionally, we employ a One-class Support
+Vector Machine (OCSVM) to fuse the detection results from both modules. The
+results demonstrate that (1) our proposed method outperforms the
+state-of-the-art methods in identifying incomplete shapes and (2) it still
+maintains performance comparable to the SOTA methods in detecting all other
+types of anomalies.
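A hedged sketch of the score-fusion step: a One-Class SVM is fit only on score pairs from normal samples and then flags parts whose (feature-based, reconstruction-based) scores fall outside the learned region. The score values below are simulated stand-ins; the real detection modules would supply them.

```python
import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
# (feature-based score, reconstruction-based score) for normal training parts
normal_scores = rng.normal(loc=[0.1, 0.1], scale=0.05, size=(200, 2))

ocsvm = OneClassSVM(kernel="rbf", nu=0.05, gamma="scale").fit(normal_scores)

# test parts: one normal, one geometric defect, one missing-region defect
test_scores = np.array([[0.12, 0.09],   # normal
                        [0.60, 0.11],   # high feature-based score (dent/crack)
                        [0.10, 0.70]])  # high reconstruction score (missing region)
print(ocsvm.predict(test_scores))       # +1 = normal, -1 = anomalous
```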
+
+
+
+
+ + ☆ PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object + Detection in Bird's-Eye-View + + +
+ Recently, LSS-based multi-view 3D object detection provides an economical and
+deployment-friendly solution for autonomous driving. However, all the existing
+LSS-based methods transform multi-view image features into a Cartesian
+Bird's-Eye-View (BEV) representation, which does not take into account the
+non-uniform image information distribution and hardly exploits the view
+symmetry. In this paper, in order to adapt to the image information
+distribution and preserve the view symmetry under regular convolution, we
+propose to employ the polar BEV representation to substitute the Cartesian BEV
+representation. To achieve this, we elaborately tailor three modules: a polar
+view transformer to generate the polar BEV representation, a polar temporal
+fusion module for fusing historical polar BEV features and a polar detection
+head to predict the polar-parameterized representation of the object. In
+addition, we design a 2D auxiliary detection head and a spatial attention
+enhancement module to improve the quality of feature extraction in perspective
+view and BEV, respectively. Finally, we integrate the above improvements into a
+novel multi-view 3D object detector, PolarBEVDet. Experiments on nuScenes show
+that PolarBEVDet achieves superior performance. The code is available at
+https://github.com/Yzichen/PolarBEVDet.git.
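A minimal sketch of resampling Cartesian BEV features onto a polar grid, which is the representational change at the heart of the approach above; the ranges, resolutions, and use of bilinear sampling are illustrative assumptions rather than the paper's polar view transformer.

```python
import math
import torch
import torch.nn.functional as F

def cartesian_to_polar_bev(bev, num_r=128, num_theta=256, max_range=1.0):
    # bev: (B, C, H, W) Cartesian features covering [-max_range, max_range]^2
    r = torch.linspace(0, max_range, num_r)
    theta = torch.linspace(-math.pi, math.pi, num_theta)
    rr, tt = torch.meshgrid(r, theta, indexing="ij")
    x = rr * torch.cos(tt) / max_range          # normalised to [-1, 1]
    y = rr * torch.sin(tt) / max_range
    grid = torch.stack([x, y], dim=-1).unsqueeze(0).expand(bev.shape[0], -1, -1, -1)
    # each polar cell (radius, azimuth) bilinearly samples its Cartesian location
    return F.grid_sample(bev, grid, align_corners=True)   # (B, C, num_r, num_theta)

bev = torch.randn(2, 64, 200, 200)
polar = cartesian_to_polar_bev(bev)
print(polar.shape)   # torch.Size([2, 64, 128, 256])
```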
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ DLM-VMTL:A Double Layer Mapper for heterogeneous data video Multi-task + prompt learning + + +
+ In recent years, the parameter counts of backbones for video understanding
+tasks have continued to grow, even reaching the billion level. Whether
+fine-tuning a video foundation model on a specific task or pre-training a model
+designed for that specific task, the overhead is substantial. How to make these
+models provide value beyond their own tasks therefore becomes a worthwhile
+question. Multi-Task Learning (MTL) allows a visual task to acquire rich,
+shareable knowledge from other tasks through joint training. It has been
+thoroughly explored for image recognition tasks, especially dense prediction
+tasks. Nevertheless, it is rarely used in the video domain due to the lack of
+multi-label video data. In this paper, a heterogeneous-data video multi-task
+prompt learning (VMTL) method is proposed to address this problem. Unlike its
+counterpart in the image domain, a Double-Layer Mapper (DLM) is proposed to
+extract the shareable knowledge into visual prompts and align them with the
+representation of the primary task. Extensive experiments show that our
+DLM-VMTL outperforms the baselines on 6 different video understanding tasks and
+11 datasets.
+
+
+
+
+ + ☆ Estimating Dynamic Flow Features in Groups of Tracked Objects + + +
+ Interpreting motion captured in image sequences is crucial for a wide range +of computer vision applications. Typical estimation approaches include optical +flow (OF), which approximates the apparent motion instantaneously in a scene, +and multiple object tracking (MOT), which tracks the motion of subjects over +time. Often, the motion of objects in a scene is governed by some underlying +dynamical system which could be inferred by analyzing the motion of groups of +objects. Standard motion analyses, however, are not designed to intuit flow +dynamics from trajectory data, making such measurements difficult in practice. +The goal of this work is to extend gradient-based dynamical systems analyses to +real-world applications characterized by complex, feature-rich image sequences +with imperfect tracers. The tracer trajectories are tracked using deep vision +networks and gradients are approximated using Lagrangian gradient regression +(LGR), a tool designed to estimate spatial gradients from sparse data. From +gradients, dynamical features such as regions of coherent rotation and +transport barriers are identified. The proposed approach is affordably +implemented and enables advanced studies including the motion analysis of two +distinct object classes in a single image sequence. Two examples of the method +are presented on data sets for which standard gradient-based analyses do not +apply. + +
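A hedged sketch in the spirit of the Lagrangian gradient regression mentioned above (not the authors' implementation): velocity differences between neighbouring tracers are regressed on their position differences to estimate the local velocity gradient, from which quantities such as vorticity and regions of coherent rotation follow.

```python
import numpy as np

def local_velocity_gradient(positions, velocities, center_idx, k=8):
    # positions, velocities: (N, 2) tracer states at one instant
    d = np.linalg.norm(positions - positions[center_idx], axis=1)
    nbrs = np.argsort(d)[1:k + 1]                          # k nearest neighbours
    dX = positions[nbrs] - positions[center_idx]           # (k, 2)
    dU = velocities[nbrs] - velocities[center_idx]         # (k, 2)
    grad, *_ = np.linalg.lstsq(dX, dU, rcond=None)         # dU ≈ dX @ grad
    return grad.T                                          # J[i, j] = du_i / dx_j

# synthetic solid-body rotation: u = (-y, x), true vorticity = 2
rng = np.random.default_rng(1)
pos = rng.uniform(-1, 1, size=(50, 2))
vel = np.stack([-pos[:, 1], pos[:, 0]], axis=1)
J = local_velocity_gradient(pos, vel, center_idx=0)
vorticity = J[1, 0] - J[0, 1]
print(round(float(vorticity), 3))   # ≈ 2.0 for this flow
```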
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ☆ VLM-KD: Knowledge Distillation from VLM for Long-Tail Visual Recognition + + +
+ For visual recognition, knowledge distillation typically involves +transferring knowledge from a large, well-trained teacher model to a smaller +student model. In this paper, we introduce an effective method to distill +knowledge from an off-the-shelf vision-language model (VLM), demonstrating that +it provides novel supervision in addition to those from a conventional +vision-only teacher model. Our key technical contribution is the development of +a framework that generates novel text supervision and distills free-form text +into a vision encoder. We showcase the effectiveness of our approach, termed +VLM-KD, across various benchmark datasets, showing that it surpasses several +state-of-the-art long-tail visual classifiers. To our knowledge, this work is +the first to utilize knowledge distillation with text supervision generated by +an off-the-shelf VLM and apply it to vanilla randomly initialized vision +encoders. + +
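A minimal sketch of distilling free-form text supervision into a vision encoder, as the approach above does at a high level; the backbone, projection size, cosine objective, and loss weighting are assumptions made for illustration, and the text embeddings stand in for captions embedded offline by a VLM.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistilledClassifier(nn.Module):
    def __init__(self, feat_dim=512, text_dim=768, num_classes=1000):
        super().__init__()
        self.backbone = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, feat_dim))
        self.cls_head = nn.Linear(feat_dim, num_classes)
        self.text_proj = nn.Linear(feat_dim, text_dim)   # maps into the text space

    def forward(self, images):
        feats = self.backbone(images)
        return self.cls_head(feats), self.text_proj(feats)

def vlm_kd_loss(logits, labels, proj_feats, text_emb, lam=0.5):
    ce = F.cross_entropy(logits, labels)                         # usual supervision
    kd = 1.0 - F.cosine_similarity(proj_feats, text_emb, dim=-1).mean()  # text target
    return ce + lam * kd

model = DistilledClassifier()
images = torch.randn(4, 3, 32, 32)
labels = torch.randint(0, 1000, (4,))
text_emb = torch.randn(4, 768)        # stand-in for per-image VLM caption embeddings
logits, proj = model(images)
vlm_kd_loss(logits, labels, proj, text_emb).backward()
```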
+
+
+
+
+ + ☆ Enhancing Autism Spectrum Disorder Early Detection with the Parent-Child + Dyads Block-Play Protocol and an Attention-enhanced GCN-xLSTM Hybrid Deep + Learning Framework + + +
+ Autism Spectrum Disorder (ASD) is a rapidly growing neurodevelopmental +disorder. Performing a timely intervention is crucial for the growth of young +children with ASD, but traditional clinical screening methods lack objectivity. +This study introduces an innovative approach to early detection of ASD. The +contributions are threefold. First, this work proposes a novel Parent-Child +Dyads Block-Play (PCB) protocol, grounded in kinesiological and neuroscientific +research, to identify behavioral patterns distinguishing ASD from typically +developing (TD) toddlers. Second, we have compiled a substantial video dataset, +featuring 40 ASD and 89 TD toddlers engaged in block play with parents. This +dataset exceeds previous efforts on both the scale of participants and the +length of individual sessions. Third, our approach to action analysis in videos +employs a hybrid deep learning framework, integrating a two-stream graph +convolution network with attention-enhanced xLSTM (2sGCN-AxLSTM). This +framework is adept at capturing dynamic interactions between toddlers and +parents by extracting spatial features correlated with upper body and head +movements and focusing on global contextual information of action sequences +over time. By learning these global features with spatio-temporal correlations, +our 2sGCN-AxLSTM effectively analyzes dynamic human behavior patterns and +demonstrates an unprecedented accuracy of 89.6\% in early detection of ASD. Our +approach shows strong potential for enhancing early ASD diagnosis by accurately +analyzing parent-child interactions, providing a critical tool to support +timely and informed clinical decision-making. + +
+
+ comment: 18 pages, 8 figures, and 4 tables +
+
+
+
+
+ + ☆ Ig3D: Integrating 3D Face Representations in Facial Expression Inference ECCV + + +
+ Reconstructing 3D faces with facial geometry from single images has allowed
+for major advances in animation, generative models, and virtual reality.
+However, this ability to represent faces with their 3D features is not as fully
+explored by the facial expression inference (FEI) community. This study
+therefore aims to investigate the impacts of integrating such 3D
+representations into the FEI task, specifically for facial expression
+classification and face-based valence-arousal (VA) estimation. To accomplish
+this, we first assess the performance of two 3D face representations (both
+based on the 3D morphable model, FLAME) for the FEI tasks. We further explore
+two fusion architectures, intermediate fusion and late fusion, for integrating
+the 3D face representations with existing 2D inference frameworks. To evaluate
+our proposed architecture, we extract the corresponding 3D representations and
+perform extensive tests on the AffectNet and RAF-DB datasets. Our experimental
+results demonstrate that our proposed method outperforms the state of the art
+on the AffectNet VA estimation and RAF-DB classification tasks. Moreover, our
+method can act as a complement to other existing methods to boost performance
+in many emotion inference tasks.
+
+ comment: Accepted by ECCVW 2024 +
+
+
+
+
+ + ☆ Tex-ViT: A Generalizable, Robust, Texture-based dual-branch + cross-attention deepfake detector + + +
+ Deepfakes, which employ GANs to produce highly realistic facial
+modifications, are widely regarded as the prevailing method of facial
+manipulation. Traditional CNNs have been able to identify bogus media, but they
+struggle to perform well across different datasets and are vulnerable to
+adversarial attacks due to their lack of robustness. Vision transformers have
+demonstrated potential in the realm of image classification problems, but they
+require sufficient training data. Motivated by these limitations, this
+publication introduces Tex-ViT (Texture-Vision Transformer), which enhances CNN
+features by combining ResNet with a vision transformer. The model combines
+traditional ResNet features with a texture module that operates in parallel on
+sections of ResNet before each down-sampling operation. The texture module then
+serves as an input to the dual branch of the cross-attention vision
+transformer. It specifically focuses on improving the global texture module,
+which extracts feature-map correlations. Empirical analysis reveals that fake
+images exhibit smooth textures that do not remain consistent over long
+distances in manipulations. Experiments were performed on different categories
+of FF++, such as DF, f2f, FS, and NT, together with other types of GAN datasets
+in cross-domain scenarios. Furthermore, experiments were also conducted on the
+FF++, DFDCPreview, and Celeb-DF datasets under several post-processing
+conditions, such as blurring, compression, and noise. The model surpassed the
+most advanced models in terms of generalization, achieving 98% accuracy in
+cross-domain scenarios. This demonstrates its ability to learn the shared
+distinguishing textural characteristics in the manipulated samples. These
+experiments provide evidence that the proposed model is capable of being
+applied to various situations and is resistant to many post-processing
+procedures.
+
+
+
+
+ + ☆ LV-UNet: A Lightweight and Vanilla Model for Medical Image Segmentation + + +
+ Despite the progress made by large models in computer vision, optimization
+challenges, the complexity of transformer models, computational limitations,
+and the requirements of practical applications call for simpler model
+architecture designs in medical image segmentation, especially for mobile
+medical devices that require lightweight and deployable models with real-time
+performance. However, some of the current lightweight models exhibit poor
+robustness across different datasets, which hinders their broader adoption.
+This paper proposes a lightweight and vanilla model called LV-UNet, which
+effectively utilizes pre-trained MobileNetV3-Large models and introduces
+fusible modules. It can be trained using an improved deep training strategy and
+switched to deployment mode during inference, reducing both parameter count and
+computational load. Experiments are conducted on the ISIC 2016, BUSI,
+CVC-ClinicDB, CVC-ColonDB, and Kvasir-SEG datasets, achieving better
+performance compared to the state-of-the-art and classic models.
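The deployment-time switch relies on re-parameterizing fusible modules; shown below is the standard convolution/batch-norm folding that such designs typically build on (the paper's exact fusible modules may differ), where BN statistics are absorbed into the convolution so the deployed model carries fewer layers and parameters.

```python
import torch
import torch.nn as nn

def fuse_conv_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
    fused = nn.Conv2d(conv.in_channels, conv.out_channels, conv.kernel_size,
                      conv.stride, conv.padding, bias=True)
    w = conv.weight.clone()
    b = conv.bias.clone() if conv.bias is not None else torch.zeros(conv.out_channels)
    std = torch.sqrt(bn.running_var + bn.eps)
    # fold the per-channel BN scale/shift into the convolution weights and bias
    fused.weight.data = w * (bn.weight / std).reshape(-1, 1, 1, 1)
    fused.bias.data = bn.weight * (b - bn.running_mean) / std + bn.bias
    return fused

conv = nn.Conv2d(8, 16, 3, padding=1)
bn = nn.BatchNorm2d(16)
conv.eval(); bn.eval()
x = torch.randn(1, 8, 20, 20)
with torch.no_grad():
    y_separate = bn(conv(x))
    y_deployed = fuse_conv_bn(conv, bn)(x)
print(torch.allclose(y_separate, y_deployed, atol=1e-5))   # True
```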
+
+
+
+
+ + ☆ Revising Multimodal VAEs with Diffusion Decoders + + +
+ Multimodal VAEs often struggle with generating high-quality outputs, a
+challenge that extends beyond the inherent limitations of the VAE framework.
+The core issue lies in the restricted joint representation of the latent space,
+particularly when complex modalities like images are involved. Feedforward
+decoders, commonly used for these intricate modalities, inadvertently constrain
+the joint latent space, leading to a degradation in the quality of the other
+modalities as well. Although recent studies have shown improvement by
+introducing modality-specific representations, the issue remains significant.
+In this work, we demonstrate that incorporating a flexible diffusion decoder
+specifically for the image modality not only enhances the generation quality of
+the images but also positively impacts the performance of the other modalities
+that rely on feedforward decoders. This approach addresses the limitations
+imposed by conventional joint representations and opens up new possibilities
+for improving multimodal generation tasks using the multimodal VAE framework.
+Our model provides state-of-the-art results compared to other multimodal VAEs
+on different datasets, with higher coherence and superior quality in the
+generated modalities.
+
+
+
+
+ + ☆ FineFACE: Fair Facial Attribute Classification Leveraging Fine-grained + Features + + +
+ Published research highlights the presence of demographic bias in automated +facial attribute classification algorithms, particularly impacting women and +individuals with darker skin tones. Existing bias mitigation techniques +typically require demographic annotations and often obtain a trade-off between +fairness and accuracy, i.e., Pareto inefficiency. Facial attributes, whether +common ones like gender or others such as "chubby" or "high cheekbones", +exhibit high interclass similarity and intraclass variation across demographics +leading to unequal accuracy. This requires the use of local and subtle cues +using fine-grained analysis for differentiation. This paper proposes a novel +approach to fair facial attribute classification by framing it as a +fine-grained classification problem. Our approach effectively integrates both +low-level local features (like edges and color) and high-level semantic +features (like shapes and structures) through cross-layer mutual attention +learning. Here, shallow to deep CNN layers function as experts, offering +category predictions and attention regions. An exhaustive evaluation on facial +attribute annotated datasets demonstrates that our FineFACE model improves +accuracy by 1.32% to 1.74% and fairness by 67% to 83.6%, over the SOTA bias +mitigation techniques. Importantly, our approach obtains a Pareto-efficient +balance between accuracy and fairness between demographic groups. In addition, +our approach does not require demographic annotations and is applicable to +diverse downstream classification tasks. To facilitate reproducibility, the +code and dataset information is available at +https://github.com/VCBSL-Fairness/FineFACE. + +
+
+
+
+
+ + ☆ MSLIQA: Enhancing Learning Representations for Image Quality Assessment + through Multi-Scale Learning + + +
+ No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due +to the diversity of distortions and the lack of large annotated datasets. Many +studies have attempted to tackle these challenges by developing more accurate +NR-IQA models, often employing complex and computationally expensive networks, +or by bridging the domain gap between various distortions to enhance +performance on test datasets. In our work, we improve the performance of a +generic lightweight NR-IQA model by introducing a novel augmentation strategy +that boosts its performance by almost 28\%. This augmentation strategy enables +the network to better discriminate between different distortions in various +parts of the image by zooming in and out. Additionally, the inclusion of +test-time augmentation further enhances performance, making our lightweight +network's results comparable to the current state-of-the-art models, simply +through the use of augmentations. + +
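An illustrative sketch, under stated assumptions, of what a zoom-in/zoom-out augmentation plus test-time augmentation could look like for a quality model: random crops emulate zooming in, downscale-and-pad emulates zooming out, and predictions over several views are averaged. The exact strategy in the paper is not reproduced here; the transforms and the toy model are placeholders.

```python
import torch
import torchvision.transforms as T
from PIL import Image

zoom_in  = T.Compose([T.RandomResizedCrop(224, scale=(0.3, 0.7)), T.ToTensor()])
zoom_out = T.Compose([T.Resize((160, 160)), T.Pad(32), T.ToTensor()])   # 160 + 2*32 = 224
identity = T.Compose([T.Resize((224, 224)), T.ToTensor()])

def predict_with_tta(model, image, n_views=4):
    # average the predicted quality score over zoomed-in, zoomed-out and plain views
    views = [identity(image)] + [zoom_in(image) for _ in range(n_views // 2)] \
            + [zoom_out(image) for _ in range(n_views // 2)]
    with torch.no_grad():
        scores = torch.stack([model(v.unsqueeze(0)) for v in views])
    return scores.mean()

model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 224 * 224, 1))
image = Image.new("RGB", (640, 480))        # stand-in image
print(float(predict_with_tta(model, image)))
```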
+
+
+
+
+ + ☆ GameIR: A Large-Scale Synthesized Ground-Truth Dataset for Image + Restoration over Gaming Content + + +
+ Image restoration methods like super-resolution and image synthesis have been
+successfully used in commercial cloud gaming products like NVIDIA's DLSS.
+However, restoration over gaming content is not well studied by the broader
+research community. The discrepancy is mainly caused by the lack of
+ground-truth gaming training data that match the test cases. Due to the unique
+characteristics of gaming content, the common approach of generating pseudo
+training data by degrading the original HR images results in inferior
+restoration performance. In this work, we develop GameIR, a large-scale
+high-quality computer-synthesized ground-truth dataset to fill this gap,
+targeting two different applications. The first is super-resolution with
+deferred rendering, to support the gaming solution of rendering and
+transferring LR images only and restoring HR images on the client side. We
+provide 19200 LR-HR paired ground-truth frames coming from 640 videos rendered
+at 720p and 1440p for this task. The second is novel view synthesis (NVS), to
+support the multiview gaming solution of rendering and transferring part of the
+multiview frames and generating the remaining frames on the client side. This
+task has 57,600 HR frames from 960 videos of 160 scenes with 6 camera views. In
+addition to the RGB frames, the GBuffers during the deferred rendering stage
+are also provided, which can be used to help restoration. Furthermore, we
+evaluate several SOTA super-resolution algorithms and NeRF-based NVS algorithms
+over our dataset, which demonstrates the effectiveness of our ground-truth
+GameIR data in improving restoration performance for gaming content. We also
+test incorporating the GBuffers as additional input information to aid
+super-resolution and NVS. We release our dataset and models to the general
+public to facilitate research on restoration methods over gaming content.
+
+
+
+
+ + ☆ Comparative Analysis of Transfer Learning Models for Breast Cancer + Classification + + +
+ The classification of histopathological images is crucial for the early and
+precise detection of breast cancer. This study investigates the effectiveness
+of deep learning models in distinguishing between Invasive Ductal Carcinoma
+(IDC) and non-IDC in histopathology slides. We conducted a thorough comparative
+examination of eight sophisticated models: ResNet-50, DenseNet-121, ResNeXt-50,
+Vision Transformer (ViT), GoogLeNet (Inception v3), EfficientNet, MobileNet,
+and SqueezeNet. This analysis was carried out using a large dataset of 277,524
+image patches. Our research makes a substantial contribution to the field by
+offering a comprehensive assessment of the performance of each model. We
+particularly highlight the exceptional efficacy of attention-based mechanisms
+in the ViT model, which achieved a remarkable validation accuracy of 93\%,
+surpassing conventional convolutional networks. This study highlights the
+promise of advanced machine learning approaches in clinical settings, offering
+improved precision as well as efficiency in breast cancer diagnosis.
+
+
+
+
+ + ☆ Enabling Local Editing in Diffusion Models by Joint and Individual + Component Analysis + + +
+ Recent advances in Diffusion Models (DMs) have led to significant progress in
+visual synthesis and editing tasks, establishing them as a strong competitor to
+Generative Adversarial Networks (GANs). However, the latent space of DMs is not
+as well understood as that of GANs. Recent research has focused on unsupervised
+semantic discovery in the latent space of DMs by leveraging the bottleneck
+layer of the denoising network, which has been shown to exhibit properties of a
+semantic latent space. However, these approaches are limited to discovering
+global attributes. In this paper, we address the challenge of local image
+manipulation in DMs and introduce an unsupervised method to factorize the
+latent semantics learned by the denoising network of pre-trained DMs. Given an
+arbitrary image and defined regions of interest, we utilize the Jacobian of the
+denoising network to establish a relation between the regions of interest and
+their corresponding subspaces in the latent space. Furthermore, we disentangle
+the joint and individual components of these subspaces to identify latent
+directions that enable local image manipulation. Once discovered, these
+directions can be applied to different images to produce semantically
+consistent edits, making our method suitable for practical applications.
+Experimental results on various datasets demonstrate that our method can
+produce semantic edits that are more localized and have better fidelity
+compared to the state-of-the-art.
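A hedged, toy-scale sketch of the core mechanic described above: the Jacobian of a stand-in "denoiser" restricted to a region of interest is decomposed to obtain a latent direction that predominantly affects that region. The linear stand-in network, the ROI mask, the SVD-based direction extraction, and the step size are illustrative assumptions, not the paper's factorization.

```python
import torch

torch.manual_seed(0)
latent_dim, out_dim = 16, 64
denoiser = torch.nn.Linear(latent_dim, out_dim, bias=False)   # toy stand-in network

roi = torch.zeros(out_dim)
roi[:8] = 1.0                                        # region of interest: first 8 outputs

def roi_output(z):
    return denoiser(z) * roi                         # only the ROI contributes

z = torch.randn(latent_dim)
J = torch.autograd.functional.jacobian(roi_output, z)        # (out_dim, latent_dim)
_, _, Vh = torch.linalg.svd(J, full_matrices=False)
direction = Vh[0]                                    # latent direction dominating the ROI

edited = z + 3.0 * direction
delta = (denoiser(edited) - denoiser(z)).abs()
print(delta[:8].mean() > delta[8:].mean())           # change concentrates in the ROI
```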
+
+ comment: Code available here: https://zelaki.github.io/localdiff/ +
+
+
+
+
+ + ☆ Fluent and Accurate Image Captioning with a Self-Trained Reward Model ICPR 2024 + + +
+ Fine-tuning image captioning models with hand-crafted rewards like the CIDEr +metric has been a classical strategy for promoting caption quality at the +sequence level. This approach, however, is known to limit descriptiveness and +semantic richness and tends to drive the model towards the style of +ground-truth sentences, thus losing detail and specificity. On the contrary, +recent attempts to employ image-text models like CLIP as reward have led to +grammatically incorrect and repetitive captions. In this paper, we propose +Self-Cap, a captioning approach that relies on a learnable reward model based +on self-generated negatives that can discriminate captions based on their +consistency with the image. Specifically, our discriminator is a fine-tuned +contrastive image-text model trained to promote caption correctness while +avoiding the aberrations that typically happen when training with a CLIP-based +reward. To this end, our discriminator directly incorporates negative samples +from a frozen captioner, which significantly improves the quality and richness +of the generated captions but also reduces the fine-tuning time in comparison +to using the CIDEr score as the sole metric for optimization. Experimental +results demonstrate the effectiveness of our training strategy on both standard +and zero-shot image captioning datasets. + +
+
+ comment: ICPR 2024 +
+
+
+
+
+ + ☆ See or Guess: Counterfactually Regularized Image Captioning ACM MM 2024 + + +
+ Image captioning, which generates natural language descriptions of the visual +information in an image, is a crucial task in vision-language research. +Previous models have typically addressed this task by aligning the generative +capabilities of machines with human intelligence through statistical fitting of +existing datasets. While effective for normal images, they may struggle to +accurately describe those where certain parts of the image are obscured or +edited, unlike humans who excel in such cases. These weaknesses they exhibit, +including hallucinations and limited interpretability, often hinder performance +in scenarios with shifted association patterns. In this paper, we present a +generic image captioning framework that employs causal inference to make +existing models more capable of interventional tasks, and counterfactually +explainable. Our approach includes two variants leveraging either total effect +or natural direct effect. Integrating them into the training process enables +models to handle counterfactual scenarios, increasing their generalizability. +Extensive experiments on various datasets show that our method effectively +reduces hallucinations and improves the model's faithfulness to images, +demonstrating high portability across both small-scale and large-scale +image-to-text models. The code is available at +https://github.com/Aman-4-Real/See-or-Guess. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ STEREO: Towards Adversarially Robust Concept Erasing from Text-to-Image + Generation Models + + +
+ The rapid proliferation of large-scale text-to-image generation (T2IG) models +has led to concerns about their potential misuse in generating harmful content. +Though many methods have been proposed for erasing undesired concepts from T2IG +models, they only provide a false sense of security, as recent works +demonstrate that concept-erased models (CEMs) can be easily deceived to +generate the erased concept through adversarial attacks. The problem of +adversarially robust concept erasing without significant degradation to model +utility (ability to generate benign concepts) remains an unresolved challenge, +especially in the white-box setting where the adversary has access to the CEM. +To address this gap, we propose an approach called STEREO that involves two +distinct stages. The first stage searches thoroughly enough for strong and +diverse adversarial prompts that can regenerate an erased concept from a CEM, +by leveraging robust optimization principles from adversarial training. In the +second robustly erase once stage, we introduce an anchor-concept-based +compositional objective to robustly erase the target concept at one go, while +attempting to minimize the degradation on model utility. By benchmarking the +proposed STEREO approach against four state-of-the-art concept erasure methods +under three adversarial attacks, we demonstrate its ability to achieve a better +robustness vs. utility trade-off. Our code and models are available at +https://github.com/koushiksrivats/robust-concept-erasing. + +
+
+ comment: Project Page: + https://koushiksrivats.github.io/robust-concept-erasing/ +
+
+
+
+
+ + ♻ ☆ VGBench: Evaluating Large Language Models on Vector Graphics + Understanding and Generation + + +
+ In the realm of vision models, the primary mode of representation is using +pixels to rasterize the visual world. Yet this is not always the best or unique +way to represent visual content, especially for designers and artists who +depict the world using geometry primitives such as polygons. Vector graphics +(VG), on the other hand, offer a textual representation of visual content, +which can be more concise and powerful for content like cartoons, sketches and +scientific figures. Recent studies have shown promising results on processing +vector graphics with capable Large Language Models (LLMs). However, such works +focus solely on qualitative results, understanding, or a specific type of +vector graphics. We propose VGBench, a comprehensive benchmark for LLMs on +handling vector graphics through diverse aspects, including (a) both visual +understanding and generation, (b) evaluation of various vector graphics +formats, (c) diverse question types, (d) wide range of prompting techniques, +(e) under multiple LLMs and (f) comparison with VLMs on rasterized +representations. Evaluating on our collected 4279 understanding and 5845 +generation samples, we find that LLMs show strong capability on both aspects +while exhibiting less desirable performance on low-level formats (SVG). Both +data and evaluation pipeline will be open-sourced at https://vgbench.github.io. + +
+
+ comment: Project Page: https://vgbench.github.io +
+
+
+
+
+ + ♻ ☆ GRAB: A Challenging GRaph Analysis Benchmark for Large Multimodal Models + + +
+ Large multimodal models (LMMs) have exhibited proficiencies across many +visual tasks. Although numerous well-known benchmarks exist to evaluate model +performance, they increasingly have insufficient headroom. As such, there is a +pressing need for a new generation of benchmarks challenging enough for the +next generation of LMMs. One area that LMMs show potential is graph analysis, +specifically, the tasks an analyst might typically perform when interpreting +figures such as estimating the mean, intercepts or correlations of functions +and data series. In this work, we introduce GRAB, a graph analysis benchmark, +fit for current and future frontier LMMs. Our benchmark is entirely synthetic, +ensuring high-quality, noise-free questions. GRAB is comprised of 2170 +questions, covering four tasks and 23 graph properties. We evaluate 20 LMMs on +GRAB, finding it to be a challenging benchmark, with the highest performing +model attaining a score of just 21.7%. Finally, we conduct various ablations to +investigate where the models succeed and struggle. We release GRAB to encourage +progress in this important, growing domain. + +
+
+ comment: V2: Fixed references formatting +
+
+
+
+
+ + ♻ ☆ Evaluation Framework for Feedback Generation Methods in Skeletal + Movement Assessment ECCV 2024 + + +
+ The application of machine-learning solutions to movement assessment from +skeleton videos has attracted significant research attention in recent years. +This advancement has made rehabilitation at home more accessible, utilizing +movement assessment algorithms that can operate on affordable equipment for +human pose detection and analysis from 2D or 3D videos. While the primary +objective of automatic assessment tasks is to score movements, the automatic +generation of feedback highlighting key movement issues has the potential to +significantly enhance and accelerate the rehabilitation process. While numerous +research works exist in the field of automatic movement assessment, only a +handful address feedback generation. In this study, we propose terminology and +criteria for the classification, evaluation, and comparison of feedback +generation solutions. We discuss the challenges associated with each feedback +generation approach and use our proposed criteria to classify existing +solutions. To our knowledge, this is the first work that formulates feedback +generation in skeletal movement assessment. + +
+
+ comment: Accepted to xAI4Biometrics 2024 at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ OpticalRS-4M: Scaling Efficient Masked Autoencoder Learning on Large + Remote Sensing Dataset + + +
+ Masked Image Modeling (MIM) has become an essential method for building +foundational visual models in remote sensing (RS). However, the limitations in +size and diversity of existing RS datasets restrict the ability of MIM methods +to learn generalizable representations. Additionally, conventional MIM +techniques, which require reconstructing all tokens, introduce unnecessary +computational overhead. To address these issues, we present a new pre-training +pipeline for RS models, featuring the creation of a large-scale RS dataset and +an efficient MIM approach. We curated a high-quality dataset named OpticalRS-4M +by collecting publicly available RS datasets and processing them through +exclusion, slicing, and deduplication. OpticalRS-4M comprises 4 million optical +images covering various RS tasks, such as object detection and pixel +segmentation. To enhance efficiency, we propose SelectiveMAE, a pre-training +method that dynamically encodes and reconstructs semantically rich patch +tokens, thereby reducing the inefficiencies of traditional MIM models caused by +redundant background pixels in RS images. Extensive experiments demonstrate +that OpticalRS-4M significantly improves classification, detection, and +segmentation performance, while SelectiveMAE increases training efficiency over +2 times. This highlights the effectiveness and scalability of our pipeline in +developing RS foundational models. + +
+
+
+
+
+ + ♻ ☆ Manipulate-Anything: Automating Real-World Robots using Vision-Language + Models + + +
+ Large-scale endeavors and widespread community efforts such as
+Open-X-Embodiment have contributed to growing the scale of robot demonstration
+data. However, there is still an opportunity to improve the quality, quantity,
+and diversity of robot demonstration data. Although vision-language models have
+been shown to automatically generate demonstration data, their utility has been
+limited to environments with privileged state information, they require
+hand-designed skills, and they are limited to interactions with few object
+instances. We propose Manipulate-Anything, a scalable automated generation
+method for real-world robotic manipulation. Unlike prior work, our method can
+operate in real-world environments without any privileged state information or
+hand-designed skills, and it can manipulate any static object. We evaluate our
+method using two setups. First, Manipulate-Anything successfully generates
+trajectories for all 7 real-world and 14 simulation tasks, significantly
+outperforming existing methods like VoxPoser. Second, Manipulate-Anything's
+demonstrations can train more robust behavior cloning policies than training
+with human demonstrations, or from data generated by VoxPoser, Scaling-up, and
+Code-As-Policies. We believe Manipulate-Anything can be a scalable method for
+both generating data for robotics and solving novel tasks in a zero-shot
+setting. Project page: https://robot-ma.github.io/.
+
+ comment: Project page: https://robot-ma.github.io/. All supplementary + material, prompts and code can be found on the project page +
+
+
+
+
+ + ♻ ☆ Not (yet) the whole story: Evaluating Visual Storytelling Requires More + than Measuring Coherence, Grounding, and Repetition + + +
+ Visual storytelling consists in generating a natural language story given a +temporally ordered sequence of images. This task is not only challenging for +models, but also very difficult to evaluate with automatic metrics since there +is no consensus about what makes a story 'good'. In this paper, we introduce a +novel method that measures story quality in terms of human likeness regarding +three key aspects highlighted in previous work: visual grounding, coherence, +and repetitiveness. We then use this method to evaluate the stories generated +by several models, showing that the foundation model LLaVA obtains the best +result, but only slightly so compared to TAPM, a 50-times smaller visual +storytelling model. Upgrading the visual and language components of TAPM +results in a model that yields competitive performance with a relatively low +number of parameters. Finally, we carry out a human evaluation study, whose +results suggest that a 'good' story may require more than a human-like level of +visual grounding, coherence, and repetition. + +
+
+
+
+
+ + ♻ ☆ Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent + Codes + + +
+ Trajectory forecasting is crucial for video surveillance analytics, as it +enables the anticipation of future movements for a set of agents, e.g. +basketball players engaged in intricate interactions with long-term intentions. +Deep generative models offer a natural learning approach for trajectory +forecasting, yet they encounter difficulties in achieving an optimal balance +between sampling fidelity and diversity. We address this challenge by +leveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a +discrete latent space to tackle the issue of posterior collapse. Specifically, +we introduce an instance-based codebook that allows tailored latent +representations for each example. In a nutshell, the rows of the codebook are +dynamically adjusted to reflect contextual information (i.e., past motion +patterns extracted from the observed trajectories). In this way, the +discretization process gains flexibility, leading to improved reconstructions. +Notably, instance-level dynamics are injected into the codebook through +low-rank updates, which restrict the customization of the codebook to a lower +dimension space. The resulting discrete space serves as the basis of the +subsequent step, which regards the training of a diffusion-based predictive +model. We show that such a two-fold framework, augmented with instance-level +discretization, leads to accurate and diverse forecasts, yielding +state-of-the-art performance on three established benchmarks. + +
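A hedged sketch of an instance-conditioned codebook with low-rank updates, in the spirit of the approach above; the conditioning network, dimensions, and straight-through quantization are illustrative assumptions rather than the authors' model.

```python
import torch
import torch.nn as nn

class LowRankInstanceCodebook(nn.Module):
    def __init__(self, num_codes=64, dim=32, ctx_dim=16, rank=4):
        super().__init__()
        self.base = nn.Parameter(torch.randn(num_codes, dim) * 0.1)   # shared codebook
        self.to_left = nn.Linear(ctx_dim, num_codes * rank)           # A(ctx): (K, r)
        self.right = nn.Parameter(torch.randn(rank, dim) * 0.1)       # B: (r, D)
        self.num_codes, self.rank = num_codes, rank

    def forward(self, z, ctx):
        # z: (B, D) continuous latents, ctx: (B, ctx_dim) past-motion encoding
        A = self.to_left(ctx).view(-1, self.num_codes, self.rank)
        codebook = self.base.unsqueeze(0) + A @ self.right            # rank-r correction
        dists = torch.cdist(z.unsqueeze(1), codebook).squeeze(1)      # (B, K)
        idx = dists.argmin(dim=-1)
        quantized = codebook[torch.arange(z.shape[0]), idx]
        # straight-through estimator so gradients still reach the encoder
        return z + (quantized - z).detach(), idx

cb = LowRankInstanceCodebook()
z = torch.randn(8, 32)
ctx = torch.randn(8, 16)
zq, codes = cb(z, ctx)
print(zq.shape, codes.shape)    # torch.Size([8, 32]) torch.Size([8])
```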
+
+ comment: 15 pages, 3 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Verification of Geometric Robustness of Neural Networks via Piecewise + Linear Approximation and Lipschitz Optimisation ECAI 2024 + + +
+ We address the problem of verifying neural networks against geometric +transformations of the input image, including rotation, scaling, shearing, and +translation. The proposed method computes provably sound piecewise linear +constraints for the pixel values by using sampling and linear approximations in +combination with branch-and-bound Lipschitz optimisation. The method obtains +provably tighter over-approximations of the perturbation region than the +present state-of-the-art. We report results from experiments on a comprehensive +set of verification benchmarks on MNIST and CIFAR10. We show that our proposed +implementation resolves up to 32% more verification cases than present +approaches. + +
+
+ comment: ECAI 2024 +
+
+
+
+
+ + ♻ ☆ Smart Multi-Modal Search: Contextual Sparse and Dense Embedding + Integration in Adobe Express CIKM 2024 + + +
+ As user content and queries become increasingly multi-modal, the need for +effective multi-modal search systems has grown. Traditional search systems +often rely on textual and metadata annotations for indexed images, while +multi-modal embeddings like CLIP enable direct search using text and image +embeddings. However, embedding-based approaches face challenges in integrating +contextual features such as user locale and recency. Building a scalable +multi-modal search system requires fine-tuning several components. This paper +presents a multi-modal search architecture and a series of AB tests that +optimize embeddings and multi-modal technologies in Adobe Express template +search. We address considerations such as embedding model selection, the roles +of embeddings in matching and ranking, and the balance between dense and sparse +embeddings. Our iterative approach demonstrates how utilizing sparse, dense, +and contextual features enhances short and long query search, significantly +reduces null rates (over 70\%), and increases click-through rates (CTR). Our +findings provide insights into developing robust multi-modal search systems, +thereby enhancing relevance for complex queries. + +
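An illustrative sketch of blending sparse lexical scores, dense embedding scores, and contextual signals such as locale match and recency into a single ranking score; the weights, the recency decay, and the min-max normalization are assumptions made for the example, not the production ranking logic.

```python
import numpy as np

def min_max(x):
    x = np.asarray(x, dtype=float)
    span = x.max() - x.min()
    return (x - x.min()) / span if span > 0 else np.zeros_like(x)

def hybrid_rank(sparse_scores, dense_scores, locale_match, age_days,
                w_sparse=0.35, w_dense=0.45, w_locale=0.1, w_recency=0.1):
    recency = np.exp(-np.asarray(age_days, dtype=float) / 30.0)   # ~monthly decay
    score = (w_sparse * min_max(sparse_scores) + w_dense * min_max(dense_scores)
             + w_locale * np.asarray(locale_match, dtype=float)
             + w_recency * min_max(recency))
    return np.argsort(-score), score

order, score = hybrid_rank(
    sparse_scores=[12.1, 3.4, 8.0],     # e.g. lexical/BM25-style scores
    dense_scores=[0.71, 0.80, 0.55],    # e.g. CLIP-style embedding similarities
    locale_match=[1, 0, 1],
    age_days=[2, 120, 30],
)
print(order)    # template indices, best match first
```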
+
+ comment: CIKM 2024 (International Conference on Information and Knowledge + Management), Multimodal Search and Recommendations Workshop +
+
+
+
+
+ + ♻ ☆ On the Efficacy of Text-Based Input Modalities for Action Anticipation + + +
+ Anticipating future actions is a highly challenging task due to the diversity +and scale of potential future actions; yet, information from different +modalities help narrow down plausible action choices. Each modality can provide +diverse and often complementary context for the model to learn from. While +previous multi-modal methods leverage information from modalities such as video +and audio, we primarily explore how text descriptions of actions and objects +can also lead to more accurate action anticipation by providing additional +contextual cues, e.g., about the environment and its contents. We propose a +Multi-modal Contrastive Anticipative Transformer (M-CAT), a video transformer +architecture that jointly learns from multi-modal features and text +descriptions of actions and objects. We train our model in two stages, where +the model first learns to align video clips with descriptions of future +actions, and is subsequently fine-tuned to predict future actions. Compared to +existing methods, M-CAT has the advantage of learning additional context from +two types of text inputs: rich descriptions of future actions during +pre-training, and, text descriptions for detected objects and actions during +modality feature fusion. Through extensive experimental evaluation, we +demonstrate that our model outperforms previous methods on the EpicKitchens +datasets, and show that using simple text descriptions of actions and objects +aid in more effective action anticipation. In addition, we examine the impact +of object and action information obtained via text, and perform extensive +ablations. + +
+
+
+
+
+ + ♻ ☆ Mumpy: Multilateral Temporal-view Pyramid Transformer for Video + Inpainting Detection BMVC 2024 + + +
+ The task of video inpainting detection is to expose the pixel-level inpainted +regions within a video sequence. Existing methods usually focus on leveraging +spatial and temporal inconsistencies. However, these methods typically employ +fixed operations to combine spatial and temporal clues, limiting their +applicability in different scenarios. In this paper, we introduce a novel +Multilateral Temporal-view Pyramid Transformer ({\em MumPy}) that collaborates +spatial-temporal clues flexibly. Our method utilizes a newly designed +multilateral temporal-view encoder to extract various collaborations of +spatial-temporal clues and introduces a deformable window-based temporal-view +interaction module to enhance the diversity of these collaborations. +Subsequently, we develop a multi-pyramid decoder to aggregate the various types +of features and generate detection maps. By adjusting the contribution strength +of spatial and temporal clues, our method can effectively identify inpainted +regions. We validate our method on existing datasets and also introduce a new +challenging and large-scale Video Inpainting dataset based on the YouTube-VOS +dataset, which employs several more recent inpainting methods. The results +demonstrate the superiority of our method in both in-domain and cross-domain +evaluation scenarios. + +
+
+ comment: BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Generalist Segmentation Algorithm for Photoreceptors Analysis in + Adaptive Optics Imaging + + +
+ Analyzing the cone photoreceptor pattern in images obtained from the living +human retina using quantitative methods can be crucial for the early detection +and management of various eye conditions. Confocal adaptive optics scanning +light ophthalmoscope (AOSLO) imaging enables visualization of the cones from +reflections of waveguiding cone photoreceptors. While there have been +significant improvements in automated algorithms for segmenting cones in +confocal AOSLO images, the process of labelling data remains labor-intensive +and manual. This paper introduces a method based on deep learning (DL) for +detecting and segmenting cones in AOSLO images. The models were trained on a +semi-automatically labelled dataset of 20 AOSLO batches of images of 18 +participants for 0$^{\circ}$, 1$^{\circ}$, and 2$^{\circ}$ from the foveal +center. F1 scores were 0.968, 0.958, and 0.954 for 0$^{\circ}$, 1$^{\circ}$, +and 2$^{\circ}$, respectively, which is better than previously reported DL +approaches. Our method minimizes the need for labelled data by only +necessitating a fraction of labelled cones, which is especially beneficial in +the field of ophthalmology, where labelled data can often be limited. + +
+
+
+
+
+ + ♻ ☆ Frequency-Assisted Mamba for Remote Sensing Image Super-Resolution + + +
+ Recent progress in remote sensing image (RSI) super-resolution (SR) has
+exhibited remarkable performance using deep neural networks, e.g.,
+Convolutional Neural Networks and Transformers. However, existing SR methods
+often suffer from either a limited receptive field or quadratic computational
+overhead, resulting in sub-optimal global representation and unacceptable
+computational costs in large-scale RSI. To alleviate these issues, we develop
+the first attempt to integrate the Vision State Space Model (Mamba) for RSI-SR,
+which specializes in processing large-scale RSI by capturing long-range
+dependency with linear complexity. To achieve better SR reconstruction,
+building upon Mamba, we devise a Frequency-assisted Mamba framework, dubbed
+FMSR, to explore the spatial and frequency correlations. In particular, our
+FMSR features a multi-level fusion architecture equipped with the Frequency
+Selection Module (FSM), Vision State Space Module (VSSM), and Hybrid Gate
+Module (HGM) to grasp their merits for effective spatial-frequency fusion.
+Considering that global and local dependencies are complementary and both
+beneficial for SR, we further recalibrate these multi-level features for
+accurate feature fusion via learnable scaling adaptors. Extensive experiments
+on the AID, DOTA, and DIOR benchmarks demonstrate that our FMSR outperforms the
+state-of-the-art Transformer-based method HAT-L in terms of PSNR by 0.11 dB on
+average, while requiring only 28.05% and 19.08% of its memory consumption and
+complexity, respectively. Code will be available at
+https://github.com/XY-boy/FreMamba
+
+ comment: Accepted by IEEE TMM +
+
+
+
+
+ + ♻ ☆ On Feasibility of Intent Obfuscating Attacks + + +
+ Intent obfuscation is a common tactic in adversarial situations, enabling the +attacker to both manipulate the target system and avoid culpability. +Surprisingly, it has rarely been implemented in adversarial attacks on machine +learning systems. We are the first to propose using intent obfuscation to +generate adversarial examples for object detectors: by perturbing another +non-overlapping object to disrupt the target object, the attacker hides their +intended target. We conduct a randomized experiment on 5 prominent detectors -- +YOLOv3, SSD, RetinaNet, Faster R-CNN, and Cascade R-CNN -- using both targeted +and untargeted attacks and achieve success on all models and attacks. We +analyze the success factors characterizing intent obfuscating attacks, +including target object confidence and perturb object sizes. We then +demonstrate that the attacker can exploit these success factors to increase +success rates for all models and attacks. Finally, we discuss main takeaways +and legal repercussions. + +
+
+ comment: 33 pages, 21 Figures. Includes technical appendix. To appear in AIES + 2024 +
+
+
+
+
+ + ♻ ☆ VideoMambaPro: A Leap Forward for Mamba in Video Understanding + + +
+ Video understanding requires the extraction of rich spatio-temporal +representations, which transformer models achieve through self-attention. +Unfortunately, self-attention poses a computational burden. In NLP, Mamba has +surfaced as an efficient alternative for transformers. However, Mamba's +successes do not trivially extend to computer vision tasks, including those in +video analysis. In this paper, we theoretically analyze the differences between +self-attention and Mamba. We identify two limitations in Mamba's token +processing: historical decay and element contradiction. We propose +VideoMambaPro (VMP) that solves the identified limitations by adding masked +backward computation and elemental residual connections to a VideoMamba +backbone. VideoMambaPro shows state-of-the-art video action recognition +performance compared to transformer models, and surpasses VideoMamba by clear +margins: 7.9% and 8.1% top-1 on Kinetics-400 and Something-Something V2, +respectively. Our VideoMambaPro-M model achieves 91.9% top-1 on Kinetics-400, +only 0.2% below InternVideo2-6B but with only 1.2% of its parameters. The +combination of high performance and efficiency makes VideoMambaPro an +interesting alternative for transformer models. + +
+
+ comment: Model weights were lost due to a management error; the results will be
+ re-calculated and updated
+
+
+
+
+ + ♻ ☆ Learning to Detect and Segment for Open Vocabulary Object Detection CVPR2023 + + +
+ Open vocabulary object detection has been greatly advanced by the recent +development of vision-language pretrained model, which helps recognize novel +objects with only semantic categories. The prior works mainly focus on +knowledge transferring to the object proposal classification and employ +class-agnostic box and mask prediction. In this work, we propose CondHead, a +principled dynamic network design to better generalize the box regression and +mask segmentation for open vocabulary setting. The core idea is to +conditionally parameterize the network heads on semantic embedding and thus the +model is guided with class-specific knowledge to better detect novel +categories. Specifically, CondHead is composed of two streams of network heads, +the dynamically aggregated head and the dynamically generated head. The former +is instantiated with a set of static heads that are conditionally aggregated, +these heads are optimized as experts and are expected to learn sophisticated +prediction. The latter is instantiated with dynamically generated parameters +and encodes general class-specific information. With such a conditional design, +the detection model is bridged by the semantic embedding to offer strongly +generalizable class-wise box and mask prediction. Our method brings significant +improvement to the state-of-the-art open vocabulary object detection methods +with very minor overhead, e.g., it surpasses a RegionClip model by 3.0 +detection AP on novel categories, with only 1.1% more computation. + +
+
+ comment: Accepted to CVPR2023, code will be available later +
+
+
+
+
+ + ♻ ☆ Fast Text-to-3D-Aware Face Generation and Manipulation via Direct + Cross-modal Mapping and Geometric Regularization + + +
+ Text-to-3D-aware face (T3D Face) generation and manipulation is an emerging research hot spot in machine learning, which still suffers from low efficiency and poor quality. In this paper, we propose an End-to-End Efficient and Effective network for fast and accurate T3D face generation and manipulation, termed $E^3$-FaceNet. Different from existing complex generation paradigms, $E^3$-FaceNet resorts to a direct mapping from text instructions to 3D-aware visual space. We introduce a novel Style Code Enhancer to enhance cross-modal semantic alignment, alongside an innovative Geometric Regularization objective to maintain consistency across multi-view generations. Extensive experiments on three benchmark datasets demonstrate that $E^3$-FaceNet can not only achieve picture-like 3D face generation and manipulation, but also improve inference speed by orders of magnitude. For instance, compared with Latent3D, $E^3$-FaceNet speeds up the five-view generations by almost 470 times, while still exceeding it in generation quality. Our code is released at https://github.com/Aria-Zhangjl/E3-FaceNet.
+
+
+
+
+
+
+ + ♻ ☆ HDRTransDC: High Dynamic Range Image Reconstruction with Transformer + Deformation Convolution + + +
+ High Dynamic Range (HDR) imaging aims to generate an artifact-free HDR image +with realistic details by fusing multi-exposure Low Dynamic Range (LDR) images. +Caused by large motion and severe under-/over-exposure among input LDR images, +HDR imaging suffers from ghosting artifacts and fusion distortions. To address +these critical issues, we propose an HDR Transformer Deformation Convolution +(HDRTransDC) network to generate high-quality HDR images, which consists of the +Transformer Deformable Convolution Alignment Module (TDCAM) and the Dynamic +Weight Fusion Block (DWFB). To solve the ghosting artifacts, the proposed TDCAM +extracts long-distance content similar to the reference feature in the entire +non-reference features, which can accurately remove misalignment and fill the +content occluded by moving objects. For the purpose of eliminating fusion +distortions, we propose DWFB to spatially adaptively select useful information +across frames to effectively fuse multi-exposed features. Extensive experiments +show that our method quantitatively and qualitatively achieves state-of-the-art +performance. + +
+
+ comment: We request to withdraw our manuscript due to identified issues: + inaccuracies in the description of a submodule's composition, principles, and + functionality in Section 3.2, and potential problems in metric calculation in + Sections 4.2 and 4.3. To prevent the spread of misleading information, we + believe it is necessary to temporarily withdraw the manuscript for further + research and verification +
+
+
+
+
+ + ♻ ☆ A comparison between humans and AI at recognizing objects in unusual + poses + + +
+ Deep learning is closing the gap with human vision on several object +recognition benchmarks. Here we investigate this gap for challenging images +where objects are seen in unusual poses. We find that humans excel at +recognizing objects in such poses. In contrast, state-of-the-art deep networks +for vision (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) and state-of-the-art +large vision-language models (Claude 3.5, Gemini 1.5, GPT-4) are systematically +brittle on unusual poses, with the exception of Gemini showing excellent +robustness in that condition. As we limit image exposure time, human +performance degrades to the level of deep networks, suggesting that additional +mental processes (requiring additional time) are necessary to identify objects +in unusual poses. An analysis of error patterns of humans vs. networks reveals +that even time-limited humans are dissimilar to feed-forward deep networks. In +conclusion, our comparison reveals that humans and deep networks rely on +different mechanisms for recognizing objects in unusual poses. Understanding +the nature of the mental processes taking place during extra viewing time may +be key to reproduce the robustness of human vision in silico. + +
+
+
+
+
+ + ♻ ☆ Sparse-Tuning: Adapting Vision Transformers with Efficient Fine-tuning + and Inference + + +
+ Parameter-efficient fine-tuning (PEFT) has emerged as a popular solution for +adapting pre-trained Vision Transformer (ViT) models to downstream +applications. While current PEFT methods have achieved parameter efficiency, +they overlook the efficiency of computation and GPU memory during both +fine-tuning and inference, falling short of practical requirements. In this +paper, we propose \textbf{Sparse-Tuning}, a novel PEFT method that accounts for +the information redundancy in images and videos to boost the above efficiency. +By sparsely preserving the semantic-relevant tokens and merging irrelevant +ones, Sparse-Tuning minimizes the quantity of tokens processed at each layer, +leading to a quadratic reduction in computational and memory overhead. To align +our token sparsification strategy suitably with fine-tuning purposes, we +further design Dense Adapters that establish dense connections from shallow +layers to deeper layers. These Dense Adapters integrate multi-level local +features to enrich the current tokens, improving both token preservation and +model adaptation. Empirical results on VTAB-1K, three image datasets, and two +video datasets show that our Sparse-Tuning reduces GFLOPs to \textbf{62\%-70\%} +of the original ViT-B while achieving state-of-the-art performance. Source code +is available at \url{https://github.com/liuting20/Sparse-Tuning}. + +
+
+
+
+
+ + ♻ ☆ Deep Learning Based Speckle Filtering for Polarimetric SAR Images. + Application to Sentinel-1 + + +
+ Speckle suppression in synthetic aperture radar (SAR) images is a key processing step which continues to be a research topic. A wide variety of methods, using either spatially-based approaches or transform-based strategies, have been developed and have been shown to provide outstanding results. However, recent advances in deep learning techniques and their application to SAR image despeckling have been demonstrated to offer state-of-the-art results. Unfortunately, they have been mostly applied to single-polarimetric images. The extension of a deep learning-based approach for speckle removal to polarimetric SAR (PolSAR) images is complicated because of the complex nature of the measured covariance matrices for every image pixel, the properties of which must be preserved during filtering. In this work, we propose a complete framework to remove speckle in polarimetric SAR images using a convolutional neural network. The methodology includes a reversible transformation of the original complex covariance matrix to obtain a set of real-valued intensity bands which are fed to the neural network. In addition, the proposed method includes a change detection strategy to prevent the neural network from learning erroneous features in areas strongly affected by temporal changes, so that the network only learns the underlying speckle component present in the data. The method is implemented and tested with dual-polarimetric images acquired by Sentinel-1. Experiments show that the proposed approach offers exceptional results in both speckle reduction and resolution preservation. More importantly, it is also shown that the neural network is not generating artifacts or introducing bias in the filtered images, making them suitable for further polarimetric processing and exploitation.
+
+
+
+ comment: 23 pages, 32 figures +
+
+
+
+
+ + ♻ ☆ Dual-Domain CLIP-Assisted Residual Optimization Perception Model for + Metal Artifact Reduction + + +
+ Metal artifacts in computed tomography (CT) imaging pose significant +challenges to accurate clinical diagnosis. The presence of high-density +metallic implants results in artifacts that deteriorate image quality, +manifesting in the forms of streaking, blurring, or beam hardening effects, +etc. Nowadays, various deep learning-based approaches, particularly generative +models, have been proposed for metal artifact reduction (MAR). However, these +methods have limited perception ability in the diverse morphologies of +different metal implants with artifacts, which may generate spurious anatomical +structures and exhibit inferior generalization capability. To address the +issues, we leverage visual-language model (VLM) to identify these morphological +features and introduce them into a dual-domain CLIP-assisted residual +optimization perception model (DuDoCROP) for MAR. Specifically, a dual-domain +CLIP (DuDoCLIP) is fine-tuned on the image domain and sinogram domain using +contrastive learning to extract semantic descriptions from anatomical +structures and metal artifacts. Subsequently, a diffusion model is guided by +the embeddings of DuDoCLIP, thereby enabling the dual-domain prior generation. +Additionally, we design prompt engineering for more precise image-text +descriptions that can enhance the model's perception capability. Then, a +downstream task is devised for the one-step residual optimization and +integration of dual-domain priors, while incorporating raw data fidelity. +Ultimately, a new perceptual indicator is proposed to validate the model's +perception and generation performance. With the assistance of DuDoCLIP, our +DuDoCROP exhibits at least 63.7% higher generalization capability compared to +the baseline model. Numerical experiments demonstrate that the proposed method +can generate more realistic image structures and outperform other SOTA +approaches both qualitatively and quantitatively. + +
+
+ comment: 14 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Enhancing Adaptive Deep Networks for Image Classification via + Uncertainty-aware Decision Fusion + + +
+ Handling varying computational resources is a critical issue in modern AI applications. Adaptive deep networks, featuring the dynamic employment of multiple classifier heads among different layers, have been proposed to address classification tasks under varying computing resources. Existing approaches typically utilize the last classifier supported by the available resources for inference, as they believe that the last classifier always performs better across all classes. However, our findings indicate that earlier classifier heads can outperform the last head for certain classes. Based on this observation, we introduce the Collaborative Decision Making (CDM) module, which fuses multiple classifier heads to enhance the inference performance of adaptive deep networks. CDM incorporates an uncertainty-aware fusion method based on evidential deep learning (EDL) that utilizes the reliability (uncertainty values) from the first c-1 classifiers to improve the c-th classifier's accuracy. We also design a balance term that reduces fusion saturation and unfairness issues caused by EDL constraints to improve the fusion quality of CDM. Finally, we propose a regularized training strategy, called the Guided Collaborative Decision Making (GCDM) framework, that uses the last classifier to guide the learning process of the early classifiers and further enhance the effect of the CDM module. The experimental evaluation demonstrates the effectiveness of our approaches. Results on ImageNet datasets show CDM and GCDM obtain 0.4% to 2.8% accuracy improvement (under varying computing resources) on popular adaptive networks. The code is available at https://github.com/Meteor-Stars/GCDM_AdaptiveNet.
+
+
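A minimal sketch of the kind of uncertainty-aware fusion described above, assuming Dirichlet-based evidential outputs (evidence from a softplus of the logits, alpha = evidence + 1, uncertainty u = K / sum(alpha)). The confidence-weighted averaging rule here is illustrative only and is not the paper's exact CDM formulation.

```python
import numpy as np

def edl_outputs(logits):
    """Convert raw logits to Dirichlet evidence, expected probabilities and uncertainty."""
    evidence = np.logaddexp(0.0, logits)          # softplus keeps evidence non-negative
    alpha = evidence + 1.0                        # Dirichlet concentration parameters
    strength = alpha.sum(axis=-1, keepdims=True)
    probs = alpha / strength                      # expected class probabilities
    uncertainty = logits.shape[-1] / strength     # u = K / sum(alpha)
    return probs, uncertainty.squeeze(-1)

def fuse_heads(head_logits):
    """Fuse several classifier heads, down-weighting uncertain ones (illustrative rule)."""
    fused, total_weight = 0.0, 0.0
    for logits in head_logits:
        probs, u = edl_outputs(logits)
        weight = 1.0 - u                          # confident heads contribute more
        fused = fused + weight[..., None] * probs
        total_weight = total_weight + weight[..., None]
    return fused / total_weight

# Example: three heads, a batch of 2 samples, 5 classes.
heads = [np.random.randn(2, 5) for _ in range(3)]
print(fuse_heads(heads).sum(axis=-1))             # each row sums to 1
```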
+
+ comment: 13 pages, 27 figures. In ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ A Flying Bird Object Detection Method for Surveillance Video + + +
+ Aiming at the specific characteristics of flying bird objects in surveillance +video, such as the typically non-obvious features in single-frame images, small +size in most instances, and asymmetric shapes, this paper proposes a Flying +Bird Object Detection method for Surveillance Video (FBOD-SV). Firstly, a new +feature aggregation module, the Correlation Attention Feature Aggregation +(Co-Attention-FA) module, is designed to aggregate the features of the flying +bird object according to the bird object's correlation on multiple consecutive +frames of images. Secondly, a Flying Bird Object Detection Network (FBOD-Net) +with down-sampling followed by up-sampling is designed, which utilizes a large +feature layer that fuses fine spatial information and large receptive field +information to detect special multi-scale (mostly small-scale) bird objects. +Finally, the SimOTA dynamic label allocation method is applied to One-Category +object detection, and the SimOTA-OC dynamic label strategy is proposed to solve +the difficult problem of label allocation caused by irregular flying bird +objects. In this paper, the performance of the FBOD-SV is validated using +experimental datasets of flying bird objects in traction substation +surveillance videos. The experimental results show that the FBOD-SV effectively +improves the detection performance of flying bird objects in surveillance +video. + +
+
+
+
+
+ + ♻ ☆ Conformal Performance Range Prediction for Segmentation Output Quality + Control MICCAI + + +
+ Recent works have introduced methods to estimate segmentation performance +without ground truth, relying solely on neural network softmax outputs. These +techniques hold potential for intuitive output quality control. However, such +performance estimates rely on calibrated softmax outputs, which is often not +the case in modern neural networks. Moreover, the estimates do not take into +account inherent uncertainty in segmentation tasks. These limitations may +render precise performance predictions unattainable, restricting the practical +applicability of performance estimation methods. To address these challenges, +we develop a novel approach for predicting performance ranges with statistical +guarantees of containing the ground truth with a user specified probability. +Our method leverages sampling-based segmentation uncertainty estimation to +derive heuristic performance ranges, and applies split conformal prediction to +transform these estimates into rigorous prediction ranges that meet the desired +guarantees. We demonstrate our approach on the FIVES retinal vessel +segmentation dataset and compare five commonly used sampling-based uncertainty +estimation techniques. Our results show that it is possible to achieve the +desired coverage with small prediction ranges, highlighting the potential of +performance range prediction as a valuable tool for output quality control. + +
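A minimal sketch of the split-conformal step described above, assuming a calibration set of heuristic performance intervals (e.g. Dice ranges derived from sampled segmentations) together with the observed performance values; how the heuristic intervals are built is omitted here.

```python
import numpy as np

def conformalize_ranges(cal_lo, cal_hi, cal_true, alpha=0.1):
    """Return a margin that widens heuristic ranges to reach 1 - alpha coverage."""
    # Nonconformity score: how far the true value falls outside the heuristic range.
    scores = np.maximum(cal_lo - cal_true, cal_true - cal_hi)
    n = len(scores)
    # Finite-sample corrected quantile from split conformal prediction.
    level = np.ceil((n + 1) * (1 - alpha)) / n
    return np.quantile(scores, min(level, 1.0), method="higher")

def predict_range(lo, hi, margin):
    return lo - margin, hi + margin

# Toy calibration data: heuristic Dice ranges and observed Dice scores.
rng = np.random.default_rng(0)
true_dice = rng.uniform(0.6, 0.95, size=200)
lo = true_dice - rng.uniform(0.0, 0.05, size=200)
hi = true_dice + rng.uniform(0.0, 0.05, size=200)
margin = conformalize_ranges(lo, hi, true_dice, alpha=0.1)
print(predict_range(0.78, 0.86, margin))   # calibrated performance range for a new case
```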
+
+ comment: Accepted as an oral presentation at MICCAI UNSURE 2024 +
+
+
+
+
+ + ♻ ☆ Shot Segmentation Based on Von Neumann Entropy for Key Frame Extraction + + +
+ Video key frame extraction is important in various fields, such as video summarization, retrieval, and compression. We therefore propose a video key frame extraction algorithm based on shot segmentation using Von Neumann entropy. The segmentation of shots is achieved through the computation of the Von Neumann entropy of the similarity matrix among frames within the video sequence. The initial frame of each shot is selected as a key frame, which incorporates the temporal sequence information of the frames. The experimental results show that the extracted key frames can fully and accurately represent the original video content while minimizing the number of repeated frames.
+
+
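A minimal sketch of the entropy computation this approach builds on, assuming per-frame feature vectors are already available: the frame-similarity matrix is normalized into a density-like matrix and its Von Neumann entropy is computed from the eigenvalues. The paper's actual shot-boundary rule is not reproduced; this sketch only flags large entropy jumps as candidate boundaries.

```python
import numpy as np

def von_neumann_entropy(features):
    """Von Neumann entropy of the frame-similarity matrix (features: frames x dim)."""
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    sim = f @ f.T                              # cosine similarity Gram matrix (PSD)
    rho = sim / np.trace(sim)                  # normalize to unit trace
    eigvals = np.linalg.eigvalsh(rho)
    eigvals = eigvals[eigvals > 1e-12]
    return float(-np.sum(eigvals * np.log(eigvals)))

def candidate_boundaries(features, window=8, jump=0.15):
    """Mark positions where entropy over a sliding window changes noticeably."""
    ents = [von_neumann_entropy(features[i:i + window])
            for i in range(len(features) - window)]
    return [i for i in range(1, len(ents)) if abs(ents[i] - ents[i - 1]) > jump]

frames = np.random.rand(64, 128)               # placeholder frame descriptors
print(candidate_boundaries(frames)[:5])
```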
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Context Prompting for Zero-Shot Action Detection + + +
+ Spatio-temporal action detection encompasses the tasks of localizing and +classifying individual actions within a video. Recent works aim to enhance this +process by incorporating interaction modeling, which captures the relationship +between people and their surrounding context. However, these approaches have +primarily focused on fully-supervised learning, and the current limitation lies +in the lack of generalization capability to recognize unseen action categories. +In this paper, we aim to adapt the pretrained image-language models to detect +unseen actions. To this end, we propose a method which can effectively leverage +the rich knowledge of visual-language models to perform Person-Context +Interaction. Meanwhile, our Context Prompting module will utilize contextual +information to prompt labels, thereby enhancing the generation of more +representative text features. Moreover, to address the challenge of recognizing +distinct actions by multiple people at the same timestamp, we design the +Interest Token Spotting mechanism which employs pretrained visual knowledge to +find each person's interest context tokens, and then these tokens will be used +for prompting to generate text features tailored to each individual. To +evaluate the ability to detect unseen actions, we propose a comprehensive +benchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our +method achieves superior results compared to previous approaches and can be +further extended to multi-action videos, bringing it closer to real-world +applications. The code and data can be found in +https://webber2933.github.io/ST-CLIP-project-page. + +
+
+ comment: Project page: https://webber2933.github.io/ST-CLIP-project-page +
+
+
+
+
+ + ♻ ☆ Text-Region Matching for Multi-Label Image Recognition with Missing + Labels ACM MM + + +
+ Recently, large-scale visual language pre-trained (VLP) models have demonstrated impressive performance across various downstream tasks. Motivated by these advancements, pioneering efforts have emerged in multi-label image recognition with missing labels, leveraging VLP prompt-tuning technology. However, they usually cannot match text and vision features well, due to complicated semantic gaps and missing labels in a multi-label image. To tackle this challenge, we propose $\textbf{T}$ext-$\textbf{R}$egion $\textbf{M}$atching for optimizing $\textbf{M}$ulti-$\textbf{L}$abel prompt tuning, namely TRM-ML, a novel method for enhancing meaningful cross-modal matching. Compared to existing methods, we advocate exploring the information of category-aware regions rather than the entire image or pixels, which contributes to bridging the semantic gap between textual and visual representations in a one-to-one matching manner. Concurrently, we further introduce multimodal contrastive learning to narrow the semantic gap between textual and visual modalities and establish intra-class and inter-class relationships. Additionally, to deal with missing labels, we propose a multimodal category prototype that leverages intra- and inter-category semantic relationships to estimate unknown labels, facilitating pseudo-label generation. Extensive experiments on the MS-COCO, PASCAL VOC, Visual Genome, NUS-WIDE, and CUB-200-2011 benchmark datasets demonstrate that our proposed framework outperforms the state-of-the-art methods by a significant margin. Our code is available here: https://github.com/yu-gi-oh-leilei/TRM-ML.
+
+
+
+ comment: Accepted to ACM International Conference on Multimedia (ACM MM) 2024 +
+
+
+
+
+ + ♻ ☆ Erasing Concepts from Text-to-Image Diffusion Models with Few-shot + Unlearning BMVC2024 + + +
+ Generating images from text has become easier because of the scaling of +diffusion models and advancements in the field of vision and language. These +models are trained using vast amounts of data from the Internet. Hence, they +often contain undesirable content such as copyrighted material. As it is +challenging to remove such data and retrain the models, methods for erasing +specific concepts from pre-trained models have been investigated. We propose a +novel concept-erasure method that updates the text encoder using few-shot +unlearning in which a few real images are used. The discussion regarding the +generated images after erasing a concept has been lacking. While there are +methods for specifying the transition destination for concepts, the validity of +the specified concepts is unclear. Our method implicitly achieves this by +transitioning to the latent concepts inherent in the model or the images. Our +method can erase a concept within 10 s, making concept erasure more accessible +than ever before. Implicitly transitioning to related concepts leads to more +natural concept erasure. We applied the proposed method to various concepts and +confirmed that concept erasure can be achieved tens to hundreds of times faster +than with current methods. By varying the parameters to be updated, we obtained +results suggesting that, like previous research, knowledge is primarily +accumulated in the feed-forward networks of the text encoder. Our code is +available at \url{https://github.com/fmp453/few-shot-erasing} + +
+
+ comment: 25 pages, 28 figures, accepted by BMVC2024 +
+
+
+
+
+ + ♻ ☆ CorMulT: A Semi-supervised Modality Correlation-aware Multimodal + Transformer for Sentiment Analysis + + +
+ Multimodal sentiment analysis is an active research area that combines multiple data modalities, e.g., text, image and audio, to analyze human emotions and benefits a variety of applications. Existing multimodal sentiment analysis methods can be classified as modality interaction-based methods, modality transformation-based methods and modality similarity-based methods. However, most of these methods highly rely on the strong correlations between modalities, and cannot fully uncover and utilize the correlations between modalities to enhance sentiment analysis. Therefore, these methods usually achieve poor performance when identifying the sentiment of multimodal data with weak correlations. To address this issue, we propose a two-stage semi-supervised model termed Correlation-aware Multimodal Transformer (CorMulT), which consists of a pre-training stage and a prediction stage. At the pre-training stage, a modality correlation contrastive learning module is designed to efficiently learn modality correlation coefficients between different modalities. At the prediction stage, the learned correlation coefficients are fused with modality representations to make the sentiment prediction. According to the experiments on the popular multimodal dataset CMU-MOSEI, CorMulT clearly surpasses state-of-the-art multimodal sentiment analysis methods.
+
+
+
+
+
+
+ + ♻ ☆ KeyMatchNet: Zero-Shot Pose Estimation in 3D Point Clouds by Generalized + Keypoint Matching + + +
+ In this paper, we present KeyMatchNet, a novel network for zero-shot pose +estimation in 3D point clouds. Our method uses only depth information, making +it more applicable for many industrial use cases, as color information is +seldom available. The network is composed of two parallel components for +computing object and scene features. The features are then combined to create +matches used for pose estimation. The parallel structure allows for +pre-processing of the individual parts, which decreases the run-time. Using a +zero-shot network allows for a very short set-up time, as it is not necessary +to train models for new objects. However, as the network is not trained for the +specific object, zero-shot pose estimation methods generally have lower +accuracy compared with conventional methods. To address this, we reduce the +complexity of the task by including the scenario information during training. +This is typically not feasible as collecting real data for new tasks +drastically increases the cost. However, for zero-shot pose estimation, +training for new objects is not necessary and the expensive data collection can +thus be performed only once. Our method is trained on 1,500 objects and is only +tested on unseen objects. We demonstrate that the trained network can not only +accurately estimate poses for novel objects, but also demonstrate the ability +of the network on objects outside of the trained class. Test results are also +shown on real data. We believe that the presented method is valuable for many +real-world scenarios. Project page available at keymatchnet.github.io + +
+
+ comment: 8 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Satellite Sunroof: High-res Digital Surface Models and Roof Segmentation + for Global Solar Mapping + + +
+ The transition to renewable energy, particularly solar, is key to mitigating +climate change. Google's Solar API aids this transition by estimating solar +potential from aerial imagery, but its impact is constrained by geographical +coverage. This paper proposes expanding the API's reach using satellite +imagery, enabling global solar potential assessment. We tackle challenges +involved in building a Digital Surface Model (DSM) and roof instance +segmentation from lower resolution and single oblique views using deep learning +models. Our models, trained on aligned satellite and aerial datasets, produce +25cm DSMs and roof segments. With ~1m DSM MAE on buildings, ~5deg roof pitch +error and ~56% IOU on roof segmentation, they significantly enhance the Solar +API's potential to promote solar adoption. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ RIDE: Boosting 3D Object Detection for LiDAR Point Clouds via + Rotation-Invariant Analysis + + +
+ The rotation robustness property has drawn much attention in point cloud analysis, whereas it still poses a critical challenge in 3D object detection. When subjected to arbitrary rotation, most existing detectors fail to produce expected outputs due to their poor rotation robustness. In this paper, we present RIDE, a pioneering exploration of Rotation-Invariance for the 3D LiDAR-point-based object DEtector, with the key idea of designing rotation-invariant features from LiDAR scenes and then effectively incorporating them into existing 3D detectors. Specifically, we design a bi-feature extractor that extracts (i) object-aware features, which are sensitive to rotation but preserve geometry well, and (ii) rotation-invariant features, which lose geometric information to a certain extent but are robust to rotation. These two kinds of features complement each other to decode 3D proposals that are robust to arbitrary rotations. Particularly, our RIDE is compatible with and easy to plug into existing one-stage and two-stage 3D detectors, and boosts both detection performance and rotation robustness. Extensive experiments on the standard benchmarks showcase that the mean average precision (mAP) and rotation robustness can be significantly boosted by integrating with our RIDE, with +5.6% mAP and 53% rotation robustness improvement on KITTI, and +5.1% and 28% improvement, respectively, on nuScenes. The code will be available soon.
+
+
+
+
+
+
+ + ♻ ☆ Content Significance Distribution of Sub-Text Blocks in Articles and Its + Application to Article-Organization Assessment + + +
+ We explore how to capture the significance of a sub-text block in an article +and how it may be used for text mining tasks. A sub-text block is a +sub-sequence of sentences in the article. We formulate the notion of content +significance distribution (CSD) of sub-text blocks, referred to as CSD of the +first kind and denoted by CSD-1. In particular, we leverage Hugging Face's +SentenceTransformer to generate contextual sentence embeddings, and use +MoverScore over text embeddings to measure how similar a sub-text block is to +the entire text. To overcome the exponential blowup on the number of sub-text +blocks, we present an approximation algorithm and show that the approximated +CSD-1 is almost identical to the exact CSD-1. Under this approximation, we show +that the average and median CSD-1's for news, scholarly research, argument, and +narrative articles share the same pattern. We also show that under a certain +linear transformation, the complement of the cumulative distribution function +of the beta distribution with certain values of $\alpha$ and $\beta$ resembles +a CSD-1 curve. We then use CSD-1's to extract linguistic features to train an +SVC classifier for assessing how well an article is organized. Through +experiments, we show that this method achieves high accuracy for assessing +student essays. Moreover, we study CSD of sentence locations, referred to as +CSD of the second kind and denoted by CSD-2, and show that average CSD-2's for +different types of articles possess distinctive patterns, which either conform +common perceptions of article structures or provide rectification with minor +deviation. + +
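A minimal sketch of the CSD-1 idea under simplifying assumptions: sub-text blocks are sampled rather than enumerated (sidestepping the exponential blowup mentioned above), SentenceTransformer embeddings are used as in the paper, and plain cosine similarity stands in for MoverScore.

```python
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def approx_csd1(sentences, block_len=3, n_samples=200, seed=0):
    """Approximate CSD-1: similarity of sampled sub-text blocks to the whole text."""
    rng = np.random.default_rng(seed)
    sent_emb = model.encode(sentences, normalize_embeddings=True)
    doc_emb = sent_emb.mean(axis=0)
    doc_emb = doc_emb / np.linalg.norm(doc_emb)
    scores = []
    for _ in range(n_samples):
        idx = np.sort(rng.choice(len(sentences),
                                 size=min(block_len, len(sentences)),
                                 replace=False))          # a sub-sequence of sentences
        block_emb = sent_emb[idx].mean(axis=0)
        block_emb = block_emb / np.linalg.norm(block_emb)
        scores.append(float(block_emb @ doc_emb))         # cosine similarity to full text
    return np.sort(scores)                                # empirical distribution

sentences = ["First sentence of the article.",
             "A second, more detailed sentence.",
             "An aside with minor detail.",
             "A concluding remark that wraps things up."]
print(approx_csd1(sentences, block_len=2, n_samples=50)[:5])
```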
+
+
+
+
+ + ♻ ☆ SegVol: Universal and Interactive Volumetric Medical Image Segmentation + + +
+ Precise image segmentation provides clinical study with instructive +information. Despite the remarkable progress achieved in medical image +segmentation, there is still an absence of a 3D foundation segmentation model +that can segment a wide range of anatomical categories with easy user +interaction. In this paper, we propose a 3D foundation segmentation model, +named SegVol, supporting universal and interactive volumetric medical image +segmentation. By scaling up training data to 90K unlabeled Computed Tomography +(CT) volumes and 6K labeled CT volumes, this foundation model supports the +segmentation of over 200 anatomical categories using semantic and spatial +prompts. To facilitate efficient and precise inference on volumetric images, we +design a zoom-out-zoom-in mechanism. Extensive experiments on 22 anatomical +segmentation tasks verify that SegVol outperforms the competitors in 19 tasks, +with improvements up to 37.24% compared to the runner-up methods. We +demonstrate the effectiveness and importance of specific designs by ablation +study. We expect this foundation model can promote the development of +volumetric medical image analysis. The model and code are publicly available +at: https://github.com/BAAI-DCAI/SegVol. + +
+
+
+
+
+ + ♻ ☆ DiffiT: Diffusion Vision Transformers for Image Generation ECCV'24 + + +
+ Diffusion models with their powerful expressivity and high sample quality have achieved State-Of-The-Art (SOTA) performance in the generative domain. The pioneering Vision Transformer (ViT) has also demonstrated strong modeling capabilities and scalability, especially for recognition tasks. In this paper, we study the effectiveness of ViTs in diffusion-based generative learning and propose a new model denoted as Diffusion Vision Transformers (DiffiT). Specifically, we propose a methodology for fine-grained control of the denoising process and introduce the Time-dependent Multihead Self Attention (TMSA) mechanism. DiffiT is surprisingly effective in generating high-fidelity images with significantly better parameter efficiency. We also propose latent and image space DiffiT models and show SOTA performance on a variety of class-conditional and unconditional synthesis tasks at different resolutions. The Latent DiffiT model achieves a new SOTA FID score of 1.73 on the ImageNet256 dataset while having 19.85% and 16.88% fewer parameters than other Transformer-based diffusion models such as MDT and DiT, respectively. Code: https://github.com/NVlabs/DiffiT
+
+
+
+ comment: Accepted to ECCV'24 +
+
+
+
+
+ + ♻ ☆ EaDeblur-GS: Event assisted 3D Deblur Reconstruction with Gaussian + Splatting + + +
+ 3D deblurring reconstruction techniques have recently seen significant +advancements with the development of Neural Radiance Fields (NeRF) and 3D +Gaussian Splatting (3DGS). Although these techniques can recover relatively +clear 3D reconstructions from blurry image inputs, they still face limitations +in handling severe blurring and complex camera motion. To address these issues, +we propose Event-assisted 3D Deblur Reconstruction with Gaussian Splatting +(EaDeblur-GS), which integrates event camera data to enhance the robustness of +3DGS against motion blur. By employing an Adaptive Deviation Estimator (ADE) +network to estimate Gaussian center deviations and using novel loss functions, +EaDeblur-GS achieves sharp 3D reconstructions in real-time, demonstrating +performance comparable to state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Pre-training on Synthetic Driving Data for Trajectory Prediction + + +
+ Accumulating substantial volumes of real-world driving data proves pivotal in +the realm of trajectory forecasting for autonomous driving. Given the heavy +reliance of current trajectory forecasting models on data-driven methodologies, +we aim to tackle the challenge of learning general trajectory forecasting +representations under limited data availability. We propose a pipeline-level +solution to mitigate the issue of data scarcity in trajectory forecasting. The +solution is composed of two parts: firstly, we adopt HD map augmentation and +trajectory synthesis for generating driving data, and then we learn +representations by pre-training on them. Specifically, we apply vector +transformations to reshape the maps, and then employ a rule-based model to +generate trajectories on both original and augmented scenes; thus enlarging the +driving data without collecting additional real ones. To foster the learning of +general representations within this augmented dataset, we comprehensively +explore the different pre-training strategies, including extending the concept +of a Masked AutoEncoder (MAE) for trajectory forecasting. Without bells and +whistles, our proposed pipeline-level solution is general, simple, yet +effective: we conduct extensive experiments to demonstrate the effectiveness of +our data expansion and pre-training strategies, which outperform the baseline +prediction model by large margins, e.g. 5.04%, 3.84% and 8.30% in terms of +$MR_6$, $minADE_6$ and $minFDE_6$. The pre-training dataset and the codes for +pre-training and fine-tuning are released at +https://github.com/yhli123/Pretraining_on_Synthetic_Driving_Data_for_Trajectory_Prediction. + +
+
+
+
+
+ + ♻ ☆ Improving Deep Representation Learning via Auxiliary Learnable Target + Coding + + +
+ Deep representation learning is a subfield of machine learning that focuses +on learning meaningful and useful representations of data through deep neural +networks. However, existing methods for semantic classification typically +employ pre-defined target codes such as the one-hot and the Hadamard codes, +which can either fail or be less flexible to model inter-class correlation. In +light of this, this paper introduces a novel learnable target coding as an +auxiliary regularization of deep representation learning, which can not only +incorporate latent dependency across classes but also impose geometric +properties of target codes into representation space. Specifically, a +margin-based triplet loss and a correlation consistency loss on the proposed +target codes are designed to encourage more discriminative representations +owing to enlarging between-class margins in representation space and favoring +equal semantic correlation of learnable target codes respectively. Experimental +results on several popular visual classification and retrieval benchmarks can +demonstrate the effectiveness of our method on improving representation +learning, especially for imbalanced data. Source codes are made publicly +available at +\href{https://github.com/AkonLau/LTC}{https://github.com/AkonLau/LTC}. + +
+
+ comment: Accepted by Pattern Recognition, 33 pages, 8 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ SITransformer: Shared Information-Guided Transformer for Extreme + Multimodal Summarization + + +
+ Extreme Multimodal Summarization with Multimodal Output (XMSMO) becomes an +attractive summarization approach by integrating various types of information +to create extremely concise yet informative summaries for individual +modalities. Existing methods overlook the issue that multimodal data often +contains more topic irrelevant information, which can mislead the model into +producing inaccurate summaries especially for extremely short ones. In this +paper, we propose SITransformer, a Shared Information-guided Transformer for +extreme multimodal summarization. It has a shared information guided pipeline +which involves a cross-modal shared information extractor and a cross-modal +interaction module. The extractor formulates semantically shared salient +information from different modalities by devising a novel filtering process +consisting of a differentiable top-k selector and a shared-information guided +gating unit. As a result, the common, salient, and relevant contents across +modalities are identified. Next, a transformer with cross-modal attentions is +developed for intra- and inter-modality learning with the shared information +guidance to produce the extreme summary. Comprehensive experiments demonstrate +that SITransformer significantly enhances the summarization quality for both +video and text summaries for XMSMO. Our code will be publicly available at +https://github.com/SichengLeoLiu/MMAsia24-XMSMO. + +
+
+ comment: 8 pages, 5 figures, submitted to ACM Multimedia Asia 2024 +
+
+
+
+
+ + ♻ ☆ Hierarchical Spatial Proximity Reasoning for Vision-and-Language + Navigation + + +
+ Most Vision-and-Language Navigation (VLN) algorithms are prone to making poor decisions due to a lack of visual common sense and insufficient reasoning capabilities. To address this issue, we propose a Hierarchical Spatial Proximity Reasoning (HSPR) method. First, we introduce a scene understanding auxiliary task to help the agent build a knowledge base of hierarchical spatial proximity. This task utilizes panoramic views and object features to identify types of nodes and uncover the adjacency relationships between nodes, between objects, and between nodes and objects. Second, we propose a multi-step reasoning navigation algorithm based on the hierarchical spatial proximity knowledge base, which continuously plans feasible paths to enhance exploration efficiency. Third, we introduce a residual fusion method to improve navigation decision accuracy. Finally, we validate our approach with experiments on publicly available datasets including REVERIE, SOON, R2R, and R4R. Our code is available at https://github.com/iCityLab/HSPR.
+
+
+
+
+
+
+ + ♻ ☆ DuoSpaceNet: Leveraging Both Bird's-Eye-View and Perspective View + Representations for 3D Object Detection + + +
+ Recent advances in multi-view camera-only 3D object detection either rely on +an accurate reconstruction of bird's-eye-view (BEV) 3D features or on +traditional 2D perspective view (PV) image features. While both have their own +pros and cons, few have found a way to stitch them together in order to benefit +from "the best of both worlds". To this end, we explore a duo space (i.e., BEV +and PV) 3D perception framework, in conjunction with some useful duo space +fusion strategies that allow effective aggregation of the two feature +representations. To the best of our knowledge, our proposed method, +DuoSpaceNet, is the first to leverage two distinct feature spaces and achieves +the state-of-the-art 3D object detection and BEV map segmentation results on +nuScenes dataset. + +
+
+
+
+
+ + ♻ ☆ TiCoSS: Tightening the Coupling between Semantic Segmentation and Stereo + Matching within A Joint Learning Framework + + +
+ Semantic segmentation and stereo matching, respectively analogous to the +ventral and dorsal streams in our human brain, are two key components of +autonomous driving perception systems. Addressing these two tasks with separate +networks is no longer the mainstream direction in developing computer vision +algorithms, particularly with the recent advances in large vision models and +embodied artificial intelligence. The trend is shifting towards combining them +within a joint learning framework, especially emphasizing feature sharing +between the two tasks. The major contributions of this study lie in +comprehensively tightening the coupling between semantic segmentation and +stereo matching. Specifically, this study introduces three novelties: (1) a +tightly coupled, gated feature fusion strategy, (2) a hierarchical deep +supervision strategy, and (3) a coupling tightening loss function. The combined +use of these technical contributions results in TiCoSS, a state-of-the-art +joint learning framework that simultaneously tackles semantic segmentation and +stereo matching. Through extensive experiments on the KITTI and vKITTI2 +datasets, along with qualitative and quantitative analyses, we validate the +effectiveness of our developed strategies and loss function, and demonstrate +its superior performance compared to prior arts, with a notable increase in +mIoU by over 9%. Our source code will be publicly available at +mias.group/TiCoSS upon publication. + +
+
+
+
+
+ + ♻ ☆ 360 Layout Estimation via Orthogonal Planes Disentanglement and + Multi-view Geometric Consistency Perception + + +
+ Existing panoramic layout estimation solutions tend to recover room boundaries from a vertically compressed sequence, yielding imprecise results as the compression process often muddles the semantics between various planes. Besides, these data-driven approaches impose an urgent demand for massive data annotations, which are laborious and time-consuming. For the first problem, we propose an orthogonal plane disentanglement network (termed DOPNet) to distinguish ambiguous semantics. DOPNet consists of three modules that are integrated to deliver distortion-free, semantics-clean, and detail-sharp disentangled representations, which benefit the subsequent layout recovery. For the second problem, we present an unsupervised adaptation technique tailored for horizon-depth and ratio representations. Concretely, we introduce an optimization strategy for decision-level layout analysis and a 1D cost volume construction method for feature-level multi-view aggregation, both of which are designed to fully exploit the geometric consistency across multiple perspectives. The optimizer provides a reliable set of pseudo-labels for network training, while the 1D cost volume enriches each view with comprehensive scene information derived from other perspectives. Extensive experiments demonstrate that our solution outperforms other SoTA models on both monocular layout estimation and multi-view layout estimation tasks. Code is available at https://github.com/zhijieshen-bjtu/MV-DOPNet.
+
+
+
+ comment: Accept to TPAMI2024. arXiv admin note: substantial text overlap with + arXiv:2303.00971 +
+
+
+
+
+ + ♻ ☆ Embodiment: Self-Supervised Depth Estimation Based on Camera Models + + +
+ Depth estimation is a critical topic for robotics and vision-related tasks. +In monocular depth estimation, in comparison with supervised learning that +requires expensive ground truth labeling, self-supervised methods possess great +potential due to no labeling cost. However, self-supervised learning still has +a large gap with supervised learning in 3D reconstruction and depth estimation +performance. Meanwhile, scaling is also a major issue for monocular +unsupervised depth estimation, which commonly still needs ground truth scale +from GPS, LiDAR, or existing maps to correct. In the era of deep learning, +existing methods primarily rely on exploring image relationships to train +unsupervised neural networks, while the physical properties of the camera +itself such as intrinsics and extrinsics are often overlooked. These physical +properties are not just mathematical parameters; they are embodiments of the +camera's interaction with the physical world. By embedding these physical +properties into the deep learning model, we can calculate depth priors for +ground regions and regions connected to the ground based on physical +principles, providing free supervision signals without the need for additional +sensors. This approach is not only easy to implement but also enhances the +effects of all unsupervised methods by embedding the camera's physical +properties into the model, thereby achieving an embodied understanding of the +real world. + +
+
+
+
+
+ + ♻ ☆ Using Texture to Classify Forests Separately from Vegetation + + +
+ Identifying terrain within satellite image data is a key issue in +geographical information sciences, with numerous environmental and safety +implications. Many techniques exist to derive classifications from spectral +data captured by satellites. However, the ability to reliably classify +vegetation remains a challenge. In particular, no precise methods exist for +classifying forest vs. non-forest vegetation in high-level satellite images. +This paper provides an initial proposal for a static, algorithmic process to +identify forest regions in satellite image data through texture features +created from detected edges and the NDVI ratio captured by Sentinel-2 satellite +images. With strong initial results, this paper also identifies the next steps +to improve the accuracy of the classification and verification processes. + +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in +information retrieval. ColBERT's late interaction scoring approximates the +joint query-document attention seen in cross-encoders while maintaining +inference efficiency closer to traditional dense retrieval models, thanks to +its bi-encoder architecture and recent optimizations in indexing and search. In +this paper, we introduce several improvements to the ColBERT model architecture +and training pipeline, leveraging techniques successful in the more established +single-vector embedding model paradigm, particularly those suited for +heterogeneous multilingual data. Our new model, Jina-ColBERT-v2, demonstrates +strong performance across a range of English and multilingual retrieval tasks, +while also cutting storage requirements by up to 50% compared to previous +models. + +
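A minimal sketch of the late-interaction (MaxSim) scoring that ColBERT-style models such as the one above rely on, assuming token embeddings for the query and the document have already been computed and L2-normalized.

```python
import numpy as np

def late_interaction_score(query_tokens, doc_tokens):
    """ColBERT-style MaxSim: sum over query tokens of the best-matching doc token."""
    sims = query_tokens @ doc_tokens.T          # (n_q, n_d) token-level similarities
    return float(sims.max(axis=1).sum())        # max over doc tokens, summed over query

def rank(query_tokens, docs):
    scores = [late_interaction_score(query_tokens, d) for d in docs]
    return np.argsort(scores)[::-1]             # document indices, best first

# Toy example with random, normalized 128-dim token embeddings.
rng = np.random.default_rng(0)
norm = lambda x: x / np.linalg.norm(x, axis=-1, keepdims=True)
q = norm(rng.standard_normal((5, 128)))                       # 5 query tokens
docs = [norm(rng.standard_normal((n, 128))) for n in (40, 60, 25)]
print(rank(q, docs))
```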
+
+
+
+
+ + ☆ Transformers Meet ACT-R: Repeat-Aware and Sequential Listening Session + Recommendation RecSys'2024 + + +
+ Music streaming services often leverage sequential recommender systems to +predict the best music to showcase to users based on past sequences of +listening sessions. Nonetheless, most sequential recommendation methods ignore +or insufficiently account for repetitive behaviors. This is a crucial +limitation for music recommendation, as repeatedly listening to the same song +over time is a common phenomenon that can even change the way users perceive +this song. In this paper, we introduce PISA (Psychology-Informed Session +embedding using ACT-R), a session-level sequential recommender system that +overcomes this limitation. PISA employs a Transformer architecture learning +embedding representations of listening sessions and users using attention +mechanisms inspired by Anderson's ACT-R (Adaptive Control of Thought-Rational), +a cognitive architecture modeling human information access and memory dynamics. +This approach enables us to capture dynamic and repetitive patterns from user +behaviors, allowing us to effectively predict the songs they will listen to in +subsequent sessions, whether they are repeated or new ones. We demonstrate the +empirical relevance of PISA using both publicly available listening data from +Last.fm and proprietary data from Deezer, a global music streaming service, +confirming the critical importance of repetition modeling for sequential +listening session recommendation. Along with this paper, we publicly release +our proprietary dataset to foster future research in this field, as well as the +source code of PISA to facilitate its future use. + +
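A minimal sketch of the ACT-R base-level activation that this line of work draws on, computed from a user's past listening timestamps for a song: frequent and recent listens yield higher activation. PISA itself embeds such memory dynamics inside Transformer attention, which this sketch does not reproduce.

```python
import math

def base_level_activation(listen_times, now, decay=0.5):
    """ACT-R base-level activation: ln(sum of (now - t_j)^(-decay)) over past listens."""
    return math.log(sum((now - t) ** (-decay) for t in listen_times if t < now))

# Timestamps in days: a song played often and recently vs. one played rarely, long ago.
print(base_level_activation([1.0, 3.0, 8.0, 9.5], now=10.0))  # frequent + recent: high
print(base_level_activation([0.5, 2.0], now=10.0))            # sparse + old: low
```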
+
+ comment: 11 pages. Accepted by RecSys'2024, full paper +
+
+
+
+
+ + ☆ Is text normalization relevant for classifying medieval charters? + + +
+ This study examines the impact of historical text normalization on the +classification of medieval charters, specifically focusing on document dating +and locating. Using a data set of Middle High German charters from a digital +archive, we evaluate various classifiers, including traditional and +transformer-based models, with and without normalization. Our results indicate +that the given normalization minimally improves locating tasks but reduces +accuracy for dating, implying that original texts contain crucial features that +normalization may obscure. We find that support vector machines and gradient +boosting outperform other models, questioning the efficiency of transformers +for this use case. Results suggest a selective approach to historical text +normalization, emphasizing the significance of preserving some textual +characteristics that are critical for classification tasks in document +analysis. + +
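A minimal sketch of the kind of classical pipeline evaluated above (character n-gram TF-IDF features with a linear SVM), which could be run once on raw and once on normalized charter text; the data loader and the normalizer are placeholders, not part of the study's released code.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

def evaluate(texts, labels):
    """Cross-validated accuracy of a char n-gram TF-IDF + linear SVM classifier."""
    clf = make_pipeline(
        TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4), min_df=2),
        LinearSVC(),
    )
    return cross_val_score(clf, texts, labels, cv=5).mean()

# Hypothetical inputs: raw charters, their normalized forms, and dating labels.
# raw, normalized, periods = load_charters()        # placeholder loader
# print("raw text:       ", evaluate(raw, periods))
# print("normalized text:", evaluate(normalized, periods))
```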
+
+ comment: This preprint has not undergone peer review or any post-submission + improvements or corrections +
+
+
+
+
+ + ☆ Do Recommender Systems Promote Local Music? A Reproducibility Study + Using Music Streaming Data + + +
+ This paper examines the influence of recommender systems on local music +representation, discussing prior findings from an empirical study on the LFM-2b +public dataset. This prior study argued that different recommender systems +exhibit algorithmic biases shifting music consumption either towards or against +local content. However, LFM-2b users do not reflect the diverse audience of +music streaming services. To assess the robustness of this study's conclusions, +we conduct a comparative analysis using proprietary listening data from a +global music streaming service, which we publicly release alongside this paper. +We observe significant differences in local music consumption patterns between +our dataset and LFM-2b, suggesting that caution should be exercised when +drawing conclusions on local music based solely on LFM-2b. Moreover, we show +that the algorithmic biases exhibited in the original work vary in our dataset, +and that several unexplored model parameters can significantly influence these +biases and affect the study's conclusion on both datasets. Finally, we discuss +the complexity of accurately labeling local music, emphasizing the risk of +misleading conclusions due to unreliable, biased, or incomplete labels. To +encourage further research and ensure reproducibility, we have publicly shared +our dataset and code. + +
+
+
+
+
+ + ☆ SynDL: A Large-Scale Synthetic Test Collection + + +
+ Large-scale test collections play a crucial role in Information Retrieval +(IR) research. However, according to the Cranfield paradigm and the research +into publicly available datasets, the existing information retrieval research +studies are commonly developed on small-scale datasets that rely on human +assessors for relevance judgments - a time-intensive and expensive process. +Recent studies have shown the strong capability of Large Language Models (LLMs) +in producing reliable relevance judgments with human accuracy but at a greatly +reduced cost. In this paper, to address the missing large-scale ad-hoc document +retrieval dataset, we extend the TREC Deep Learning Track (DL) test collection +via additional language model synthetic labels to enable researchers to test +and evaluate their search systems at a large scale. Specifically, such a test +collection includes more than 1,900 test queries from the previous years of +tracks. We compare system evaluation with past human labels from past years and +find that our synthetically created large-scale test collection can lead to +highly correlated system rankings. + +
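A minimal sketch of how synthetic relevance labels of this kind are typically produced, using a hypothetical call_llm helper (not part of the paper) and a graded-judgment prompt; the exact prompt and model behind SynDL may differ.

```python
import re

PROMPT = """You are a relevance assessor for web search.
Query: {query}
Document: {document}
Grade the document's relevance to the query on a 0-3 scale
(0 = not relevant, 3 = perfectly relevant). Answer with a single digit."""

def synthetic_judgment(query, document, call_llm):
    """Ask an LLM for a graded relevance label and parse the first digit it returns."""
    reply = call_llm(PROMPT.format(query=query, document=document))
    match = re.search(r"[0-3]", reply)
    return int(match.group()) if match else 0

# Usage (call_llm is a hypothetical wrapper around whichever LLM API is available):
# label = synthetic_judgment("effects of caffeine on sleep", doc_text, call_llm)
```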
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Rethinking Sparse Lexical Representations for Image Retrieval in the Age + of Rising Multi-Modal Large Language Models ECCV 2024 + + +
+ In this paper, we rethink sparse lexical representations for image retrieval. +By utilizing multi-modal large language models (M-LLMs) that support visual +prompting, we can extract image features and convert them into textual data, +enabling us to utilize efficient sparse retrieval algorithms employed in +natural language processing for image retrieval tasks. To assist the LLM in +extracting image features, we apply data augmentation techniques for key +expansion and analyze the impact with a metric for relevance between images and +textual data. We empirically show the superior precision and recall performance +of our image retrieval method compared to conventional vision-language +model-based methods on the MS-COCO, PASCAL VOC, and NUS-WIDE datasets in a +keyword-based image retrieval scenario, where keywords serve as search queries. +We also demonstrate that the retrieval performance can be improved by +iteratively incorporating keywords into search queries. + +
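A minimal sketch of the sparse retrieval step once images have been converted to keyword lists by an M-LLM; the keyword extraction itself is assumed to have already happened, and rank_bm25 is used as a stand-in for whatever sparse retrieval backend is actually employed.

```python
from rank_bm25 import BM25Okapi

# Hypothetical M-LLM outputs: one keyword list per indexed image.
image_keywords = {
    "img_001.jpg": ["dog", "beach", "sunset", "golden", "retriever"],
    "img_002.jpg": ["city", "night", "skyline", "river", "lights"],
    "img_003.jpg": ["dog", "park", "frisbee", "grass"],
}

ids = list(image_keywords)
bm25 = BM25Okapi([image_keywords[i] for i in ids])   # sparse index over keywords

def search(query, top_k=2):
    scores = bm25.get_scores(query.lower().split())
    ranked = sorted(zip(ids, scores), key=lambda p: p[1], reverse=True)
    return ranked[:top_k]

print(search("dog playing on the beach"))
```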
+
+ comment: Accepted to ECCV 2024 Workshops: 2nd Workshop on Traditional Computer + Vision in the Age of Deep Learning (TradiCV) +
+
+
+
+
+ + ☆ Efficient Transfer Learning Framework for Cross-Domain Click-Through + Rate Prediction + + +
+ Natural content and advertisement coexist in industrial recommendation systems but differ in data distribution. Concretely, traffic related to the advertisement is considerably sparser compared to that of natural content, which motivates the development of transferring knowledge from the richer source natural content domain to the sparser advertising domain. The challenges include the inefficiencies arising from the management of extensive source data and the problem of 'catastrophic forgetting' that results from the CTR model's daily updating. To this end, we propose a novel tri-level asynchronous framework, i.e., Efficient Transfer Learning Framework for Cross-Domain Click-Through Rate Prediction (E-CDCTR), to transfer comprehensive knowledge of natural content to advertisement CTR models. This framework consists of three key components: the Tiny Pre-training Model (TPM), which trains a tiny CTR model with several basic features on long-term natural data; the Complete Pre-training Model (CPM), which trains a CTR model whose network structure and input features are the same as the target advertisement model on short-term natural data; and the Advertisement CTR model (A-CTR), which derives its parameter initialization from CPM together with multiple historical embeddings from TPM as extra features and then fine-tunes on advertisement data. TPM provides richer representations of users and items for both the CPM and A-CTR, effectively alleviating the forgetting problem inherent in the daily updates. CPM further enhances the advertisement model by providing knowledgeable initialization, thereby alleviating the data sparsity challenges typically encountered by advertising CTR models. Such a tri-level cross-domain transfer learning framework offers an efficient solution to address both data sparsity and 'catastrophic forgetting', yielding remarkable improvements.
+
+
+
+
+
+
+ + ☆ A Prototype Model of Zero-Trust Architecture Blockchain with + EigenTrust-Based Practical Byzantine Fault Tolerance Protocol to Manage + Decentralized Clinical Trials + + +
+ The COVID-19 pandemic accelerated the emergence of decentralized Clinical Trials (DCTs), which help retain patients, accelerate trials, improve data accessibility, enable virtual care, and facilitate seamless communication through integrated systems. However, integrating systems in DCTs exposes clinical data to potential security threats, making them susceptible to theft at any stage, a high risk of protocol deviations, and monitoring issues. To mitigate these challenges, blockchain technology serves as a secure framework, acting as a decentralized ledger and creating an immutable environment by establishing a zero-trust architecture, where data are deemed untrusted until verified. In combination with Internet of Things (IoT)-enabled wearable devices, blockchain secures the transfer of clinical trial data on private blockchains during DCT automation and operations. This paper proposes a prototype model of the Zero-Trust Architecture Blockchain (z-TAB) to integrate patient-generated clinical trial data during DCT operation management. The EigenTrust-based Practical Byzantine Fault Tolerance (T-PBFT) algorithm has been incorporated as a consensus protocol, leveraging Hyperledger Fabric. Furthermore, the Internet of Things (IoT) has been integrated to streamline data processing among stakeholders within the blockchain platforms. A rigorous evaluation has been conducted to assess the quality of the system.
+
+
+
+ comment: NA +
+
+
+
+
+ + ☆ Longitudinal Modularity, a Modularity for Link Streams + + +
+ Temporal networks are commonly used to model real-life phenomena. When these +phenomena represent interactions and are captured at a fine-grained temporal +resolution, they are modeled as link streams. Community detection is an +essential network analysis task. Although many methods exist for static +networks, and some methods have been developed for temporal networks +represented as sequences of snapshots, few works can handle link streams. This +article introduces the first adaptation of the well-known Modularity quality +function to link streams. Unlike existing methods, it is independent of the +time scale of analysis. After introducing the quality function, and its +relation to existing static and dynamic definitions of Modularity, we show +experimentally its relevance for dynamic community evaluation. + +
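For reference, a minimal sketch of the static Newman Modularity that the proposed Longitudinal Modularity generalizes, computed with networkx on an aggregated graph (in a link stream, this would correspond to collapsing all interactions); the link-stream extension itself is the paper's contribution and is not reproduced here.

```python
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities, modularity

# Static (aggregated) graph standing in for a collapsed link stream.
G = nx.karate_club_graph()

communities = greedy_modularity_communities(G)   # one simple community detection method
print(len(communities), "communities found")
print("static Modularity Q =", round(modularity(G, communities), 3))
```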
+
+
+
+
+ + ♻ ☆ Smart Multi-Modal Search: Contextual Sparse and Dense Embedding + Integration in Adobe Express CIKM 2024 + + 📅 +
+ As user content and queries become increasingly multi-modal, the need for +effective multi-modal search systems has grown. Traditional search systems +often rely on textual and metadata annotations for indexed images, while +multi-modal embeddings like CLIP enable direct search using text and image +embeddings. However, embedding-based approaches face challenges in integrating +contextual features such as user locale and recency. Building a scalable +multi-modal search system requires fine-tuning several components. This paper +presents a multi-modal search architecture and a series of AB tests that +optimize embeddings and multi-modal technologies in Adobe Express template +search. We address considerations such as embedding model selection, the roles +of embeddings in matching and ranking, and the balance between dense and sparse +embeddings. Our iterative approach demonstrates how utilizing sparse, dense, +and contextual features enhances short and long query search, significantly +reduces null rates (over 70\%), and increases click-through rates (CTR). Our +findings provide insights into developing robust multi-modal search systems, +thereby enhancing relevance for complex queries. + +
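The abstract does not spell out a fusion rule; a minimal hybrid-scoring sketch (an assumption for illustration, not the Adobe Express system) that blends a sparse lexical score, a dense embedding score, and a recency signal could be:

    # Minimal hybrid-retrieval scoring sketch (assumed weights and signals, not the deployed system).
    import numpy as np

    def hybrid_scores(sparse, dense, days_old, w_sparse=0.4, w_dense=0.5, w_recency=0.1):
        """All inputs are 1-D arrays over candidate templates."""
        norm = lambda s: (s - s.min()) / (s.max() - s.min() + 1e-9)   # min-max normalise each signal
        recency = np.exp(-days_old / 30.0)                            # newer items get a small boost
        return w_sparse * norm(sparse) + w_dense * norm(dense) + w_recency * recency

    sparse = np.array([12.1, 3.4, 8.7])     # e.g. lexical/BM25 scores
    dense = np.array([0.82, 0.40, 0.77])    # e.g. CLIP cosine similarities
    days_old = np.array([2.0, 120.0, 30.0])
    print(np.argsort(-hybrid_scores(sparse, dense, days_old)))        # ranked candidate indices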
+
+ comment: CIKM 2024 (International Conference on Information and Knowledge + Management), Multimodal Search and Recommendations Workshop +
+
+
+
+
+ + ♻ ☆ GenRec: Generative Sequential Recommendation with Large Language Models + + 📅 +
+ Sequential recommendation is a task to capture hidden user preferences from +historical user-item interaction data and recommend the next items for the user. +Significant progress has been made in this domain by leveraging classification-based +learning methods. Inspired by the recent paradigm of 'pretrain, prompt and predict' in NLP, +we consider sequential recommendation as a sequence-to-sequence generation task and propose +a novel model named Generative Recommendation (GenRec). Unlike classification-based models +that learn explicit user and item representations, GenRec utilizes the sequence modeling +capability of the Transformer and adopts the masked item prediction objective to effectively +learn the hidden bidirectional sequential patterns. Different from existing generative +sequential recommendation models, GenRec does not rely on manually designed hard prompts. +The input to GenRec is a textual user-item sequence and the output is the top-ranked next +items. Moreover, GenRec is lightweight and requires only a few hours to train effectively +in low-resource settings, making it highly applicable to real-world scenarios and helping +to democratize large language models in the sequential recommendation domain. Our extensive +experiments demonstrate that GenRec generalizes well on various public real-world datasets +and achieves state-of-the-art results. Our experiments also validate the effectiveness of +the proposed masked item prediction objective, which improves model performance by a large +margin. + +
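GenRec's exact prompt format is not given in the abstract; a hypothetical sketch of how a masked item prediction example over a textual item sequence could be constructed (illustrative only, not the released GenRec code) is:

    # Hypothetical masked item prediction data construction (illustrative, not GenRec's actual format).
    import random

    def mask_sequence(items, mask_token="[MASK]", p=0.3):
        """Randomly mask items; the model is trained to recover the masked ones."""
        source, target = [], []
        for it in items:
            if random.random() < p:
                source.append(mask_token)
                target.append(it)
            else:
                source.append(it)
        return " ".join(source), " ".join(target)

    history = ["The Matrix", "Inception", "Interstellar", "Dune"]
    src, tgt = mask_sequence(history)
    print("input :", src)    # e.g. "The Matrix [MASK] Interstellar [MASK]"
    print("labels:", tgt)    # e.g. "Inception Dune"
    # At inference time, appending a trailing [MASK] asks the model for the next item.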
+
+
+
+
+ + ♻ ☆ Summaries, Highlights, and Action items: Design, implementation and + evaluation of an LLM-powered meeting recap system SC + + 📅 +
+ Meetings play a critical infrastructural role in the coordination of work. In +recent years, due to the shift to hybrid and remote work, more meetings are moving to +online computer-mediated spaces. This has led to new problems (e.g. more time spent in +less engaging meetings) and new opportunities (e.g. automated transcription/captioning +and recap support). Recent advances in large language models (LLMs) for dialog summarization +have the potential to improve the experience of meetings by reducing individuals' meeting +load and increasing the clarity and alignment of meeting outputs. Despite this potential, +they face technological limitations due to long transcripts and an inability to capture +diverse recap needs based on users' context. To address these gaps, we design, implement +and evaluate a meeting recap system in context. We first conceptualize two salient recap +representations -- important highlights, and a structured, hierarchical minutes view. We +develop a system to operationalize the representations with dialogue summarization as its +building block. Finally, we evaluate the effectiveness of the system with seven users in +the context of their work meetings. Our findings show promise in using LLM-based dialogue +summarization for meeting recap and the need for both representations in different contexts. +However, we find that LLM-based recap still lacks an understanding of what is personally +relevant to participants, can miss important details, and that mis-attributions can be +detrimental to group dynamics. We identify collaboration opportunities, such as a shared +recap document, that a high-quality recap enables. We report on implications for designing +AI systems that partner with users to learn and improve from natural interactions to +overcome the limitations related to personal relevance and summarization quality. + +
+
+ comment: in review for CSCW 24 +
+
+
+
+
+ + ♻ ☆ Use of a Structured Knowledge Base Enhances Metadata Curation by Large + Language Models + + 📅 +
+ Metadata play a crucial role in ensuring the findability, accessibility, +interoperability, and reusability of datasets. This paper investigates the +potential of large language models (LLMs), specifically GPT-4, to improve +adherence to metadata standards. We conducted experiments on 200 random data +records describing human samples relating to lung cancer from the NCBI +BioSample repository, evaluating GPT-4's ability to suggest edits for adherence +to metadata standards. We computed the adherence accuracy of field name-field +value pairs through a peer review process, and we observed a marginal average +improvement in adherence to the standard data dictionary from 79% to 80% +(p<0.5). We then prompted GPT-4 with domain information in the form of the +textual descriptions of CEDAR templates and recorded a significant improvement +to 97% from 79% (p<0.01). These results indicate that, while LLMs may not be +able to correct legacy metadata to ensure satisfactory adherence to standards +when unaided, they do show promise for use in automated metadata curation when +integrated with a structured knowledge base + +
+
+
+
+
+
+
+
+ + Machine Learning 171 + +
+
+
+ + ☆ A Score-Based Density Formula, with Applications in Diffusion Generative + Models + + 📅 +
+ Score-based generative models (SGMs) have revolutionized the field of +generative modeling, achieving unprecedented success in generating realistic +and diverse content. Despite empirical advances, the theoretical basis for why +optimizing the evidence lower bound (ELBO) on the log-likelihood is effective +for training diffusion generative models, such as DDPMs, remains largely +unexplored. In this paper, we address this question by establishing a density +formula for a continuous-time diffusion process, which can be viewed as the +continuous-time limit of the forward process in an SGM. This formula reveals +the connection between the target density and the score function associated +with each step of the forward process. Building on this, we demonstrate that +the minimizer of the optimization objective for training DDPMs nearly coincides +with that of the true objective, providing a theoretical foundation for +optimizing DDPMs using the ELBO. Furthermore, we offer new insights into the +role of score-matching regularization in training GANs, the use of ELBO in +diffusion classifiers, and the recently proposed diffusion loss. + +
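For context, the standard simplified DDPM training objective whose ELBO justification the paper analyzes is (standard notation following Ho et al., 2020, not the paper's new density formula):

    % Standard simplified DDPM objective, shown for context.
    \begin{equation*}
    \mathcal{L}_{\mathrm{simple}}(\theta)
      = \mathbb{E}_{t,\,x_0,\,\epsilon \sim \mathcal{N}(0, I)}
        \Big[\big\| \epsilon - \epsilon_\theta\big(\sqrt{\bar{\alpha}_t}\, x_0
          + \sqrt{1-\bar{\alpha}_t}\,\epsilon,\; t\big) \big\|^2\Big],
    \qquad \bar{\alpha}_t = \textstyle\prod_{s=1}^{t} \alpha_s .
    \end{equation*}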
+
+
+
+
+ + ☆ UV-free Texture Generation with Denoising and Geodesic Heat Diffusions + + 📅 +
+ Seams, distortions, wasted UV space, vertex-duplication, and varying +resolution over the surface are the most prominent issues of the standard +UV-based texturing of meshes. These issues are particularly acute when +automatic UV-unwrapping techniques are used. For this reason, instead of +generating textures in automatically generated UV-planes like most +state-of-the-art methods, we propose to represent textures as coloured +point-clouds whose colours are generated by a denoising diffusion probabilistic +model constrained to operate on the surface of 3D objects. Our sampling and +resolution agnostic generative model heavily relies on heat diffusion over the +surface of the meshes for spatial communication between points. To enable +processing of arbitrarily sampled point-cloud textures and ensure long-distance +texture consistency we introduce a fast re-sampling of the mesh spectral +properties used during the heat diffusion and introduce a novel +heat-diffusion-based self-attention mechanism. Our code and pre-trained models +are available at github.com/simofoti/UV3-TeD. + +
+
+
+
+
+ + ☆ Reinforcement Learning without Human Feedback for Last Mile Fine-Tuning + of Large Language Models + + 📅 +
+ Reinforcement learning is used to align language models with human preference +signals after first pre-training the model to predict the next token of text +within a large corpus using likelihood maximization. Before being deployed in a +specific domain, models are often further fine-tuned on task specific data. +Since human preferences are often unavailable for the last step, it is +performed using likelihood maximization as that is the typical default method. +However, reinforcement learning has other advantages besides facilitating +alignment to a human derived reward function. For one, whereas likelihood +maximization is a form of imitation learning in which the model is trained on +what to do under ideal conditions, reinforcement learning is not limited to +demonstrating actions just for optimally reached states and trains a model what +to do under a range of scenarios as it explores the policy space. In addition, +it also trains a model what not to do, suppressing competitive but poor +actions. This work develops a framework for last-mile fine-tuning using +reinforcement learning and tests whether it garners performance gains. The +experiments center on abstractive summarization, but the framework is general +and broadly applicable. Use of the procedure produced significantly better +results than likelihood maximization when comparing raw predictions. For the +specific data tested, the gap could be bridged by employing post-processing of +the maximum likelihood outputs. Nonetheless, the framework offers a new avenue +for model optimization in situations where post-processing may be less +straightforward or effective, and it can be extended to include more complex +classes of undesirable outputs to penalize and train against, such as +hallucinations. + +
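A generic REINFORCE-style sequence-level fine-tuning step, assuming a Hugging Face-style causal LM and tokenizer, is sketched below; this is an illustration of the general idea, not the paper's specific framework or reward design:

    # Generic policy-gradient fine-tuning step on generated summaries (illustrative sketch only).
    import torch

    def policy_gradient_step(model, tokenizer, optimizer, article, reward_fn, max_new_tokens=64):
        inputs = tokenizer(article, return_tensors="pt")
        # Sample a candidate summary from the current policy.
        gen = model.generate(**inputs, do_sample=True, max_new_tokens=max_new_tokens)
        summary_ids = gen[:, inputs["input_ids"].shape[1]:]          # newly generated tokens
        reward = reward_fn(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

        # Log-probability of the sampled summary under the current policy.
        full = torch.cat([inputs["input_ids"], summary_ids], dim=1)
        logits = model(full).logits[:, :-1, :]
        logp = torch.log_softmax(logits, dim=-1).gather(2, full[:, 1:, None]).squeeze(-1)
        logp_summary = logp[:, inputs["input_ids"].shape[1] - 1:].sum()

        loss = -reward * logp_summary                                 # maximise expected reward
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        return reward

A penalty for undesirable outputs (e.g. hallucinated content) can be folded into `reward_fn`, which is where the framework's flexibility over plain likelihood maximization comes from.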
+
+
+
+
+ + ☆ A Gradient Analysis Framework for Rewarding Good and Penalizing Bad + Examples in Language Models + + 📅 +
+ Beyond maximum likelihood estimation (MLE), the standard objective of a +language model (LM) that maximizes the probability of good examples, many studies have +explored ways that also penalize bad examples for enhancing the quality of the output +distribution, including unlikelihood training, exponential maximizing average treatment +effect (ExMATE), and direct preference optimization (DPO). To systematically compare these +methods and further provide a unified recipe for LM optimization, in this paper we present +a unique angle of gradient analysis of loss functions that simultaneously reward good +examples and penalize bad ones in LMs. Through both mathematical results and experiments +on the CausalDialogue and Anthropic HH-RLHF datasets, we identify distinct functional +characteristics among these methods. We find that ExMATE serves as a superior surrogate +for MLE, and that combining DPO with ExMATE instead of MLE further enhances both the +statistical (5-7%) and generative (+18% win rate) performance. + +
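Two of the losses being compared have standard closed forms; for reference (standard formulations from the respective papers, not the new analysis itself):

    % Standard MLE and DPO objectives, shown for context.
    \begin{align*}
    \mathcal{L}_{\mathrm{MLE}}(\theta) &= -\,\mathbb{E}_{(x,\, y^{+})}\,\log \pi_\theta(y^{+} \mid x),\\
    \mathcal{L}_{\mathrm{DPO}}(\theta) &= -\,\mathbb{E}_{(x,\, y^{+},\, y^{-})}\,
      \log \sigma\!\left(\beta \log \frac{\pi_\theta(y^{+}\mid x)}{\pi_{\mathrm{ref}}(y^{+}\mid x)}
      - \beta \log \frac{\pi_\theta(y^{-}\mid x)}{\pi_{\mathrm{ref}}(y^{-}\mid x)}\right),
    \end{align*}
    % where $y^{+}$/$y^{-}$ are good/bad examples and $\pi_{\mathrm{ref}}$ is a frozen reference policy.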
+
+
+
+
+ + ☆ Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming + + 📅 +
+ Recent advances in language models have achieved significant progress. +GPT-4o, as a new milestone, has enabled real-time conversations with humans, +demonstrating near-human natural fluency. Such human-computer interaction +necessitates models with the capability to perform reasoning directly with the +audio modality and generate output in streaming. However, this remains beyond +the reach of current academic models, as they typically depend on extra TTS +systems for speech synthesis, resulting in undesirable latency. This paper +introduces the Mini-Omni, an audio-based end-to-end conversational model, +capable of real-time speech interaction. To achieve this capability, we propose +a text-instructed speech generation method, along with batch-parallel +strategies during inference to further boost the performance. Our method also +helps to retain the original model's language capabilities with minimal +degradation, enabling other works to establish real-time interaction +capabilities. We call this training method "Any Model Can Talk". We also +introduce the VoiceAssistant-400K dataset to fine-tune models optimized for +speech output. To our best knowledge, Mini-Omni is the first fully end-to-end, +open-source model for real-time speech interaction, offering valuable potential +for future research. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ A GREAT Architecture for Edge-Based Graph Problems Like TSP + + 📅 +
+ In the last years, many neural network-based approaches have been proposed to +tackle combinatorial optimization problems such as routing problems. Many of +these approaches are based on graph neural networks (GNNs) or related +transformers, operating on the Euclidean coordinates representing the routing +problems. However, GNNs are inherently not well suited to operate on dense +graphs, such as in routing problems. Furthermore, models operating on Euclidean +coordinates cannot be applied to non-Euclidean versions of routing problems +that are often found in real-world settings. To overcome these limitations, we +propose a novel GNN-related edge-based neural model called Graph Edge Attention +Network (GREAT). We evaluate the performance of GREAT in the +edge-classification task to predict optimal edges in the Traveling Salesman +Problem (TSP). We can use such a trained GREAT model to produce sparse TSP +graph instances, keeping only the edges GREAT finds promising. Compared to +other, non-learning-based methods to sparsify TSP graphs, GREAT can produce +very sparse graphs while keeping most of the optimal edges. Furthermore, we +build a reinforcement learning-based GREAT framework which we apply to +Euclidean and non-Euclidean asymmetric TSP. This framework achieves +state-of-the-art results. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ☆ Enhanced forecasting of stock prices based on variational mode + decomposition, PatchTST, and adaptive scale-weighted layer + + 📅 +
+ The significant fluctuations in stock index prices in recent years highlight +the critical need for accurate forecasting to guide investment and financial +strategies. This study introduces a novel composite forecasting framework that +integrates variational mode decomposition (VMD), PatchTST, and adaptive +scale-weighted layer (ASWL) to address these challenges. Utilizing datasets of +four major stock indices--SP500, DJI, SSEC, and FTSE--from 2000 to 2024, the +proposed method first decomposes the raw price series into intrinsic mode +functions (IMFs) using VMD. Each IMF is then modeled with PatchTST to capture +temporal patterns effectively. The ASWL module is applied to incorporate scale +information, enhancing prediction accuracy. The final forecast is derived by +aggregating predictions from all IMFs. The VMD-PatchTST-ASWL framework +demonstrates significant improvements in forecasting accuracy compared to +traditional models, showing robust performance across different indices. This +innovative approach provides a powerful tool for stock index price forecasting, +with potential applications in various financial analysis and investment +decision-making contexts. + +
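The decompose-forecast-aggregate pipeline described above can be summarised by the skeleton below; the decomposition and per-IMF forecaster are injected toy stand-ins, not the paper's VMD, PatchTST, or ASWL implementations:

    # Decompose-forecast-aggregate skeleton (illustrative stand-ins, not the paper's components).
    import numpy as np

    def forecast_composite(prices, decompose, fit_forecaster, horizon=5):
        """decompose: series -> list of IMFs; fit_forecaster: IMF -> callable(horizon)."""
        imfs = decompose(prices)                                    # e.g. VMD into intrinsic mode functions
        per_imf = np.stack([fit_forecaster(imf)(horizon) for imf in imfs])
        # The paper's ASWL learns per-scale weights; the simplest aggregation is a plain sum.
        return per_imf.sum(axis=0)

    # Toy usage with naive stand-ins for the decomposition and the forecaster.
    toy_decompose = lambda x: [x - x.mean(), np.full_like(x, x.mean())]
    toy_fit = lambda imf: (lambda h: np.repeat(imf[-1], h))         # last-value forecaster
    prices = np.cumsum(np.random.randn(200)) + 100.0
    print(forecast_composite(prices, toy_decompose, toy_fit, horizon=3))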
+
+
+
+
+ + ☆ SympGNNs: Symplectic Graph Neural Networks for identifying + high-dimensional Hamiltonian systems and node classification + + 📅 +
+ Existing neural network models for learning Hamiltonian systems, such as +SympNets, although accurate in low dimensions, struggle to learn the correct dynamics of +high-dimensional many-body systems. Herein, we introduce Symplectic Graph Neural Networks +(SympGNNs) that can effectively handle system identification in high-dimensional Hamiltonian +systems, as well as node classification. SympGNNs combine symplectic maps with permutation +equivariance, a property of graph neural networks. Specifically, we propose two variants of +SympGNNs: i) G-SympGNN and ii) LA-SympGNN, arising from different parameterizations of the +kinetic and potential energy. We demonstrate the capabilities of SympGNN on two physical +examples: a 40-particle coupled harmonic oscillator, and a 2000-particle molecular dynamics +simulation in a two-dimensional Lennard-Jones potential. Furthermore, we demonstrate the +performance of SympGNN on the node classification task, achieving accuracy comparable to the +state of the art. We also empirically show that SympGNN can overcome the oversmoothing and +heterophily problems, two key challenges in the field of graph neural networks. + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ CW-CNN & CW-AN: Convolutional Networks and Attention Networks for + CW-Complexes + + 📅 +
+ We present a novel framework for learning on CW-complex structured data +points. Recent advances have discussed CW-complexes as ideal learning +representations for problems in cheminformatics. However, there is a lack of +available machine learning methods suitable for learning on CW-complexes. In +this paper we develop notions of convolution and attention that are well +defined for CW-complexes. These notions enable us to create the first neural +network that can receive a CW-complex as input. We illustrate and interpret +this framework in the context of supervised prediction. + +
+
+
+
+
+ + ☆ A Catalog of Fairness-Aware Practices in Machine Learning Engineering + + 📅 +
+ Machine learning's widespread adoption in decision-making processes raises +concerns about fairness, particularly regarding the treatment of sensitive +features and potential discrimination against minorities. The software +engineering community has responded by developing fairness-oriented metrics, +empirical studies, and approaches. However, there remains a gap in +understanding and categorizing practices for engineering fairness throughout +the machine learning lifecycle. This paper presents a novel catalog of +practices for addressing fairness in machine learning derived from a systematic +mapping study. The study identifies and categorizes 28 practices from existing +literature, mapping them onto different stages of the machine learning +lifecycle. From this catalog, the authors extract actionable items and +implications for both researchers and practitioners in software engineering. +This work aims to provide a comprehensive resource for integrating fairness +considerations into the development and deployment of machine learning systems, +enhancing their reliability, accountability, and credibility. + +
+
+
+
+
+ + ☆ Entropic Distribution Matching in Supervised Fine-tuning of LLMs: Less + Overfitting and Better Diversity + + 📅 +
+ Large language models rely on Supervised Fine-Tuning (SFT) to specialize in +downstream tasks. Cross Entropy (CE) loss is the de facto choice in SFT, but it often leads +to overfitting and limited output diversity due to its aggressive updates to the data +distribution. This paper aims to address these issues by introducing the maximum entropy +principle, which favors models with flatter distributions that still effectively capture +the data. Specifically, we develop a new distribution matching method called GEM, which +solves reverse Kullback-Leibler divergence minimization with an entropy regularizer. For +the SFT of Llama-3-8B models, GEM outperforms CE in several aspects. First, when applied to +the UltraFeedback dataset to develop general instruction-following abilities, GEM exhibits +reduced overfitting, evidenced by lower perplexity and better performance on the IFEval +benchmark. Furthermore, GEM enhances output diversity, leading to performance gains of up +to 7 points on math reasoning and code generation tasks using best-of-n sampling, even +without domain-specific data. Second, when fine-tuning with domain-specific datasets for +math reasoning and code generation, GEM also shows less overfitting and improvements of up +to 10 points compared with CE. + +
+
+
+
+
+ + ☆ Iterative Graph Alignment + + 📅 +
+ By compressing diverse narratives, LLMs go beyond memorization, achieving +intelligence by capturing generalizable causal relationships. However, they +suffer from local 'representation gaps' due to insufficient training data +diversity, limiting their real-world utility, especially in tasks requiring +strict alignment to rules. Traditional alignment methods relying on heavy human +annotations are inefficient and unscalable. Recent self-alignment techniques +also fall short, as they often depend on self-selection based prompting and +memorization-based learning. To address these issues, we introduce Iterative +Graph Alignment (IGA), an annotation-free rule-based alignment algorithm. A +teacher model (VLM) employs Iterative Graph Prompting (IGP) to create logical +graphs and reference answers. The student model (LLM) identifies local +knowledge gaps by attempting to align its responses with these references, +collaborating with helper models to generate diverse answers. These aligned +responses are then used for iterative supervised fine-tuning (SFT). Our +evaluations across five rule-based scenarios demonstrate IGP's effectiveness, +with a 73.12\% alignment improvement in Claude Sonnet 3.5, and +Llama3-8B-Instruct achieving an 86.20\% improvement, outperforming Claude +Sonnet 3.5 in rule-based alignment. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Optimal Parallelization of Boosting + + 📅 +
+ Recent works on the parallel complexity of Boosting have established strong +lower bounds on the tradeoff between the number of training rounds $p$ and the +total parallel work per round $t$. These works have also presented highly +non-trivial parallel algorithms that shed light on different regions of this +tradeoff. Despite these advancements, a significant gap persists between the +theoretical lower bounds and the performance of these algorithms across much of +the tradeoff space. In this work, we essentially close this gap by providing +both improved lower bounds on the parallel complexity of weak-to-strong +learners, and a parallel Boosting algorithm whose performance matches these +bounds across the entire $p$ vs.~$t$ compromise spectrum, up to logarithmic +factors. Ultimately, this work settles the true parallel complexity of Boosting +algorithms that are nearly sample-optimal. + +
+
+
+
+
+ + ☆ Towards Efficient Modelling of String Dynamics: A Comparison of State + Space and Koopman based Deep Learning Methods + + 📅 +
+ This paper presents an examination of State Space Models (SSM) and +Koopman-based deep learning methods for modelling the dynamics of both linear +and non-linear stiff strings. Through experiments with datasets generated under +different initial conditions and sample rates, we assess the capacity of these +models to accurately model the complex behaviours observed in string dynamics. +Our findings indicate that our proposed Koopman-based model performs as well as +or better than other existing approaches in non-linear cases for long-sequence +modelling. + We inform the design of these architectures with the structure of the +problems at hand. Although challenges remain in extending model predictions +beyond the training horizon (i.e., extrapolation), the focus of our +investigation lies in the models' ability to generalise across different +initial conditions within the training time interval. This research contributes +insights into the physical modelling of dynamical systems (in particular those +addressing musical acoustics) by offering a comparative overview of these and +previous methods and introducing innovative strategies for model improvement. +Our results highlight the efficacy of these models in simulating non-linear +dynamics and emphasise their wide-ranging applicability in accurately modelling +dynamical systems over extended sequences. + +
+
+ comment: Accepted to DAFx2024 +
+
+
+
+
+ + ☆ 3D Pose-Based Temporal Action Segmentation for Figure Skating: A + Fine-Grained and Jump Procedure-Aware Annotation Approach + + 📅 +
+ Understanding human actions from videos is essential in many domains, +including sports. In figure skating, technical judgments are performed by watching skaters' +3D movements, and part of the judging procedure can be regarded as a Temporal Action +Segmentation (TAS) task. TAS tasks in figure skating, which automatically assign temporal +semantics to video, are actively researched. However, there is a lack of datasets and +effective methods for TAS tasks requiring 3D pose data. In this study, we first created the +FS-Jump3D dataset of complex and dynamic figure skating jumps using optical markerless +motion capture. We also propose a new fine-grained figure skating jump TAS dataset +annotation method with which TAS models can learn jump procedures. In the experimental +results, we validated the usefulness of 3D pose features as input and of the fine-grained +dataset for the TAS model in figure skating. The FS-Jump3D dataset is available at +https://github.com/ryota-skating/FS-Jump3D. + +
+
+ comment: 10 pages, 7th ACM International Workshop on Multimedia Content + Analysis in Sports +
+
+
+
+
+ + ☆ Turbulence Strength $C_n^2$ Estimation from Video using Physics-based + Deep Learning + + 📅 +
+ Images captured from a long distance suffer from dynamic image distortion due +to turbulent flow of air cells with random temperatures, and thus refractive indices. This +phenomenon, known as image dancing, is commonly characterized by its refractive-index +structure constant $C_n^2$ as a measure of the turbulence strength. For many applications, +such as atmospheric forecast models, long-range/astronomy imaging, aviation safety, and +optical communication technology, $C_n^2$ estimation is critical for accurately sensing the +turbulent environment. Previous methods for $C_n^2$ estimation include estimation from +meteorological data (temperature, relative humidity, wind shear, etc.) for single-point +measurements, two-ended pathlength measurements from an optical scintillometer for +path-averaged $C_n^2$, and more recently estimating $C_n^2$ from passive video cameras for +low cost and hardware complexity. In this paper, we present a comparative analysis of +classical image gradient methods for $C_n^2$ estimation and modern deep learning-based +methods leveraging convolutional neural networks. To enable this, we collect a dataset of +video captures along with reference scintillometer measurements for ground truth, and we +release this unique dataset to the scientific community. We observe that deep learning +methods can achieve higher accuracy when trained on similar data, but suffer from +generalization errors to other, unseen imagery as compared to classical methods. To +overcome this trade-off, we present a novel physics-based network architecture that +combines learned convolutional layers with a differentiable image gradient method, +maintaining high accuracy while being generalizable across image datasets. + +
+
+ comment: Code Available: https://github.com/Riponcs/Cn2Estimation +
+
+
+
+
+ + ☆ Towards Infusing Auxiliary Knowledge for Distracted Driver Detection KDD + + 📅 +
+ Distracted driving is a leading cause of road accidents globally. +Identification of distracted driving involves reliably detecting and classifying various +forms of driver distraction (e.g., texting, eating, or using in-car devices) from in-vehicle +camera feeds to enhance road safety. This task is challenging due to the need for robust +models that can generalize to a diverse set of driver behaviors without requiring extensive +annotated datasets. In this paper, we propose KiD3, a novel method for distracted driver +detection (DDD) by infusing auxiliary knowledge about semantic relations between entities +in a scene and the structural configuration of the driver's pose. Specifically, we construct +a unified framework that integrates scene graphs and driver pose information with the visual +cues in video frames to create a holistic representation of the driver's actions. Our +results indicate that KiD3 achieves a 13.64% accuracy improvement over the vision-only +baseline by incorporating such auxiliary knowledge with visual information. + +
+
+ comment: Accepted at KiL 2024: Workshop on Knowledge-infused Learning + co-located with 30th ACM KDD Conference +
+
+
+
+
+ + ☆ Hyperdimensional Vector Tsetlin Machines with Applications to Sequence + Learning and Generation + + 📅 +
+ We construct a two-layered model for learning and generating sequential data +that is both computationally fast and competitive with vanilla Tsetlin machines, adding +numerous advantages. Through the use of hyperdimensional vector computing (HVC) algebras +and Tsetlin machine clause structures, we demonstrate that the combination of the two +inherits the generality of data encoding and decoding of HVC together with the fast, +interpretable nature of Tsetlin machines, yielding a powerful machine learning model. We +apply the approach to forecasting, generating new sequences, and classification. For the +latter, we derive results for the entire UCR Time Series Archive and compare with standard +benchmarks to see how well the method competes in time series classification. + +
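The HVC primitives the model builds on are textbook hyperdimensional-computing operations; a minimal sketch (standard bipolar vectors with permutation for position, not the paper's exact encoding) is:

    # Textbook hyperdimensional-computing primitives (not the paper's exact scheme).
    import numpy as np

    D = 10_000
    rng = np.random.default_rng(0)
    item_memory = {}

    def item_vec(symbol):
        if symbol not in item_memory:                   # random bipolar codebook entry per symbol
            item_memory[symbol] = rng.choice([-1, 1], size=D)
        return item_memory[symbol]

    def encode_sequence(symbols):
        """Bundle position-permuted item vectors into one sequence hypervector."""
        acc = np.zeros(D)
        for pos, s in enumerate(symbols):
            acc += np.roll(item_vec(s), pos)            # cyclic shift encodes position
        return np.sign(acc)

    def similarity(a, b):
        return float(a @ b) / D                         # normalised dot product

    x = encode_sequence("ABCD")
    print(similarity(x, encode_sequence("ABCD")))       # close to 1.0
    print(similarity(x, encode_sequence("DCBA")))       # close to 0.0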
+
+
+
+
+ + ☆ Blending Low and High-Level Semantics of Time Series for Better Masked + Time Series Generation + + 📅 +
+ State-of-the-art approaches in time series generation (TSG), such as +TimeVQVAE, utilize vector quantization-based tokenization to effectively model +complex distributions of time series. These approaches first learn to transform +time series into a sequence of discrete latent vectors, and then a prior model +is learned to model the sequence. The discrete latent vectors, however, only +capture low-level semantics (\textit{e.g.,} shapes). We hypothesize that +higher-fidelity time series can be generated by training a prior model on more +informative discrete latent vectors that contain both low and high-level +semantics (\textit{e.g.,} characteristic dynamics). In this paper, we introduce +a novel framework, termed NC-VQVAE, to integrate self-supervised learning into +those TSG methods to derive a discrete latent space where low and high-level +semantics are captured. Our experimental results demonstrate that NC-VQVAE +results in a considerable improvement in the quality of synthetic samples. + +
+
+
+
+
+ + ☆ Data Quality Monitoring through Transfer Learning on Anomaly Detection + for the Hadron Calorimeters + + 📅 +
+ The proliferation of sensors brings an immense volume of spatio-temporal (ST) +data in many domains for various purposes, including monitoring, diagnostics, +and prognostics applications. Data curation is a time-consuming process for a +large volume of data, making it challenging and expensive to deploy data +analytics platforms in new environments. Transfer learning (TL) mechanisms +promise to mitigate data sparsity and model complexity by utilizing pre-trained +models for a new task. Despite the triumph of TL in fields like computer vision +and natural language processing, efforts on complex ST models for anomaly +detection (AD) applications are limited. In this study, we present the +potential of TL within the context of AD for the Hadron Calorimeter of the +Compact Muon Solenoid experiment at CERN. We have transferred the ST AD models +trained on data collected from one part of a calorimeter to another. We have +investigated different configurations of TL on semi-supervised autoencoders of +the ST AD models -- transferring convolutional, graph, and recurrent neural +networks of both the encoder and decoder networks. The experiment results +demonstrate that TL effectively enhances the model learning accuracy on a +target subdetector. The TL achieves promising data reconstruction and AD +performance while substantially reducing the trainable parameters of the AD +models. It also improves robustness against anomaly contamination in the +training data sets of the semi-supervised AD models. + +
+
+ comment: 28 pages, 15 figures, and 9 tables +
+
+
+
+
+ + ☆ Subspace Representation Learning for Sparse Linear Arrays to Localize + More Sources than Sensors: A Deep Learning Methodology + + 📅 +
+ Localizing more sources than sensors with a sparse linear array (SLA) has +long relied on minimizing a distance between two covariance matrices and recent +algorithms often utilize semidefinite programming (SDP). Although deep neural +network (DNN)-based methods offer new alternatives, they still depend on +covariance matrix fitting. In this paper, we develop a novel methodology that +estimates the co-array subspaces from a sample covariance for SLAs. Our +methodology trains a DNN to learn signal and noise subspace representations +that are invariant to the selection of bases. To learn such representations, we +propose loss functions that gauge the separation between the desired and the +estimated subspace. In particular, we propose losses that measure the length of +the shortest path between subspaces viewed on a union of Grassmannians, and +prove that it is possible for a DNN to approximate signal subspaces. The +computation of learning subspaces of different dimensions is accelerated by a +new batch sampling strategy called consistent rank sampling. The methodology is +robust to array imperfections due to its geometry-agnostic and data-driven +nature. In addition, we propose a fully end-to-end gridless approach that +directly learns angles to study the possibility of bypassing subspace methods. +Numerical results show that learning such subspace representations is more +beneficial than learning covariances or angles. It outperforms conventional +SDP-based methods such as the sparse and parametric approach (SPA) and existing +DNN-based covariance reconstruction methods for a wide range of signal-to-noise +ratios (SNRs), snapshots, and source numbers for both perfect and imperfect +arrays. + +
+
+ comment: 13 pages. Submitted to the IEEE Transactions on Signal Processing +
+
+
+
+
+ + ☆ sEMG-Driven Physics-Informed Gated Recurrent Networks for Modeling Upper + Limb Multi-Joint Movement Dynamics + + 📅 +
+ Exoskeletons and rehabilitation systems offer great potential for enhancing +human strength and recovery through advanced human-machine interfaces (HMIs) +that adapt to movement dynamics. However, the real-time application of +physics-informed neural networks (PINNs) is limited by their reliance on fixed +input lengths and surrogate models. This study introduces a novel +physics-informed Gated Recurrent Network (PiGRN) designed to predict +multi-joint torques using surface electromyography (sEMG) data. The PiGRN model +employs a Gated Recurrent Unit (GRU) to convert time-series sEMG inputs into +multi-joint kinematics and external loads, which are then integrated into an +equation of motion to ensure consistency with physical laws. Experimental +validation with sEMG data from five participants performing elbow +flexion-extension tasks showed that the PiGRN model accurately predicted joint +torques for 10 unfamiliar movements, with RMSE values between 4.02\% and +11.40\% and correlation coefficients ranging from 0.87 to 0.98. These findings +highlight the PiGRN's potential for real-time exoskeleton and rehabilitation +applications. Future research will explore more diverse datasets, improve +musculoskeletal models, and investigate unsupervised learning methods. + +
+
+
+
+
+ + ☆ High-Dimensional Sparse Data Low-rank Representation via Accelerated + Asynchronous Parallel Stochastic Gradient Descent + + 📅 +
+ Data characterized by high dimensionality and sparsity are commonly used to +describe real-world node interactions. Low-rank representation (LR) can map +high-dimensional sparse (HDS) data to low-dimensional feature spaces and infer node +interactions via modeling data latent associations. Unfortunately, existing optimization +algorithms for LR models are computationally inefficient and slowly convergent on +large-scale datasets. To address this issue, this paper proposes an Accelerated Asynchronous +Parallel Stochastic Gradient Descent (A2PSGD) for High-Dimensional Sparse Data Low-rank +Representation with three-fold ideas: a) establishing a lock-free scheduler to +simultaneously respond to scheduling requests from multiple threads; b) introducing a +greedy algorithm-based load balancing strategy for balancing the computational load among +threads; c) incorporating Nesterov's accelerated gradient into the learning scheme to +accelerate model convergence. Empirical studies show that A2PSGD outperforms existing +optimization algorithms for HDS data LR in both accuracy and training time. + +
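The Nesterov step folded into the learning scheme follows the standard accelerated gradient form (generic notation for context, not the paper's exact update rule):

    % Standard Nesterov accelerated gradient update, shown for context.
    \begin{align*}
    v_{t+1} &= \mu\, v_t - \eta\, \nabla f\!\left(\theta_t + \mu\, v_t\right),\\
    \theta_{t+1} &= \theta_t + v_{t+1},
    \end{align*}
    % where $\mu$ is the momentum coefficient and $\eta$ the learning rate; the gradient is
    % evaluated at the look-ahead point $\theta_t + \mu v_t$ rather than at $\theta_t$.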
+
+
+
+
+ + ☆ CrisperWhisper: Accurate Timestamps on Verbatim Speech Transcriptions INTERSPEECH2024 + + 📅 +
+ We demonstrate that carefully adjusting the tokenizer of the Whisper speech +recognition model significantly improves the precision of word-level timestamps when +applying dynamic time warping to the decoder's cross-attention scores. We fine-tune the +model to produce more verbatim speech transcriptions and employ several techniques to +increase robustness against multiple speakers and background noise. These adjustments +achieve state-of-the-art performance on benchmarks for verbatim speech transcription, word +segmentation, and the timed detection of filler events, and can further mitigate +transcription hallucinations. The code is openly available at +https://github.com/nyrahealth/CrisperWhisper. + +
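The dynamic time warping step itself is standard; a generic sketch over a token-by-frame cost matrix is shown below. Building the cost from Whisper's cross-attention is the paper's idea; the specific cost construction here (1 minus attention weight) is an assumption for illustration:

    # Generic DTW over a token-by-frame cost matrix (illustrative; cost construction is assumed).
    import numpy as np

    def dtw_path(cost):
        n, m = cost.shape
        acc = np.full((n + 1, m + 1), np.inf)
        acc[0, 0] = 0.0
        for i in range(1, n + 1):
            for j in range(1, m + 1):
                acc[i, j] = cost[i - 1, j - 1] + min(acc[i - 1, j], acc[i, j - 1], acc[i - 1, j - 1])
        # Backtrack the cheapest monotonic alignment of tokens to audio frames.
        i, j, path = n, m, []
        while i > 0 and j > 0:
            path.append((i - 1, j - 1))
            step = np.argmin([acc[i - 1, j - 1], acc[i - 1, j], acc[i, j - 1]])
            i, j = (i - 1, j - 1) if step == 0 else ((i - 1, j) if step == 1 else (i, j - 1))
        return path[::-1]

    attention = np.random.rand(5, 40)            # tokens x audio frames (stand-in scores)
    print(dtw_path(1.0 - attention)[:3])         # first few (token, frame) matches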
+
+ comment: Published at INTERSPEECH2024 +
+
+
+
+
+ + ☆ Transformers Meet ACT-R: Repeat-Aware and Sequential Listening Session + Recommendation RecSys'2024 + + 📅 +
+ Music streaming services often leverage sequential recommender systems to +predict the best music to showcase to users based on past sequences of +listening sessions. Nonetheless, most sequential recommendation methods ignore +or insufficiently account for repetitive behaviors. This is a crucial +limitation for music recommendation, as repeatedly listening to the same song +over time is a common phenomenon that can even change the way users perceive +this song. In this paper, we introduce PISA (Psychology-Informed Session +embedding using ACT-R), a session-level sequential recommender system that +overcomes this limitation. PISA employs a Transformer architecture learning +embedding representations of listening sessions and users using attention +mechanisms inspired by Anderson's ACT-R (Adaptive Control of Thought-Rational), +a cognitive architecture modeling human information access and memory dynamics. +This approach enables us to capture dynamic and repetitive patterns from user +behaviors, allowing us to effectively predict the songs they will listen to in +subsequent sessions, whether they are repeated or new ones. We demonstrate the +empirical relevance of PISA using both publicly available listening data from +Last.fm and proprietary data from Deezer, a global music streaming service, +confirming the critical importance of repetition modeling for sequential +listening session recommendation. Along with this paper, we publicly release +our proprietary dataset to foster future research in this field, as well as the +source code of PISA to facilitate its future use. + +
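The ACT-R memory dynamics that PISA draws on are usually summarised by the base-level activation equation (standard ACT-R form, shown for context; PISA adapts the idea inside its attention mechanism rather than using it verbatim):

    % Standard ACT-R base-level activation, shown for context.
    \begin{equation*}
    B_i \;=\; \ln\!\left( \sum_{j=1}^{n} t_j^{-d} \right),
    \end{equation*}
    % where $t_j$ is the time elapsed since the $j$-th exposure to item $i$ and $d$ is a decay
    % parameter: items listened to frequently and recently receive higher activation $B_i$.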
+
+ comment: 11 pages. Accepted by RecSys'2024, full paper +
+
+
+
+
+ + ☆ Seeking the Sufficiency and Necessity Causal Features in Multimodal + Representation Learning + + 📅 +
+ Learning representations with a high Probability of Necessary and Sufficient +Causes (PNS) has been shown to enhance deep learning models' ability. This task +involves identifying causal features that are both sufficient (guaranteeing the +outcome) and necessary (without which the outcome cannot occur). However, +current research predominantly focuses on unimodal data, and extending PNS +learning to multimodal settings presents significant challenges. The challenges +arise as the conditions for PNS identifiability, Exogeneity and Monotonicity, +need to be reconsidered in a multimodal context, where sufficient and necessary +causal features are distributed across different modalities. To address this, +we first propose conceptualizing multimodal representations as comprising +modality-invariant and modality-specific components. We then analyze PNS +identifiability for each component, while ensuring non-trivial PNS estimation. +Finally, we formulate tractable optimization objectives that enable multimodal +models to learn high-PNS representations, thereby enhancing their predictive +performance. Experiments demonstrate the effectiveness of our method on both +synthetic and real-world data. + +
+
+
+
+
+ + ☆ An Adaptive Latent Factorization of Tensors Model for Embedding Dynamic + Communication Network + + 📅 +
+ The Dynamic Communication Network (DCN) describes the interactions over time +among various communication nodes, and it is widely used in big-data applications as a data +source. As the number of communication nodes increases and temporal slots accumulate, while +each node interacts with only a few nodes in a given temporal slot, the DCN can be +represented by a High-Dimensional Sparse (HDS) tensor. In order to extract rich behavioral +patterns from an HDS tensor in a DCN, this paper proposes an Adaptive Temporal-dependent +Tensor low-rank representation (ATT) model. It adopts a three-fold approach: a) designing a +temporal-dependent method to reconstruct the temporal feature matrix, thereby precisely +representing the data by capturing the temporal patterns; b) achieving hyper-parameter +adaptation of the model via Differential Evolutionary Algorithms (DEA) to avoid tedious +hyper-parameter tuning; c) employing nonnegative learning schemes for the model parameters +to effectively handle the nonnegativity inherent in HDS data. The experimental results on +four real-world DCNs demonstrate that the proposed ATT model significantly outperforms +several state-of-the-art models in both prediction errors and convergence rounds. + +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ Identifying Terrain Physical Parameters from Vision -- Towards + Physical-Parameter-Aware Locomotion and Navigation + + 📅 +
+ Identifying the physical properties of the surrounding environment is +essential for robotic locomotion and navigation to deal with non-geometric +hazards, such as slippery and deformable terrains. It would be of great benefit +for robots to anticipate these extreme physical properties before contact; +however, estimating environmental physical parameters from vision is still an +open challenge. Animals can achieve this by using their prior experience and +knowledge of what they have seen and how it felt. In this work, we propose a +cross-modal self-supervised learning framework for vision-based environmental +physical parameter estimation, which paves the way for future +physical-property-aware locomotion and navigation. We bridge the gap between +existing policies trained in simulation and identification of physical terrain +parameters from vision. We propose to train a physical decoder in simulation to +predict friction and stiffness from multi-modal input. The trained network +allows the labeling of real-world images with physical parameters in a +self-supervised manner to further train a visual network during deployment, +which can densely predict the friction and stiffness from image data. We +validate our physical decoder in simulation and the real world using a +quadruped ANYmal robot, outperforming an existing baseline method. We show that +our visual network can predict the physical properties in indoor and outdoor +experiments while allowing fast adaptation to new environments. + +
+
+
+
+
+ + ☆ Android Malware Detection Based on RGB Images and Multi-feature Fusion + + 📅 +
+ With the widespread adoption of smartphones, Android malware has become a +significant challenge in the field of mobile device security. Current Android +malware detection methods often rely on feature engineering to construct +dynamic or static features, which are then used for learning. However, static +feature-based methods struggle to counter code obfuscation, packing, and +signing techniques, while dynamic feature-based methods involve time-consuming +feature extraction. Image-based methods for Android malware detection offer +better resilience against malware variants and polymorphic malware. This paper +proposes an end-to-end Android malware detection technique based on RGB images +and multi-feature fusion. The approach involves extracting Dalvik Executable +(DEX) files, AndroidManifest.xml files, and API calls from APK files, +converting them into grayscale images, and enhancing their texture features +using Canny edge detection, histogram equalization, and adaptive thresholding +techniques. These grayscale images are then combined into an RGB image +containing multi-feature fusion information, which is analyzed using mainstream +image classification models for Android malware detection. Extensive +experiments demonstrate that the proposed method effectively captures Android +malware characteristics, achieving an accuracy of up to 97.25%, outperforming +existing detection methods that rely solely on DEX files as classification +features. Additionally, ablation experiments confirm the effectiveness of using +the three key files for feature representation in the proposed approach. + +
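The channel-construction step can be sketched with OpenCV as below; APK parsing is omitted, and the image width, enhancement parameters, and which file feeds which channel are assumptions for illustration rather than the paper's exact settings:

    # Illustrative three-channel fusion sketch (assumed parameters, not the paper's pipeline).
    import numpy as np
    import cv2

    def bytes_to_gray(data: bytes, width: int = 256) -> np.ndarray:
        arr = np.frombuffer(data, dtype=np.uint8)
        arr = arr[: (len(arr) // width) * width]              # trim to a full rectangle
        return arr.reshape(-1, width).copy()

    def build_rgb(dex_bytes: bytes, manifest_bytes: bytes, api_bytes: bytes) -> np.ndarray:
        h = min(len(b) // 256 for b in (dex_bytes, manifest_bytes, api_bytes))
        dex, man, api = (bytes_to_gray(b)[:h] for b in (dex_bytes, manifest_bytes, api_bytes))
        r = cv2.Canny(dex, 100, 200)                          # edge/texture enhancement
        g = cv2.equalizeHist(man)                             # histogram equalization
        b = cv2.adaptiveThreshold(api, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                  cv2.THRESH_BINARY, 11, 2)   # adaptive thresholding
        return cv2.merge([r, g, b])                           # fused image for the image classifier

    rgb = build_rgb(np.random.bytes(65536), np.random.bytes(65536), np.random.bytes(65536))
    print(rgb.shape)                                          # (rows, 256, 3)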
+
+ comment: 9 pages,10 figures +
+
+
+
+
+ + ☆ Super-Resolution works for coastal simulations + + 📅 +
+ Learning fine-scale details of a coastal ocean simulation from a coarse +representation is a challenging task. For real-world applications, +high-resolution simulations are necessary to advance understanding of many +coastal processes, specifically, to predict flooding resulting from tsunamis +and storm surges. We propose a Deep Network for Coastal Super-Resolution +(DNCSR) for spatiotemporal enhancement to efficiently learn the high-resolution +numerical solution. Given images of coastal simulations produced on +low-resolution computational meshes using low polynomial order discontinuous +Galerkin discretizations and a coarse temporal resolution, the proposed DNCSR +learns to produce high-resolution free surface elevation and velocity +visualizations in both time and space. To efficiently model the dynamic changes +over time and space, we propose grid-aware spatiotemporal attention to project +the temporal features to the spatial domain for non-local feature matching. The +coordinate information is also utilized via positional encoding. For the final +reconstruction, we use the spatiotemporal bilinear operation to interpolate the +missing frames and then expand the feature maps to the frequency domain for +residual mapping. Besides data-driven losses, the proposed physics-informed +loss guarantees gradient consistency and momentum changes. Their combination +contributes to the overall 24% improvements in RMSE. To train the proposed +model, we propose a large-scale coastal simulation dataset and use it for model +optimization and evaluation. Our method shows superior super-resolution quality +and fast computation compared to the state-of-the-art methods. + +
+
+ comment: 13 pages, 12 figures +
+
+
+
+
+ + ☆ Statistical and Geometrical properties of regularized Kernel + Kullback-Leibler divergence + + 📅 +
+ In this paper, we study the statistical and geometrical properties of the +Kullback-Leibler divergence with kernel covariance operators (KKL) introduced by Bach +[2022]. Unlike the classical Kullback-Leibler (KL) divergence that involves density ratios, +the KKL compares probability distributions through covariance operators (embeddings) in a +reproducing kernel Hilbert space (RKHS), and computes the Kullback-Leibler quantum +divergence. This novel divergence hence shares parallel but different aspects with both the +standard Kullback-Leibler divergence between probability distributions and kernel embedding +metrics such as the maximum mean discrepancy. A limitation of the original KKL divergence +is that it cannot be defined for distributions with disjoint supports. To solve this +problem, we propose in this paper a regularised variant that guarantees that the divergence +is well defined for all distributions. We derive bounds that quantify the deviation of the +regularised KKL from the original one, as well as finite-sample bounds. In addition, we +provide a closed-form expression for the regularised KKL, specifically applicable when the +distributions consist of finite sets of points, which makes it implementable. Furthermore, +we derive a Wasserstein gradient descent scheme of the KKL divergence in the case of +discrete distributions, and study empirically its properties for transporting a set of +points to a target distribution. + +
+
+
+
+
+ + ☆ SALSA: Speedy ASR-LLM Synchronous Aggregation INTERSPEECH 2024 + + 📅 +
+ Harnessing pre-trained LLMs to improve ASR systems, particularly for +low-resource languages, is now an emerging area of research. Existing methods +range from using LLMs for ASR error correction to tightly coupled systems that +replace the ASR decoder with the LLM. These approaches either increase decoding +time or require expensive training of the cross-attention layers. We propose +SALSA, which couples the decoder layers of the ASR to the LLM decoder, while +synchronously advancing both decoders. Such coupling is performed with a simple +projection of the last decoder state, and is thus significantly more training +efficient than earlier approaches. A challenge of our proposed coupling is +handling the mismatch between the tokenizers of the LLM and ASR systems. We +handle this mismatch using cascading tokenization with respect to the LLM and +ASR vocabularies. We evaluate SALSA on 8 low-resource languages in the FLEURS +benchmark, yielding substantial WER reductions of up to 38%. + +
+
+ comment: Accepted to INTERSPEECH 2024 +
+
+
+
+
+ + ☆ SFR-GNN: Simple and Fast Robust GNNs against Structural Attacks + + 📅 +
+ Graph Neural Networks (GNNs) have demonstrated commendable performance for +graph-structured data. Yet, GNNs are often vulnerable to adversarial structural attacks as +embedding generation relies on graph topology. Existing efforts are dedicated to purifying +the maliciously modified structure or applying adaptive aggregation, thereby enhancing +robustness against adversarial structural attacks. A defender inevitably incurs heavy +computational costs due to the lack of prior knowledge about the modified structures. To +this end, we propose an efficient defense method, called Simple and Fast Robust Graph +Neural Network (SFR-GNN), supported by mutual information theory. SFR-GNN first pre-trains +a GNN model using node attributes and then fine-tunes it over the modified graph in the +manner of contrastive learning, which avoids purifying modified structures and adaptive +aggregation, thus achieving great efficiency gains. Consequently, SFR-GNN exhibits a +24%--162% speedup compared to advanced robust models, demonstrating superior robustness for +node classification tasks. + +
+
+
+
+
+ + ☆ TinyTNAS: GPU-Free, Time-Bound, Hardware-Aware Neural Architecture + Search for TinyML Time Series Classification + + 📅 +
+ In this work, we present TinyTNAS, a novel hardware-aware multi-objective +Neural Architecture Search (NAS) tool specifically designed for TinyML time series +classification. Unlike traditional NAS methods that rely on GPU capabilities, TinyTNAS +operates efficiently on CPUs, making it accessible for a broader range of applications. +Users can define constraints on RAM, FLASH, and MAC operations to discover optimal neural +network architectures within these parameters. Additionally, the tool allows for time-bound +searches, ensuring the best possible model is found within a user-specified duration. By +experimenting with the benchmark datasets UCI HAR, PAMAP2, WISDM, MIT-BIH, and the PTB +Diagnostic ECG Database, TinyTNAS demonstrates state-of-the-art accuracy with significant +reductions in RAM, FLASH, and MAC usage, as well as latency. For example, on the UCI HAR +dataset, TinyTNAS achieves a 12x reduction in RAM usage, a 144x reduction in MAC +operations, and a 78x reduction in FLASH memory while maintaining superior accuracy and +reducing latency by 149x. Similarly, on the PAMAP2 and WISDM datasets, it achieves a 6x +reduction in RAM usage, a 40x reduction in MAC operations, an 83x reduction in FLASH, and a +67x reduction in latency, all while maintaining superior accuracy. Notably, the search +process completes within 10 minutes in a CPU environment. These results highlight +TinyTNAS's capability to optimize neural network architectures effectively for +resource-constrained TinyML applications, ensuring both efficiency and high performance. +The code for TinyTNAS is available at the GitHub repository and can be accessed at +https://github.com/BidyutSaha/TinyTNAS.git. + +
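The constraint-filtered, time-bounded search loop described above can be summarised by the skeleton below; the cost estimators and search space are toy stand-ins, not the actual TinyTNAS implementation from the linked repository:

    # Constraint-filtered, time-bounded NAS skeleton (toy stand-ins, not TinyTNAS itself).
    import itertools, time

    def estimate_cost(filters, layers, input_len=128, channels=3):
        params = sum(filters * filters * 3 for _ in range(layers)) + filters * channels * 3
        macs = params * input_len            # crude proxy for 1-D conv multiply-accumulates
        ram = filters * input_len * 4        # bytes for the widest activation map
        return params * 4, ram, macs         # FLASH (bytes), RAM (bytes), MACs

    def search(max_flash, max_ram, max_macs, time_budget_s, evaluate):
        best, best_acc = None, -1.0
        deadline = time.time() + time_budget_s
        for filters, layers in itertools.product([8, 16, 32, 64], [1, 2, 3, 4]):
            if time.time() > deadline:
                break                        # respect the user-specified time bound
            flash, ram, macs = estimate_cost(filters, layers)
            if flash > max_flash or ram > max_ram or macs > max_macs:
                continue                     # hardware-aware pruning before any training
            acc = evaluate(filters, layers)  # briefly train/validate the candidate
            if acc > best_acc:
                best, best_acc = (filters, layers), acc
        return best, best_acc

    toy_eval = lambda f, l: 0.5 + 0.01 * f / (1 + l)   # stand-in for real validation accuracy
    print(search(64_000, 20_000, 5_000_000, 60, toy_eval))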
+
+
+
+
+ + ☆ WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio + Language Modeling + + 📅 +
+ Language models have been effectively applied to modeling natural signals, +such as images, video, speech, and audio. A crucial component of these models +is the codec tokenizer, which compresses high-dimensional natural signals into +lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer, +which offers several advantages over previous SOTA acoustic codec models in the +audio domain: 1)extreme compression. By compressing the layers of quantizers +and the temporal dimension of the discrete codec, one-second audio of 24kHz +sampling rate requires only a single quantizer with 40 or 75 tokens. 2)improved +subjective quality. Despite the reduced number of tokens, WavTokenizer achieves +state-of-the-art reconstruction quality with outstanding UTMOS scores and +inherently contains richer semantic information. Specifically, we achieve these +results by designing a broader VQ space, extended contextual windows, and +improved attention networks, as well as introducing a powerful multi-scale +discriminator and an inverse Fourier transform structure. We conducted +extensive reconstruction experiments in the domains of speech, audio, and +music. WavTokenizer exhibited strong performance across various objective and +subjective metrics compared to state-of-the-art models. We also tested semantic +information, VQ utilization, and adaptability to generative models. +Comprehensive ablation studies confirm the necessity of each module in +WavTokenizer. The related code, demos, and pre-trained models are available at +https://github.com/jishengpeng/WavTokenizer. + +
+
+ comment: Working in progress. arXiv admin note: text overlap with + arXiv:2402.12208 +
+
+
+
+
+ + ☆ Multitask learning for improved scour detection: A dynamic wave tank + study + + 📅 +
+ Population-based structural health monitoring (PBSHM), aims to share +information between members of a population. An offshore wind (OW) farm could +be considered as a population of nominally-identical wind-turbine structures. +However, benign variations exist among members, such as geometry, sea-bed +conditions and temperature differences. These factors could influence +structural properties and therefore the dynamic response, making it more +difficult to detect structural problems via traditional SHM techniques. + This paper explores the use of a Bayesian hierarchical model as a means of +multitask learning, to infer foundation stiffness distribution parameters at +both population and local levels. To do this, observations of natural frequency +from populations of structures were first generated from both numerical and +experimental models. These observations were then used in a partially-pooled +Bayesian hierarchical model in tandem with surrogate FE models of the +structures to infer foundation stiffness parameters. Finally, it is +demonstrated how the learned parameters may be used as a basis to perform more +robust anomaly detection (as compared to a no-pooling approach) e.g. as a +result of scour. + +
+
+ comment: 25 pages, 12 figures, early work features in ISWHM 2023 conference + proceedings and available here: arXiv:2402.19295. Submitted to the Renewable + Energy journal +
+
+
+
+
+ + ☆ Adaptive Variational Continual Learning via Task-Heuristic Modelling + + +
+ Variational continual learning (VCL) is a turn-key learning algorithm that +has state-of-the-art performance among the best continual learning models. In +our work, we explore an extension of the generalized variational continual +learning (GVCL) model, named AutoVCL, which combines task heuristics for +informed learning and model optimization. We demonstrate that our model +outperforms the standard GVCL with fixed hyperparameters, benefiting from the +automatic adjustment of the hyperparameter based on the difficulty and +similarity of the incoming task compared to the previous tasks. + +
+
+ comment: 4 pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ On-device AI: Quantization-aware Training of Transformers in Time-Series + + +
+ Artificial Intelligence (AI) models for time-series in pervasive computing +keep getting larger and more complicated. The Transformer model is by far the +most compelling of these AI models. However, it is difficult to obtain the +desired performance when deploying such a massive model on a sensor device with +limited resources. My research focuses on optimizing the Transformer model for +time-series forecasting tasks. The optimized model will be deployed as hardware +accelerators on embedded Field Programmable Gate Arrays (FPGAs). I will +investigate the impact of applying Quantization-aware Training to the +Transformer model to reduce its size and runtime memory footprint while +maximizing the advantages of FPGAs. + +
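Quantization-aware training commonly simulates low-precision arithmetic with a fake-quantization node and a straight-through estimator so gradients can still flow. The sketch below shows that generic building block in PyTorch; the bit width, scaling scheme, and toy feed-forward block are illustrative assumptions rather than the FPGA tool flow used in this work.

```python
# Minimal fake-quantization layer with a straight-through estimator
# (a generic QAT building block; not the specific flow used in the paper).
import torch
import torch.nn as nn

class FakeQuant(nn.Module):
    def __init__(self, bits=8):
        super().__init__()
        self.qmin, self.qmax = 0, 2 ** bits - 1

    def forward(self, x):
        scale = (x.max() - x.min()).clamp(min=1e-8) / (self.qmax - self.qmin)
        zero_point = (-x.min() / scale).round()
        q = torch.clamp((x / scale + zero_point).round(), self.qmin, self.qmax)
        dq = (q - zero_point) * scale
        # Straight-through estimator: forward uses dq, backward uses identity.
        return x + (dq - x).detach()

# Example: quantize activations of a tiny feed-forward block.
block = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), FakeQuant(8), nn.Linear(32, 16))
out = block(torch.randn(4, 16))
out.sum().backward()  # gradients flow through the fake-quant node
```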
+
+ comment: This paper has been accepted by the 2023 IEEE International
+ Conference on Pervasive Computing and Communications (PhD Forum)
+
+
+
+
+
+ + ☆ An Exploratory Deep Learning Approach for Predicting Subsequent Suicidal + Acts in Chinese Psychological Support Hotlines + + +
+ Psychological support hotlines are an effective suicide prevention measure
+that typically relies on professionals using suicide risk assessment scales to
+predict individual risk scores. However, the accuracy of scale-based predictive
+methods for suicide risk assessment can vary widely depending on the expertise
+of the operator. This limitation underscores the need for more reliable
+methods, prompting this research's innovative exploration of the use of
+artificial intelligence to improve the accuracy and efficiency of suicide risk
+prediction within the context of psychological support hotlines. The study
+included data from 1,549 subjects from 2015-2017 in China who contacted a
+psychological support hotline. Each participant was followed for 12 months to
+identify instances of suicidal behavior. We proposed a novel multi-task
+learning method that uses the large-scale pre-trained model Whisper for feature
+extraction and fits psychological scales while predicting the risk of suicide.
+The proposed method yields a 2.4-percentage-point improvement in F1-score
+compared to the traditional manual approach based on the psychological scales.
+Our model demonstrated superior performance compared to the other eight popular
+models. To our knowledge, this study is the first to apply deep learning to
+long-term speech data to predict suicide risk in China, indicating great
+potential for clinical applications. The source code is publicly available at:
+\url{https://github.com/songchangwei/Suicide-Risk-Prediction}.
+
+
+
+
+
+ + ☆ HYGENE: A Diffusion-based Hypergraph Generation Method + + +
+ Hypergraphs are powerful mathematical structures that can model complex, +high-order relationships in various domains, including social networks, +bioinformatics, and recommender systems. However, generating realistic and +diverse hypergraphs remains challenging due to their inherent complexity and +lack of effective generative models. In this paper, we introduce a +diffusion-based Hypergraph Generation (HYGENE) method that addresses these +challenges through a progressive local expansion approach. HYGENE works on the +bipartite representation of hypergraphs, starting with a single pair of +connected nodes and iteratively expanding it to form the target hypergraph. At +each step, nodes and hyperedges are added in a localized manner using a +denoising diffusion process, which allows for the construction of the global +structure before refining local details. Our experiments demonstrated the +effectiveness of HYGENE, proving its ability to closely mimic a variety of +properties in hypergraphs. To the best of our knowledge, this is the first +attempt to employ deep learning models for hypergraph generation, and our work +aims to lay the groundwork for future research in this area. + +
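The bipartite representation HYGENE operates on can be illustrated directly: every hyperedge becomes an extra node connected to the vertices it contains. A minimal conversion sketch follows (the diffusion model itself is not shown, and the helper name is hypothetical).

```python
# Bipartite (incidence) representation of a hypergraph: every hyperedge becomes
# an extra node linked to the vertices it contains. This is only the data
# representation HYGENE operates on, not the diffusion model itself.
def hypergraph_to_bipartite(num_vertices, hyperedges):
    edges = []
    for e_idx, members in enumerate(hyperedges):
        hyperedge_node = num_vertices + e_idx   # hyperedge nodes follow vertex nodes
        edges += [(v, hyperedge_node) for v in members]
    return num_vertices + len(hyperedges), edges

# Example: 4 vertices, two hyperedges {0, 1, 2} and {2, 3}.
n_nodes, bipartite_edges = hypergraph_to_bipartite(4, [[0, 1, 2], [2, 3]])
print(n_nodes)          # 6 nodes in the bipartite graph
print(bipartite_edges)  # [(0, 4), (1, 4), (2, 4), (2, 5), (3, 5)]
```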
+
+ comment: arXiv admin note: text overlap with arXiv:2312.11529 by other authors +
+
+
+
+
+ + ☆ Do Recommender Systems Promote Local Music? A Reproducibility Study + Using Music Streaming Data + + +
+ This paper examines the influence of recommender systems on local music +representation, discussing prior findings from an empirical study on the LFM-2b +public dataset. This prior study argued that different recommender systems +exhibit algorithmic biases shifting music consumption either towards or against +local content. However, LFM-2b users do not reflect the diverse audience of +music streaming services. To assess the robustness of this study's conclusions, +we conduct a comparative analysis using proprietary listening data from a +global music streaming service, which we publicly release alongside this paper. +We observe significant differences in local music consumption patterns between +our dataset and LFM-2b, suggesting that caution should be exercised when +drawing conclusions on local music based solely on LFM-2b. Moreover, we show +that the algorithmic biases exhibited in the original work vary in our dataset, +and that several unexplored model parameters can significantly influence these +biases and affect the study's conclusion on both datasets. Finally, we discuss +the complexity of accurately labeling local music, emphasizing the risk of +misleading conclusions due to unreliable, biased, or incomplete labels. To +encourage further research and ensure reproducibility, we have publicly shared +our dataset and code. + +
+
+
+
+
+ + ☆ Gradient-free variational learning with conditional mixture networks + + +
+ Balancing computational efficiency with robust predictive performance is +crucial in supervised learning, especially for critical applications. Standard +deep learning models, while accurate and scalable, often lack probabilistic +features like calibrated predictions and uncertainty quantification. Bayesian +methods address these issues but can be computationally expensive as model and +data complexity increase. Previous work shows that fast variational methods can +reduce the compute requirements of Bayesian methods by eliminating the need for +gradient computation or sampling, but are often limited to simple models. We +demonstrate that conditional mixture networks (CMNs), a probabilistic variant +of the mixture-of-experts (MoE) model, are suitable for fast, gradient-free +inference and can solve complex classification tasks. CMNs employ linear +experts and a softmax gating network. By exploiting conditional conjugacy and +P\'olya-Gamma augmentation, we furnish Gaussian likelihoods for the weights of +both the linear experts and the gating network. This enables efficient +variational updates using coordinate ascent variational inference (CAVI), +avoiding traditional gradient-based optimization. We validate this approach by +training two-layer CMNs on standard benchmarks from the UCI repository. Our +method, CAVI-CMN, achieves competitive and often superior predictive accuracy +compared to maximum likelihood estimation (MLE) with backpropagation, while +maintaining competitive runtime and full posterior distributions over all model +parameters. Moreover, as input size or the number of experts increases, +computation time scales competitively with MLE and other gradient-based +solutions like black-box variational inference (BBVI), making CAVI-CMN a +promising tool for deep, fast, and gradient-free Bayesian networks. + +
+
+ comment: 16 pages main text (3 figures), including references. 9 pages + supplementary material (5 figures) +
+
+
+
+
+ + ☆ A Comparative Study of Hyperparameter Tuning Methods + + +
+ The study emphasizes the challenge of finding the optimal trade-off between
+bias and variance, especially as hyperparameter optimization increases in
+complexity. Through empirical analysis, three hyperparameter tuning algorithms,
+Tree-structured Parzen Estimator (TPE), Genetic Search, and Random Search, are
+evaluated across regression and classification tasks. The results show that
+nonlinear models, with properly tuned hyperparameters, significantly outperform
+linear models. Interestingly, Random Search excelled in regression tasks, while
+TPE was more effective for classification tasks. This suggests that there is no
+one-size-fits-all solution, as different algorithms perform better depending on
+the task and model type. The findings underscore the importance of selecting
+the appropriate tuning method and highlight the computational challenges
+involved in optimizing machine learning models, particularly as search spaces
+expand.
+
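As a concrete, hedged illustration of how two of these tuners can be compared on the same search space, the snippet below runs Optuna's TPE and random samplers on a toy quadratic objective; the objective, trial budget, and search ranges are placeholders, not the chapter's experimental setup.

```python
# Comparing TPE and Random Search on a toy objective with Optuna
# (illustrative setup only; the chapter's actual tasks and models differ).
import optuna

def objective(trial):
    x = trial.suggest_float("x", -10.0, 10.0)
    y = trial.suggest_float("y", -10.0, 10.0)
    return (x - 2.0) ** 2 + (y + 1.0) ** 2   # minimize a simple quadratic

for name, sampler in [("TPE", optuna.samplers.TPESampler(seed=0)),
                      ("Random", optuna.samplers.RandomSampler(seed=0))]:
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=50)
    print(name, study.best_value)
```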
+
+ comment: This chapter has been accepted for the edited volume titled "Data
+ Science in Theory and Practice", edited by J. Sen & S. Roy Choudhury. The
+ volume is expected to be published in October 2024 by Cambridge Scholars
+ Publishing, Newcastle upon Tyne, UK. The chapter is 34 pages long and contains
+ 11 tables and 8 images
+
+
+
+
+
+ + ☆ Fourier Spectral Physics Informed Neural Network: An Efficient and + Low-Memory PINN + + +
+ With growing investigations into solving partial differential equations by
+physics-informed neural networks (PINNs), more accurate and efficient PINNs are
+required to meet the practical demands of scientific computing. One bottleneck
+of current PINNs is computing the high-order derivatives via automatic
+differentiation, which often necessitates substantial computing resources. In
+this paper, we focus on removing the automatic differentiation of the spatial
+derivatives and propose a spectral-based neural network that substitutes the
+differential operator with a multiplication. Compared to standard PINNs, our
+approach requires lower memory and shorter training time. Thanks to the
+exponential convergence of the spectral basis, our approach is more accurate.
+Moreover, to handle the different situations between the physical domain and
+the spectral domain, we provide two strategies to train networks by their
+spectral information. Through a series of comprehensive experiments, we
+validate the aforementioned merits of our proposed network.
+
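The core substitution, differentiation replaced by multiplication in Fourier space, can be checked in a few lines for a periodic 1D function: multiplying the FFT coefficients by ik and transforming back recovers the spatial derivative to near machine precision. This shows only the spectral-differentiation step, not the proposed network.

```python
# Spectral differentiation on a periodic domain: differentiation in physical
# space becomes multiplication by i*k in Fourier space. This is the operation
# that replaces automatic differentiation of spatial derivatives.
import numpy as np

n, L = 256, 2 * np.pi
x = np.linspace(0.0, L, n, endpoint=False)
u = np.sin(3 * x)                                # test function

k = 2 * np.pi * np.fft.fftfreq(n, d=L / n)       # angular wavenumbers
du_spectral = np.fft.ifft(1j * k * np.fft.fft(u)).real

print(np.max(np.abs(du_spectral - 3 * np.cos(3 * x))))  # ~1e-13
```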
+
+
+
+
+ + ☆ DeepSPoC: A Deep Learning-Based PDE Solver Governed by Sequential + Propagation of Chaos + + +
+ Sequential propagation of chaos (SPoC) is a recently developed tool to solve
+mean-field stochastic differential equations and their related nonlinear
+Fokker-Planck equations. Based on the theory of SPoC, we present a new method
+(deepSPoC) that combines the interacting particle system of SPoC and deep
+learning. Under the framework of deepSPoC, two classes of frequently used deep
+models, fully connected neural networks and normalizing flows, are considered.
+For high-dimensional problems, a spatially adaptive method is designed to
+further improve the accuracy and efficiency of deepSPoC. We analyze the
+convergence of the deepSPoC framework under some simplified conditions and also
+provide an a posteriori error estimate for the algorithm. Finally, we test our
+method on a wide range of different types of mean-field equations.
+
+
+
+
+
+ + ☆ Illuminating the Diversity-Fitness Trade-Off in Black-Box Optimization + + +
+ In real-world applications, users often favor structurally diverse design +choices over one high-quality solution. It is hence important to consider more +solutions that decision-makers can compare and further explore based on +additional criteria. Alongside the existing approaches of evolutionary +diversity optimization, quality diversity, and multimodal optimization, this +paper presents a fresh perspective on this challenge by considering the problem +of identifying a fixed number of solutions with a pairwise distance above a +specified threshold while maximizing their average quality. + We obtain first insight into these objectives by performing a subset +selection on the search trajectories of different well-established search +heuristics, whether specifically designed with diversity in mind or not. We +emphasize that the main goal of our work is not to present a new algorithm but +to look at the problem in a more fundamental and theoretically tractable way by +asking the question: What trade-off exists between the minimum distance within +batches of solutions and the average quality of their fitness? These insights +also provide us with a way of making general claims concerning the properties +of optimization problems that shall be useful in turn for benchmarking +algorithms of the approaches enumerated above. + A possibly surprising outcome of our empirical study is the observation that +naive uniform random sampling establishes a very strong baseline for our +problem, hardly ever outperformed by the search trajectories of the considered +heuristics. We interpret these results as a motivation to develop algorithms +tailored to produce diverse solutions of high average quality. + +
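A hedged sketch of the subset-selection question studied here: from a pool of candidate solutions, greedily keep points in order of fitness while enforcing a minimum pairwise distance, then report the average quality of the retained batch. The toy fitness function, distance threshold, and greedy rule are illustrative assumptions, not the paper's benchmark setup.

```python
# Greedy sketch of the studied subset-selection problem: from a pool of
# candidate solutions, pick k points with pairwise distance >= d_min while
# (heuristically) maximizing their average fitness.
import numpy as np

def greedy_diverse_subset(points, fitness, k, d_min):
    order = np.argsort(-fitness)                 # best fitness first
    chosen = []
    for i in order:
        if all(np.linalg.norm(points[i] - points[j]) >= d_min for j in chosen):
            chosen.append(i)
        if len(chosen) == k:
            break
    return chosen

rng = np.random.default_rng(0)
pts = rng.uniform(size=(500, 5))
fit = -np.sum((pts - 0.5) ** 2, axis=1)          # toy fitness: closeness to centre
idx = greedy_diverse_subset(pts, fit, k=10, d_min=0.4)
print(len(idx), fit[idx].mean())
```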
+
+
+
+
+ + ☆ TempoKGAT: A Novel Graph Attention Network Approach for Temporal Graph + Analysis + + +
+ Graph neural networks (GNNs) have shown significant capabilities in handling
+structured data, yet their application to dynamic, temporal data remains
+limited. This paper presents a new type of graph attention network, called
+TempoKGAT, which combines time-decaying weights with a selective neighbor
+aggregation mechanism in the spatial domain to help uncover latent patterns in
+the graph data. In this approach, a top-k neighbor selection based on the edge
+weights is introduced to represent the evolving features of the graph data. We
+evaluated the performance of TempoKGAT on multiple datasets from the traffic,
+energy, and health sectors involving spatio-temporal data, and compared it to
+several state-of-the-art methods on several open-source datasets. Our method
+shows superior accuracy on all datasets. These results indicate that TempoKGAT
+builds on existing methodologies to optimize prediction accuracy and provide
+new insights into model interpretation in temporal contexts.
+
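The two ingredients named above, time-decaying edge weights and top-k neighbor selection, can be sketched independently of the attention layers. In the toy snippet below, the exponential decay rate and the use of a dense weight matrix are assumptions for illustration only; the full TempoKGAT attention and message passing are omitted.

```python
# Sketch of time-decayed edge weights followed by top-k neighbour selection
# (the attention and message passing of the full TempoKGAT model are omitted).
import numpy as np

def decayed_topk_neighbors(weights, timestamps, t_now, k, decay=0.1):
    """weights[i, j]: static edge weight; timestamps[i, j]: last interaction time."""
    decayed = weights * np.exp(-decay * (t_now - timestamps))
    topk = np.argsort(-decayed, axis=1)[:, :k]       # k strongest neighbours per node
    mask = np.zeros_like(decayed, dtype=bool)
    np.put_along_axis(mask, topk, True, axis=1)
    return np.where(mask, decayed, 0.0)              # pruned, time-aware adjacency

rng = np.random.default_rng(0)
W = rng.uniform(size=(5, 5))
T = rng.uniform(0, 10, size=(5, 5))
print(decayed_topk_neighbors(W, T, t_now=10.0, k=2))
```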
+
+
+
+
+ + ☆ Addressing Common Misinterpretations of KART and UAT in Neural Network + Literature + + +
+ This note addresses the Kolmogorov-Arnold Representation Theorem (KART) and +the Universal Approximation Theorem (UAT), focusing on their common +misinterpretations in some papers related to neural network approximation. Our +remarks aim to support a more accurate understanding of KART and UAT among +neural network specialists. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ TG-PhyNN: An Enhanced Physically-Aware Graph Neural Network framework + for forecasting Spatio-Temporal Data + + +
+ Accurately forecasting dynamic processes on graphs, such as traffic flow or +disease spread, remains a challenge. While Graph Neural Networks (GNNs) excel +at modeling and forecasting spatio-temporal data, they often lack the ability +to directly incorporate underlying physical laws. This work presents TG-PhyNN, +a novel Temporal Graph Physics-Informed Neural Network framework. TG-PhyNN +leverages the power of GNNs for graph-based modeling while simultaneously +incorporating physical constraints as a guiding principle during training. This +is achieved through a two-step prediction strategy that enables the calculation +of physical equation derivatives within the GNN architecture. Our findings +demonstrate that TG-PhyNN significantly outperforms traditional forecasting +models (e.g., GRU, LSTM, GAT) on real-world spatio-temporal datasets like +PedalMe (traffic flow), COVID-19 spread, and Chickenpox outbreaks. These +datasets are all governed by well-defined physical principles, which TG-PhyNN +effectively exploits to offer more reliable and accurate forecasts in various +domains where physical processes govern the dynamics of data. This paves the +way for improved forecasting in areas like traffic flow prediction, disease +outbreak prediction, and potentially other fields where physics plays a crucial +role. + +
+
+
+
+
+ + ☆ Machine learning models for daily rainfall forecasting in Northern + Tropical Africa using tropical wave predictors + + +
+ Numerical weather prediction (NWP) models often underperform compared to +simpler climatology-based precipitation forecasts in northern tropical Africa, +even after statistical postprocessing. AI-based forecasting models show promise +but have avoided precipitation due to its complexity. Synoptic-scale forcings +like African easterly waves and other tropical waves (TWs) are important for +predictability in tropical Africa, yet their value for predicting daily +rainfall remains unexplored. This study uses two machine-learning models--gamma +regression and a convolutional neural network (CNN)--trained on TW predictors +from satellite-based GPM IMERG data to predict daily rainfall during the +July-September monsoon season. Predictor variables are derived from the local +amplitude and phase information of seven TW from the target and +up-and-downstream neighboring grids at 1-degree spatial resolution. The ML +models are combined with Easy Uncertainty Quantification (EasyUQ) to generate +calibrated probabilistic forecasts and are compared with three benchmarks: +Extended Probabilistic Climatology (EPC15), ECMWF operational ensemble forecast +(ENS), and a probabilistic forecast from the ENS control member using EasyUQ +(CTRL EasyUQ). The study finds that downstream predictor variables offer the +highest predictability, with downstream tropical depression (TD)-type +wave-based predictors being most important. Other waves like mixed-Rossby +gravity (MRG), Kelvin, and inertio-gravity waves also contribute significantly +but show regional preferences. ENS forecasts exhibit poor skill due to +miscalibration. CTRL EasyUQ shows improvement over ENS and marginal enhancement +over EPC15. Both gamma regression and CNN forecasts significantly outperform +benchmarks in tropical Africa. This study highlights the potential of ML models +trained on TW-based predictors to improve daily precipitation forecasts in +tropical Africa. + +
+
+
+
+
+ + ☆ Do Graph Neural Networks Work for High Entropy Alloys? + + +
+ Graph neural networks (GNNs) have excelled in predictive modeling for both +crystals and molecules, owing to the expressiveness of graph representations. +High-entropy alloys (HEAs), however, lack chemical long-range order, limiting +the applicability of current graph representations. To overcome this challenge, +we propose a representation of HEAs as a collection of local environment (LE) +graphs. Based on this representation, we introduce the LESets machine learning +model, an accurate, interpretable GNN for HEA property prediction. We +demonstrate the accuracy of LESets in modeling the mechanical properties of +quaternary HEAs. Through analyses and interpretation, we further extract +insights into the modeling and design of HEAs. In a broader sense, LESets +extends the potential applicability of GNNs to disordered materials with +combinatorial complexity formed by diverse constituents and their flexible +configurations. + +
+
+
+
+
+ + ☆ GL-TSVM: A robust and smooth twin support vector machine with guardian + loss function + + +
+ Twin support vector machine (TSVM), a variant of support vector machine +(SVM), has garnered significant attention due to its $3/4$ times lower +computational complexity compared to SVM. However, due to the utilization of +the hinge loss function, TSVM is sensitive to outliers or noise. To remedy it, +we introduce the guardian loss (G-loss), a novel loss function distinguished by +its asymmetric, bounded, and smooth characteristics. We then fuse the proposed +G-loss function into the TSVM and yield a robust and smooth classifier termed +GL-TSVM. Further, to adhere to the structural risk minimization (SRM) principle +and reduce overfitting, we incorporate a regularization term into the objective +function of GL-TSVM. To address the optimization challenges of GL-TSVM, we +devise an efficient iterative algorithm. The experimental analysis on UCI and +KEEL datasets substantiates the effectiveness of the proposed GL-TSVM in +comparison to the baseline models. Moreover, to showcase the efficacy of the +proposed GL-TSVM in the biomedical domain, we evaluated it on the breast cancer +(BreaKHis) and schizophrenia datasets. The outcomes strongly demonstrate the +competitiveness of the proposed GL-TSVM against the baseline models. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.18101 +
+
+
+
+
+ + ☆ Self-Improving Diffusion Models with Synthetic Data + + +
+ The artificial intelligence (AI) world is running out of real data for +training increasingly large generative models, resulting in accelerating +pressure to train on synthetic data. Unfortunately, training new generative +models with synthetic data from current or past generation models creates an +autophagous (self-consuming) loop that degrades the quality and/or diversity of +the synthetic data in what has been termed model autophagy disorder (MAD) and +model collapse. Current thinking around model autophagy recommends that +synthetic data is to be avoided for model training lest the system deteriorate +into MADness. In this paper, we take a different tack that treats synthetic +data differently from real data. Self-IMproving diffusion models with Synthetic +data (SIMS) is a new training concept for diffusion models that uses +self-synthesized data to provide negative guidance during the generation +process to steer a model's generative process away from the non-ideal synthetic +data manifold and towards the real data distribution. We demonstrate that SIMS +is capable of self-improvement; it establishes new records based on the +Fr\'echet inception distance (FID) metric for CIFAR-10 and ImageNet-64 +generation and achieves competitive results on FFHQ-64 and ImageNet-512. +Moreover, SIMS is, to the best of our knowledge, the first prophylactic +generative AI algorithm that can be iteratively trained on self-generated +synthetic data without going MAD. As a bonus, SIMS can adjust a diffusion +model's synthetic data distribution to match any desired in-domain target +distribution to help mitigate biases and ensure fairness. + +
+
+
+
+
+ + ☆ Minimising changes to audit when updating decision trees + + +
+ Interpretable models are important, but what happens when the model is +updated on new training data? We propose an algorithm for updating a decision +tree while minimising the number of changes to the tree that a human would need +to audit. We achieve this via a greedy approach that incorporates the number of +changes to the tree as part of the objective function. We compare our algorithm +to existing methods and show that it sits in a sweet spot between final +accuracy and number of changes to audit. + +
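A toy rendering of the trade-off in the objective: each candidate update is scored by its accuracy gain minus a penalty proportional to the number of changed nodes a human would have to re-audit. The penalty weight and candidate list below are hypothetical; the paper's greedy algorithm and change-counting rules are more detailed.

```python
# Toy version of an audit-aware update objective: accuracy gain minus a penalty
# on the number of changed tree nodes a human would need to re-audit.
# (Illustrative only; not the paper's exact greedy procedure.)
def update_score(new_accuracy, old_accuracy, num_changed_nodes, lam=0.01):
    return (new_accuracy - old_accuracy) - lam * num_changed_nodes

candidates = [
    {"name": "retrain from scratch", "acc": 0.91, "changes": 40},
    {"name": "adjust two thresholds", "acc": 0.89, "changes": 2},
    {"name": "grow one new subtree", "acc": 0.90, "changes": 7},
]
old_acc = 0.87
best = max(candidates, key=lambda c: update_score(c["acc"], old_acc, c["changes"]))
print(best["name"])   # picks the update in the accuracy/audit-effort sweet spot
```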
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Passenger hazard perception based on EEG signals for highly automated + driving vehicles + + +
+ Enhancing the safety of autonomous vehicles is crucial, especially given +recent accidents involving automated systems. As passengers in these vehicles, +humans' sensory perception and decision-making can be integrated with +autonomous systems to improve safety. This study explores neural mechanisms in +passenger-vehicle interactions, leading to the development of a Passenger +Cognitive Model (PCM) and the Passenger EEG Decoding Strategy (PEDS). Central +to PEDS is a novel Convolutional Recurrent Neural Network (CRNN) that captures +spatial and temporal EEG data patterns. The CRNN, combined with stacking +algorithms, achieves an accuracy of $85.0\% \pm 3.18\%$. Our findings highlight +the predictive power of pre-event EEG data, enhancing the detection of +hazardous scenarios and offering a network-driven framework for safer +autonomous vehicles. + +
+
+
+
+
+ + ☆ Physics of Language Models: Part 2.2, How to Learn From Mistakes on + Grade-School Math Problems + + +
+ Language models have demonstrated remarkable performance in solving reasoning +tasks; however, even the strongest models still occasionally make reasoning +mistakes. Recently, there has been active research aimed at improving reasoning +accuracy, particularly by using pretrained language models to "self-correct" +their mistakes via multi-round prompting. In this paper, we follow this line of +work but focus on understanding the usefulness of incorporating +"error-correction" data directly into the pretraining stage. This data consists +of erroneous solution steps immediately followed by their corrections. Using a +synthetic math dataset, we show promising results: this type of pretrain data +can help language models achieve higher reasoning accuracy directly (i.e., +through simple auto-regression, without multi-round prompting) compared to +pretraining on the same amount of error-free data. We also delve into many +details, such as (1) how this approach differs from beam search, (2) how such +data can be prepared, (3) whether masking is needed on the erroneous tokens, +(4) the amount of error required, (5) whether such data can be deferred to the +fine-tuning stage, and many others. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.20311 +
+
+
+
+
+ + ☆ Flexible framework for generating synthetic electrocardiograms and + photoplethysmograms + + +
+ By generating synthetic biosignals, the quantity and variety of health data +can be increased. This is especially useful when training machine learning +models by enabling data augmentation and introduction of more physiologically +plausible variation to the data. For these purposes, we have developed a +synthetic biosignal model for two signal modalities, electrocardiography (ECG) +and photoplethysmography (PPG). The model produces realistic signals that +account for physiological effects such as breathing modulation and changes in +heart rate due to physical stress. Arrhythmic signals can be generated with +beat intervals extracted from real measurements. The model also includes a +flexible approach to adding different kinds of noise and signal artifacts. The +noise is generated from power spectral densities extracted from both measured +noisy signals and modeled power spectra. Importantly, the model also +automatically produces labels for noise, segmentation (e.g. P and T waves, QRS +complex, for electrocardiograms), and artifacts. We assessed how this +comprehensive model can be used in practice to improve the performance of +models trained on ECG or PPG data. For example, we trained an LSTM to detect +ECG R-peaks using both real ECG signals from the MIT-BIH arrythmia set and our +new generator. The F1 score of the model was 0.83 using real data, in +comparison to 0.98 using our generator. In addition, the model can be used for +example in signal segmentation, quality detection and bench-marking detection +algorithms. The model code has been released in +\url{https://github.com/UTU-Health-Research/framework_for_synthetic_biosignals} + +
+
+
+
+
+ + ☆ OpenFGL: A Comprehensive Benchmarks for Federated Graph Learning + + +
+ Federated graph learning (FGL) has emerged as a promising distributed +training paradigm for graph neural networks across multiple local systems +without direct data sharing. This approach is particularly beneficial in +privacy-sensitive scenarios and offers a new perspective on addressing +scalability challenges in large-scale graph learning. Despite the proliferation +of FGL, the diverse motivations from practical applications, spanning various +research backgrounds and experimental settings, pose a significant challenge to +fair evaluation. To fill this gap, we propose OpenFGL, a unified benchmark +designed for the primary FGL scenarios: Graph-FL and Subgraph-FL. Specifically, +OpenFGL includes 38 graph datasets from 16 application domains, 8 federated +data simulation strategies that emphasize graph properties, and 5 graph-based +downstream tasks. Additionally, it offers 18 recently proposed SOTA FGL +algorithms through a user-friendly API, enabling a thorough comparison and +comprehensive evaluation of their effectiveness, robustness, and efficiency. +Empirical results demonstrate the ability of FGL while also revealing its +potential limitations, offering valuable insights for future exploration in +this thriving field. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Near-Optimal Policy Identification in Robust Constrained Markov Decision + Processes via Epigraph Form + + +
+ Designing a safe policy for uncertain environments is crucial in real-world +control applications. However, this challenge remains inadequately addressed +within the Markov decision process (MDP) framework. This paper presents the +first algorithm capable of identifying a near-optimal policy in a robust +constrained MDP (RCMDP), where an optimal policy minimizes cumulative cost +while satisfying constraints in the worst-case scenario across a set of +environments. We first prove that the conventional Lagrangian max-min +formulation with policy gradient methods can become trapped in suboptimal +solutions by encountering a sum of conflicting gradients from the objective and +constraint functions during its inner minimization problem. To address this, we +leverage the epigraph form of the RCMDP problem, which resolves the conflict by +selecting a single gradient from either the objective or the constraints. +Building on the epigraph form, we propose a binary search algorithm with a +policy gradient subroutine and prove that it identifies an +$\varepsilon$-optimal policy in an RCMDP with +$\tilde{\mathcal{O}}(\varepsilon^{-4})$ policy evaluations. + +
+
+
+
+
+ + ☆ ART: Actually Robust Training + + +
+ Current interest in deep learning captures the attention of many programmers +and researchers. Unfortunately, the lack of a unified schema for developing +deep learning models results in methodological inconsistencies, unclear +documentation, and problems with reproducibility. Some guidelines have been +proposed, yet currently, they lack practical implementations. Furthermore, +neural network training often takes on the form of trial and error, lacking a +structured and thoughtful process. To alleviate these issues, in this paper, we +introduce Art, a Python library designed to help automatically impose rules and +standards while developing deep learning pipelines. Art divides model +development into a series of smaller steps of increasing complexity, each +concluded with a validation check improving the interpretability and robustness +of the process. The current version of Art comes equipped with nine predefined +steps inspired by Andrej Karpathy's Recipe for Training Neural Networks, a +visualization dashboard, and integration with loggers such as Neptune. The code +related to this paper is available at: +https://github.com/SebChw/Actually-Robust-Training. + +
+
+
+
+
+ + ☆ Enhancing Customer Churn Prediction in Telecommunications: An Adaptive + Ensemble Learning Approach + + +
+ Customer churn, the discontinuation of services by existing customers, poses
+a significant challenge to the telecommunications industry. This paper proposes
+a novel adaptive ensemble learning framework for highly accurate customer churn
+prediction. The framework integrates multiple base models, including XGBoost,
+LightGBM, LSTM, a Multi-Layer Perceptron (MLP) neural network, and a Support
+Vector Machine (SVM). These models are strategically combined using a stacking
+ensemble method, further enhanced by meta-feature generation from base model
+predictions. A rigorous data preprocessing pipeline, coupled with a
+multi-faceted feature engineering approach, optimizes model performance. The
+framework is evaluated on three publicly available telecom churn datasets,
+demonstrating substantial accuracy improvements over state-of-the-art
+techniques. The research achieves a remarkable 99.28% accuracy, signifying a
+major advancement in churn prediction. The implications of this research for
+developing proactive customer retention strategies within the
+telecommunications industry are discussed.
+
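The stacking mechanism itself is easy to sketch with scikit-learn: base learners are fitted, their out-of-fold predicted probabilities become meta-features, and a meta-learner combines them. The snippet uses stand-in base models and synthetic, imbalanced data purely for illustration; the framework described above combines XGBoost, LightGBM, an LSTM, an MLP, and an SVM on real telecom datasets.

```python
# Minimal stacking-ensemble sketch with scikit-learn (stand-in base learners
# and synthetic data; illustrative of the mechanism, not the paper's framework).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

X, y = make_classification(n_samples=2000, n_features=20, weights=[0.8, 0.2],
                           random_state=0)       # imbalanced, churn-like labels
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)

stack = StackingClassifier(
    estimators=[("rf", RandomForestClassifier(n_estimators=200, random_state=0)),
                ("svm", SVC(probability=True, random_state=0)),
                ("mlp", MLPClassifier(hidden_layer_sizes=(64,), max_iter=500,
                                      random_state=0))],
    final_estimator=LogisticRegression(),        # meta-learner on base predictions
    stack_method="predict_proba", cv=5)
stack.fit(X_tr, y_tr)
print("held-out accuracy:", stack.score(X_te, y_te))
```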
+
+ comment: 12 pages,2 figures +
+
+
+
+
+ + ☆ Web Service QoS Prediction via Extended Canonical Polyadic-based Tensor + Network + + +
+ Today, numerous web services with similar functionalities are available on
+the Internet. Users often evaluate the Quality of Service (QoS) to choose the
+best option among them. Predicting the QoS values of these web services is a
+significant challenge in the field of web services. A Canonical Polyadic
+(CP)-based tensor network model has proven to be efficient for predicting
+dynamic QoS data. However, current CP-based tensor network models do not
+consider the correlation of users and services in the low-dimensional latent
+feature space, thereby limiting the model's prediction capability. To tackle
+this issue, this paper proposes an Extended Canonical Polyadic-based Tensor
+Network (ECTN) model. It models the correlation of users and services by
+building a relation dimension between user features and service features in the
+low-dimensional space, and then designs an extended CP decomposition structure
+to improve prediction accuracy. Experiments are conducted on two public dynamic
+QoS datasets, and the results show that, compared with state-of-the-art QoS
+prediction models, ECTN obtains higher prediction accuracy.
+
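The CP building block underlying such models can be shown compactly: a user x service x time QoS tensor is approximated as a sum of rank-one components formed from three factor matrices. Only this standard reconstruction is sketched below (with arbitrary sizes); ECTN's additional relation dimension between user and service features is not reproduced.

```python
# CP (Canonical Polyadic) building block behind such QoS models: a
# user x service x time tensor approximated as a sum of rank-one components
# from three factor matrices.
import numpy as np

rng = np.random.default_rng(0)
n_users, n_services, n_times, rank = 30, 20, 10, 4
U = rng.normal(size=(n_users, rank))       # user latent features
S = rng.normal(size=(n_services, rank))    # service latent features
T = rng.normal(size=(n_times, rank))       # time latent features

# CP reconstruction: X[i, j, k] = sum_r U[i, r] * S[j, r] * T[k, r]
X_hat = np.einsum("ir,jr,kr->ijk", U, S, T)
print(X_hat.shape)                          # predicted QoS tensor, (30, 20, 10)
```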
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ On Convergence of Average-Reward Q-Learning in Weakly Communicating + Markov Decision Processes + + +
+ This paper analyzes reinforcement learning (RL) algorithms for Markov +decision processes (MDPs) under the average-reward criterion. We focus on +Q-learning algorithms based on relative value iteration (RVI), which are +model-free stochastic analogues of the classical RVI method for average-reward +MDPs. These algorithms have low per-iteration complexity, making them +well-suited for large state space problems. We extend the almost-sure +convergence analysis of RVI Q-learning algorithms developed by Abounadi, +Bertsekas, and Borkar (2001) from unichain to weakly communicating MDPs. This +extension is important both practically and theoretically: weakly communicating +MDPs cover a much broader range of applications compared to unichain MDPs, and +their optimality equations have a richer solution structure (with multiple +degrees of freedom), introducing additional complexity in proving algorithmic +convergence. We also characterize the sets to which RVI Q-learning algorithms +converge, showing that they are compact, connected, potentially nonconvex, and +comprised of solutions to the average-reward optimality equation, with exactly +one less degree of freedom than the general solution set of this equation. +Furthermore, we extend our analysis to two RVI-based hierarchical +average-reward RL algorithms using the options framework, proving their +almost-sure convergence and characterizing their sets of convergence under the +assumption that the underlying semi-Markov decision process is weakly +communicating. + +
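For orientation, the tabular RVI Q-learning update analyzed here shifts the usual Q-learning target by a reference value f(Q), commonly the Q-value at a fixed reference state-action pair, so that the iterates track relative values and f(Q) tracks the optimal gain. The two-state MDP, exploration rule, and step-size schedule in this sketch are toy choices, not the paper's setting.

```python
# Tabular RVI Q-learning: the Q-learning target is shifted by a reference
# value f(Q) (here Q at a fixed reference state-action pair) so that Q tracks
# relative values under the average-reward criterion. Toy environment only.
import numpy as np

def rvi_q_learning(n_states, n_actions, step, sample, ref=(0, 0), steps=50_000):
    Q = np.zeros((n_states, n_actions))
    s = 0
    for t in range(steps):
        a = np.random.randint(n_actions)          # simple exploratory behaviour
        r, s_next = sample(s, a)
        target = r - Q[ref] + Q[s_next].max()     # f(Q) = Q[ref] is subtracted
        Q[s, a] += step(t) * (target - Q[s, a])
        s = s_next
    return Q, Q[ref]                              # Q[ref] estimates the optimal gain

# Toy 2-state MDP: action 1 tends to move to state 1, which pays more reward.
def sample(s, a):
    s_next = np.random.choice(2, p=[0.2, 0.8] if a == 1 else [0.8, 0.2])
    return float(s_next == 1), s_next

Q, gain = rvi_q_learning(2, 2, step=lambda t: 5.0 / (t + 10), sample=sample)
print(np.round(Q, 2), round(gain, 2))
```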
+
+
+
+
+ + ☆ Evaluating Time-Series Training Dataset through Lens of Spectrum in Deep + State Space Models + + +
+ This study investigates a method to evaluate time-series datasets in terms of
+the performance of deep neural networks (DNNs) with state space models (deep
+SSMs) trained on the dataset. SSMs have attracted attention as components
+inside DNNs to address time-series data. Since deep SSMs have powerful
+representation capacities, training datasets play a crucial role in solving a
+new task. However, the effectiveness of training datasets cannot be known until
+deep SSMs are actually trained on them. This can increase the cost of data
+collection for new tasks, as a trial-and-error process of data collection and
+time-consuming training are needed to achieve the necessary performance. To
+advance the practical use of deep SSMs, a metric that evaluates datasets and
+estimates performance early in training can be one key element. To this end, we
+introduce the concept of data evaluation methods used in system identification.
+In system identification of linear dynamical systems, the effectiveness of
+datasets is evaluated by using the spectrum of input signals. We introduce this
+concept to deep SSMs, which are nonlinear dynamical systems. We propose the
+K-spectral metric, which is the sum of the top-K spectra of signals inside deep
+SSMs, by focusing on the fact that each layer of a deep SSM can be regarded as
+a linear dynamical system. Our experiments show that the K-spectral metric has
+a large absolute value of the correlation coefficient with the performance and
+can be used to evaluate the quality of training datasets.
+
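A hedged sketch of a K-spectral style computation: take the magnitude spectrum of each signal observed inside the model's layers and sum the K largest components. The use of a plain FFT on mean-removed 1-D signals and the choice of K below are assumptions for illustration; the paper defines the metric on the signals inside a trained deep SSM.

```python
# Hedged sketch of a K-spectral style metric: sum the K largest magnitude
# components of the spectra of signals observed inside the model's layers.
# (Exact normalisation and choice of internal signals follow the paper.)
import numpy as np

def k_spectral_metric(signals, k=10):
    """signals: iterable of 1-D arrays (e.g. per-layer input sequences)."""
    total = 0.0
    for sig in signals:
        spectrum = np.abs(np.fft.rfft(sig - np.mean(sig)))
        total += np.sort(spectrum)[-k:].sum()    # top-K spectral magnitudes
    return total

rng = np.random.default_rng(0)
t = np.linspace(0, 1, 512, endpoint=False)
rich = [np.sin(2 * np.pi * f * t) for f in (3, 17, 40)]      # spectrally rich data
flat = [rng.normal(scale=0.05, size=512) for _ in range(3)]  # weak, noisy data
print(k_spectral_metric(rich), k_spectral_metric(flat))
```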
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Coalitions of AI-based Methods Predict 15-Year Risks of Breast Cancer + Metastasis Using Real-World Clinical Data with AUC up to 0.9 + + +
+ Breast cancer is one of the two cancers responsible for the most deaths in
+women, with about 42,000 deaths each year in the US. That there are over
+300,000 breast cancers newly diagnosed each year suggests that only a fraction
+of the cancers result in mortality. Thus, most of the women undergo seemingly
+curative treatment for localized cancers, but a significant fraction later
+succumb to metastatic disease, for which current treatments are only
+temporizing for the vast majority. The current prognostic metrics are of little
+actionable value for 4 of the 5 women seemingly cured after local treatment,
+and many women are exposed to morbid and even mortal adjuvant therapies
+unnecessarily, with these adjuvant therapies reducing metastatic recurrence by
+only a third. Thus, there is a need for better prognostics to target aggressive
+treatment at those who are likely to relapse and spare those who were actually
+cured. While there is a plethora of molecular and tumor-marker assays in use
+and under development to detect recurrence early, these are time-consuming,
+expensive and still often un-validated as to actionable prognostic utility. A
+different approach would use large data techniques to determine clinical and
+histopathological parameters that would provide accurate prognostics using
+existing data. Herein, we report on machine learning, together with grid search
+and Bayesian Networks, to develop algorithms that present an AUC of up to 0.9
+in ROC analyses, using only extant data. Such algorithms could be rapidly
+translated to clinical management as they do not require testing beyond routine
+tumor evaluations.
+
+
+
+
+
+ + ☆ Iterated Energy-based Flow Matching for Sampling from Boltzmann + Densities + + +
+ In this work, we consider the problem of training a generator from +evaluations of energy functions or unnormalized densities. This is a +fundamental problem in probabilistic inference, which is crucial for scientific +applications such as learning the 3D coordinate distribution of a molecule. To +solve this problem, we propose iterated energy-based flow matching (iEFM), the +first off-policy approach to train continuous normalizing flow (CNF) models +from unnormalized densities. We introduce the simulation-free energy-based flow +matching objective, which trains the model to predict the Monte Carlo +estimation of the marginal vector field constructed from known energy +functions. Our framework is general and can be extended to variance-exploding +(VE) and optimal transport (OT) conditional probability paths. We evaluate iEFM +on a two-dimensional Gaussian mixture model (GMM) and an eight-dimensional +four-particle double-well potential (DW-4) energy function. Our results +demonstrate that iEFM outperforms existing methods, showcasing its potential +for efficient and scalable probabilistic modeling in complex high-dimensional +systems. + +
+
+
+
+
+ + ☆ PACiM: A Sparsity-Centric Hybrid Compute-in-Memory Architecture via + Probabilistic Approximation + + +
+ Approximate computing emerges as a promising approach to enhance the +efficiency of compute-in-memory (CiM) systems in deep neural network +processing. However, traditional approximate techniques often significantly +trade off accuracy for power efficiency, and fail to reduce data transfer +between main memory and CiM banks, which dominates power consumption. This +paper introduces a novel probabilistic approximate computation (PAC) method +that leverages statistical techniques to approximate multiply-and-accumulation +(MAC) operations, reducing approximation error by 4X compared to existing +approaches. PAC enables efficient sparsity-based computation in CiM systems by +simplifying complex MAC vector computations into scalar calculations. Moreover, +PAC enables sparsity encoding and eliminates the LSB activations transmission, +significantly reducing data reads and writes. This sets PAC apart from +traditional approximate computing techniques, minimizing not only computation +power but also memory accesses by 50%, thereby boosting system-level +efficiency. We developed PACiM, a sparsity-centric architecture that fully +exploits sparsity to reduce bit-serial cycles by 81% and achieves a peak 8b/8b +efficiency of 14.63 TOPS/W in 65 nm CMOS while maintaining high accuracy of +93.85/72.36/66.02% on CIFAR-10/CIFAR-100/ImageNet benchmarks using a ResNet-18 +model, demonstrating the effectiveness of our PAC methodology. + +
+
+
+
+
+ + ☆ Large-Scale Multi-omic Biosequence Transformers for Modeling + Peptide-Nucleotide Interactions + + +
+ The transformer architecture has revolutionized bioinformatics and driven +progress in the understanding and prediction of the properties of biomolecules. +Almost all research on large-scale biosequence transformers has focused on one +domain at a time (single-omic), usually nucleotides or peptides. These models +have seen incredible success in downstream tasks in each domain and have +achieved particularly noteworthy breakthroughs in sequences of peptides and +structural modeling. However, these single-omic models are naturally incapable +of modeling multi-omic tasks, one of the most biologically critical being +nucleotide-peptide interactions. + We present our work training the first multi-omic nucleotide-peptide +foundation models. We show that these multi-omic models (MOMs) can learn joint +representations between various single-omic distributions that are emergently +consistent with the Central Dogma of molecular biology, despite only being +trained on unlabeled biosequences. We further demonstrate that MOMs can be +fine-tuned to achieve state-of-the-art results on peptide-nucleotide +interaction tasks, namely predicting the change in Gibbs free energy +({\Delta}G) of the binding interaction between a given oligonucleotide and +peptide, as well as the effect on this binding interaction due to mutations in +the oligonucleotide sequence ({\Delta}{\Delta}G). + Remarkably, we show that multi-omic biosequence transformers emergently learn +useful structural information without any prior structural training, allowing +us to predict which peptide residues are most involved in the +peptide-nucleotide binding interaction. Lastly, we provide evidence that +multi-omic biosequence models are non-inferior to foundation models trained on +single-omics distributions, suggesting a more generalized or foundational +approach to building these models. + +
+
+ comment: 27 pages, 5 figures +
+
+
+
+
+ + ☆ Enhancing Conditional Image Generation with Explainable Latent Space + Manipulation + + +
+ In the realm of image synthesis, achieving fidelity to a reference image +while adhering to conditional prompts remains a significant challenge. This +paper proposes a novel approach that integrates a diffusion model with latent +space manipulation and gradient-based selective attention mechanisms to address +this issue. Leveraging Grad-SAM (Gradient-based Selective Attention +Manipulation), we analyze the cross attention maps of the cross attention +layers and gradients for the denoised latent vector, deriving importance scores +of elements of denoised latent vector related to the subject of interest. Using +this information, we create masks at specific timesteps during denoising to +preserve subjects while seamlessly integrating the reference image features. +This approach ensures the faithful formation of subjects based on conditional +prompts, while concurrently refining the background for a more coherent +composition. Our experiments on places365 dataset demonstrate promising +results, with our proposed model achieving the lowest mean and median Frechet +Inception Distance (FID) scores compared to baseline models, indicating +superior fidelity preservation. Furthermore, our model exhibits competitive +performance in aligning the generated images with provided textual +descriptions, as evidenced by high CLIP scores. These results highlight the +effectiveness of our approach in both fidelity preservation and textual context +preservation, offering a significant advancement in text-to-image synthesis +tasks. + +
+
+ comment: 7 pages , 5 figures +
+
+
+
+
+ + ☆ Policy Adaptation via Language Optimization: Decomposing Tasks for + Few-Shot Imitation + + +
+ Learned language-conditioned robot policies often struggle to effectively
+adapt to new real-world tasks even when pre-trained across a diverse set of
+instructions. We propose a novel approach for few-shot adaptation to unseen
+tasks that exploits the semantic understanding of task decomposition provided
+by vision-language models (VLMs). Our method, Policy Adaptation via Language
+Optimization (PALO), combines a handful of demonstrations of a task with
+proposed language decompositions sampled from a VLM to enable rapid
+nonparametric adaptation, avoiding the need for a larger fine-tuning dataset.
+We evaluate PALO on extensive real-world experiments consisting of challenging
+unseen, long-horizon robot manipulation tasks. We find that PALO is able to
+consistently complete long-horizon, multi-tier tasks in the real world,
+outperforming state-of-the-art pre-trained generalist policies and methods
+that have access to the same demonstrations.
+
+
+ comment: 27 pages, 14 figures +
+
+
+
+
+ + ☆ Targeted Cause Discovery with Data-Driven Learning + + +
+ We propose a novel machine learning approach for inferring causal variables +of a target variable from observations. Our goal is to identify both direct and +indirect causes within a system, thereby efficiently regulating the target +variable when the difficulty and cost of intervening on each causal variable +vary. Our method employs a neural network trained to identify causality through +supervised learning on simulated data. By implementing a local-inference +strategy, we achieve linear complexity with respect to the number of variables, +efficiently scaling up to thousands of variables. Empirical results demonstrate +the effectiveness of our method in identifying causal relationships within +large-scale gene regulatory networks, outperforming existing causal discovery +methods that primarily focus on direct causality. We validate our model's +generalization capability across novel graph structures and generating +mechanisms, including gene regulatory networks of E. coli and the human K562 +cell line. Implementation codes are available at +https://github.com/snu-mllab/Targeted-Cause-Discovery. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Adversarial Network Optimization under Bandit Feedback: Maximizing + Utility in Non-Stationary Multi-Hop Networks + + +
+ Stochastic Network Optimization (SNO) concerns scheduling in stochastic +queueing systems. It has been widely studied in network theory. Classical SNO +algorithms require network conditions to be stationary with time, which fails +to capture the non-stationary components in many real-world scenarios. Many +existing algorithms also assume knowledge of network conditions before +decision, which rules out applications where unpredictability presents. + Motivated by these issues, we consider Adversarial Network Optimization (ANO) +under bandit feedback. Specifically, we consider the task of *i)* maximizing +some unknown and time-varying utility function associated to scheduler's +actions, where *ii)* the underlying network is a non-stationary multi-hop one +whose conditions change arbitrarily with time, and *iii)* only bandit feedback +(effect of actually deployed actions) is revealed after decisions. Our proposed +`UMO2` algorithm ensures network stability and also matches the utility +maximization performance of any "mildly varying" reference policy up to a +polynomially decaying gap. To our knowledge, no previous ANO algorithm handled +multi-hop networks or achieved utility guarantees under bandit feedback, +whereas ours can do both. + Technically, our method builds upon a novel integration of online learning +into Lyapunov analyses: To handle complex inter-dependencies among queues in +multi-hop networks, we propose meticulous techniques to balance online learning +and Lyapunov arguments. To tackle the learning obstacles due to potentially +unbounded queue sizes, we design a new online linear optimization algorithm +that automatically adapts to loss magnitudes. To maximize utility, we propose a +bandit convex optimization algorithm with novel queue-dependent learning rate +scheduling that suites drastically varying queue lengths. Our new insights in +online learning can be of independent interest. + +
+
+
+
+
+ + ☆ The Application of Machine Learning in Tidal Evolution Simulation of + Star-Planet Systems + + +
+ With the release of a large amount of astronomical data, an increasing number +of close-in hot Jupiters have been discovered. Calculating their evolutionary +curves using star-planet interaction models presents a challenge. To expedite +the generation of evolutionary curves for these close-in hot Jupiter systems, +we utilized tidal interaction models established on MESA to create 15,745 +samples of star-planet systems and 7,500 samples of stars. Additionally, we +employed a neural network (Multi-Layer Perceptron - MLP) to predict the +evolutionary curves of the systems, including stellar effective temperature, +radius, stellar rotation period, and planetary orbital period. The median +relative errors of the predicted evolutionary curves were found to be 0.15%, +0.43%, 2.61%, and 0.57%, respectively. Furthermore, the speed at which we +generate evolutionary curves exceeds that of model-generated curves by more +than four orders of magnitude. We also extracted features of planetary +migration states and utilized lightGBM to classify the samples into 6 +categories for prediction. We found that by combining three types that undergo +long-term double synchronization into one label, the classifier effectively +recognized these features. Apart from systems experiencing long-term double +synchronization, the median relative errors of the predicted evolutionary +curves were all below 4%. Our work provides an efficient method to save +significant computational resources and time with minimal loss in accuracy. +This research also lays the foundation for analyzing the evolutionary +characteristics of systems under different migration states, aiding in the +understanding of the underlying physical mechanisms of such systems. Finally, +to a large extent, our approach could replace the calculations of theoretical +models. + +
+
+
+
+
+ + ☆ ReXamine-Global: A Framework for Uncovering Inconsistencies in Radiology + Report Generation Metrics + + +
+ Given the rapidly expanding capabilities of generative AI models for +radiology, there is a need for robust metrics that can accurately measure the +quality of AI-generated radiology reports across diverse hospitals. We develop +ReXamine-Global, a LLM-powered, multi-site framework that tests metrics across +different writing styles and patient populations, exposing gaps in their +generalization. First, our method tests whether a metric is undesirably +sensitive to reporting style, providing different scores depending on whether +AI-generated reports are stylistically similar to ground-truth reports or not. +Second, our method measures whether a metric reliably agrees with experts, or +whether metric and expert scores of AI-generated report quality diverge for +some sites. Using 240 reports from 6 hospitals around the world, we apply +ReXamine-Global to 7 established report evaluation metrics and uncover serious +gaps in their generalizability. Developers can apply ReXamine-Global when +designing new report evaluation metrics, ensuring their robustness across +sites. Additionally, our analysis of existing metrics can guide users of those +metrics towards evaluation procedures that work reliably at their sites of +interest. + +
+
+
+
+
+ + ☆ Revisit Micro-batch Clipping: Adaptive Data Pruning via Gradient + Manipulation + + +
+ Micro-batch clipping, a gradient clipping method, has recently shown +potential in enhancing auto-speech recognition (ASR) model performance. +However, the underlying mechanism behind this improvement remains mysterious, +particularly the observation that only certain micro-batch sizes are +beneficial. In this paper, we make the first attempt to explain this +phenomenon. Inspired by recent data pruning research, we assume that specific +training samples may impede model convergence during certain training phases. +Under this assumption, the convergence analysis shows that micro-batch clipping +can improve the convergence rate asymptotically at the cost of an additional +constant bias that does not diminish with more training iterations. The bias is +dependent on a few factors and can be minimized at specific micro-batch size, +thereby elucidating the existence of the sweet-spot micro-batch size observed +previously. We also verify the effectiveness of micro-batch clipping beyond +speech models on vision and language models, and show promising performance +gains in these domains. An exploration of potential limitations shows that +micro-batch clipping is less effective when training data originates from +multiple distinct domains. + +
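The mechanism under study can be written down in a few lines: the mini-batch is split into micro-batches, each micro-batch gradient is clipped to a fixed L2 norm, and the clipped gradients are averaged before the optimizer step. The toy linear model, clipping bound, and micro-batch count below are illustrative choices, not the ASR training setup used in the paper.

```python
# Micro-batch clipping in a toy PyTorch form: split the mini-batch, clip each
# micro-batch gradient to a fixed L2 norm, average, then take one optimizer step.
import torch
import torch.nn as nn

model = nn.Linear(10, 1)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.MSELoss()

x, y = torch.randn(64, 10), torch.randn(64, 1)
micro_batches = zip(x.chunk(8), y.chunk(8))          # 8 micro-batches of size 8
clip_norm = 1.0

accum = [torch.zeros_like(p) for p in model.parameters()]
for xb, yb in micro_batches:
    model.zero_grad()
    loss_fn(model(xb), yb).backward()
    grads = [p.grad.detach().clone() for p in model.parameters()]
    norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))
    scale = torch.clamp(clip_norm / (norm + 1e-12), max=1.0)   # per-micro-batch clip
    for a, g in zip(accum, grads):
        a += scale * g / 8

for p, a in zip(model.parameters(), accum):           # apply averaged gradient
    p.grad = a
opt.step()
```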
+
+
+
+
+ + ☆ Short-Term Electricity-Load Forecasting by Deep Learning: A + Comprehensive Survey + + +
+ Short-Term Electricity-Load Forecasting (STELF) refers to the prediction of +the immediate demand (in the next few hours to several days) for the power +system. Various external factors, such as weather changes and the emergence of +new electricity consumption scenarios, can impact electricity demand, causing +load data to fluctuate and become non-linear, which increases the complexity +and difficulty of STELF. In the past decade, deep learning has been applied to +STELF, modeling and predicting electricity demand with high accuracy, and +contributing significantly to the development of STELF. This paper provides a +comprehensive survey on deep-learning-based STELF over the past ten years. It +examines the entire forecasting process, including data pre-processing, feature +extraction, deep-learning modeling and optimization, and results evaluation. +This paper also identifies some research challenges and potential research +directions to be further investigated in future work. + +
+
+
+
+
+ + ☆ Uni-3DAD: GAN-Inversion Aided Universal 3D Anomaly Detection on + Model-free Products + + +
+ Anomaly detection is a long-standing challenge in manufacturing systems. +Traditionally, anomaly detection has relied on human inspectors. However, 3D +point clouds have gained attention due to their robustness to environmental +factors and their ability to represent geometric data. Existing 3D anomaly +detection methods generally fall into two categories. One compares scanned 3D +point clouds with design files, assuming these files are always available. +However, such assumptions are often violated in many real-world applications +where model-free products exist, such as fresh produce (i.e., ``Cookie", +``Potato", etc.), dentures, bone, etc. The other category compares patches of +scanned 3D point clouds with a library of normal patches named memory bank. +However, those methods usually fail to detect incomplete shapes, which is a +fairly common defect type (i.e., missing pieces of different products). The +main challenge is that missing areas in 3D point clouds represent the absence +of scanned points. This makes it infeasible to compare the missing region with +existing point cloud patches in the memory bank. To address these two +challenges, we proposed a unified, unsupervised 3D anomaly detection framework +capable of identifying all types of defects on model-free products. Our method +integrates two detection modules: a feature-based detection module and a +reconstruction-based detection module. Feature-based detection covers geometric +defects, such as dents, holes, and cracks, while the reconstruction-based +method detects missing regions. Additionally, we employ a One-class Support +Vector Machine (OCSVM) to fuse the detection results from both modules. The +results demonstrate that (1) our proposed method outperforms the +state-of-the-art methods in identifying incomplete shapes and (2) it still +maintains comparable performance with the SOTA methods in detecting all other +types of anomalies. + +
+
+
+
+
+ + ☆ Variational Mode-Driven Graph Convolutional Network for Spatiotemporal + Traffic Forecasting + + +
+ This paper focuses on spatio-temporal (ST) traffic prediction using +graph neural networks. Given that ST data consists of non-stationary and +complex time events, interpreting and predicting such trends is comparatively +complicated. Representation of ST data in modes helps us infer behavior and +assess the impact of noise on prediction applications. We propose a framework +that decomposes ST data into modes using the variational mode decomposition +(VMD) method; the resulting modes are then fed into the neural network for forecasting future +states. This hybrid approach is known as a variational mode graph convolutional +network (VMGCN). Instead of exhaustively searching for the number of modes, +they are determined using the reconstruction loss from the real-time +application data. We also study the significance of each mode and the impact of +bandwidth constraints on different horizon predictions in traffic flow data. We +evaluate the performance of our proposed network on the LargeST dataset for +both short- and long-term predictions. Our framework yields better results +compared to state-of-the-art methods. + +
+
+ comment: IEEE Transactions on Intelligent Transportation Systems Submission, + 2024 +
+
+
+
+
+ + ☆ A More Unified Theory of Transfer Learning + + +
+ We show that some basic moduli of continuity $\delta$ -- which measure how +fast target risk decreases as source risk decreases -- appear to be at the root +of many of the classical relatedness measures in transfer learning and related +literature. Namely, bounds in terms of $\delta$ recover many of the existing +bounds in terms of other measures of relatedness -- both in regression and +classification -- and can at times be tighter. + We are particularly interested in general situations where the learner has +access to both source data and some or no target data. The unified perspective +allowed by the moduli $\delta$ allows us to extend many existing notions of +relatedness at once to these scenarios involving target data: interestingly, +while $\delta$ itself might not be efficiently estimated, adaptive procedures +exist -- based on reductions to confidence sets -- which can get nearly tight +rates in terms of $\delta$ with no prior distributional knowledge. Such +adaptivity to unknown $\delta$ immediately implies adaptivity to many classical +relatedness notions, in terms of combined source and target sample sizes. + +
+
+
+
+
+ + ☆ Real-Time Energy Pricing in New Zealand: An Evolving Stream Analysis PRICAI + + +
+ This paper introduces a group of novel datasets representing real-time +time-series and streaming data of energy prices in New Zealand, sourced from +the Electricity Market Information (EMI) website maintained by the New Zealand +government. The datasets are intended to address the scarcity of proper +datasets for streaming regression learning tasks. We conduct extensive analyses +and experiments on these datasets, covering preprocessing techniques, +regression tasks, prediction intervals, concept drift detection, and anomaly +detection. Our experiments demonstrate the datasets' utility and highlight the +challenges and opportunities for future research in energy price forecasting. + +
+
+ comment: 12 Pages, 8 figures, short version accepted by PRICAI +
+
+
+
+
+ + ☆ Single-Loop Deterministic and Stochastic Interior-Point Algorithms for + Nonlinearly Constrained Optimization + + +
+ An interior-point algorithm framework is proposed, analyzed, and tested for +solving nonlinearly constrained continuous optimization problems. The main +setting of interest is when the objective and constraint functions may be +nonlinear and/or nonconvex, and when constraint values and derivatives are +tractable to compute, but objective function values and derivatives can only be +estimated. The algorithm is intended primarily for a setting similar to that of +stochastic-gradient methods for unconstrained optimization, namely, the +setting when stochastic-gradient estimates are available and employed in place +of gradients of the objective, and when no objective function values (nor +estimates of them) are employed. This is achieved by the interior-point +framework having a single-loop structure rather than the nested-loop structure +that is typical of contemporary interior-point methods. For completeness, +convergence guarantees for the framework are provided for both deterministic +and stochastic settings. Numerical experiments show that the algorithm yields +good performance on a large set of test problems. + +
+
+
+
+
+ + ☆ A Minibatch-SGD-Based Learning Meta-Policy for Inventory Systems with + Myopic Optimal Policy + + +
+ Stochastic gradient descent (SGD) has proven effective in solving many +inventory control problems with demand learning. However, it often faces the +pitfall of an infeasible target inventory level that is lower than the current +inventory level. Several recent works (e.g., Huh and Rusmevichientong (2009), +Shi et al. (2016)) have successfully resolved this issue in various inventory +systems. However, their techniques are rather sophisticated and difficult to +apply to more complicated scenarios such as multi-product and +multi-constraint inventory systems. + In this paper, we address the infeasible-target-inventory-level issue from a +new technical perspective -- we propose a novel minibatch-SGD-based +meta-policy. Our meta-policy is flexible enough to be applied to a general +inventory systems framework covering a wide range of inventory management +problems with a myopic clairvoyant optimal policy. By devising the optimal +minibatch scheme, our meta-policy achieves a regret bound of +$\mathcal{O}(\sqrt{T})$ for the general convex case and $\mathcal{O}(\log T)$ +for the strongly convex case. To demonstrate the power and flexibility of our +meta-policy, we apply it to three important inventory control problems: +multi-product and multi-constraint systems, multi-echelon serial systems, and +one-warehouse and multi-store systems by carefully designing +application-specific subroutines. We also conduct extensive numerical +experiments to demonstrate that our meta-policy enjoys competitive regret +performance, high computational efficiency, and low variance across a wide +range of applications. + +
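As a toy illustration of the minibatch-SGD idea (not the paper's meta-policy), the sketch below runs minibatch SGD on a single newsvendor-style base-stock level, using the subgradient of the holding/backlog cost averaged over each minibatch; the batch size, step size, and cost parameters are illustrative assumptions.

```python
# Hedged newsvendor sketch: minibatch SGD on an order-up-to level s.
import numpy as np

def minibatch_sgd_base_stock(demands, s0=5.0, batch=8, lr=0.1, h=1.0, b=3.0):
    """One pass of minibatch SGD on the cost h*(s-D)^+ + b*(D-s)^+."""
    s = s0
    for start in range(0, len(demands), batch):
        d = demands[start:start + batch]
        grad = np.mean(np.where(s >= d, h, -b))  # minibatch subgradient of the cost
        s = max(0.0, s - lr * grad)              # keep the target level feasible
    return s

rng = np.random.default_rng(1)
print(minibatch_sgd_base_stock(rng.poisson(10, size=200)))
```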
+
+ comment: Forthcoming in Management Science +
+
+
+
+
+ + ☆ Different Victims, Same Layout: Email Visual Similarity Detection for + Enhanced Email Protection CCS 2024 + + +
+ In the pursuit of an effective spam detection system, the focus has often +been on identifying known spam patterns either through rule-based detection +systems or machine learning (ML) solutions. However, both systems are +susceptible to evasion techniques and zero-day attacks that can be achieved at +low cost. Therefore, an email that bypassed the defense system once can do it +again in the following days, even though rules are updated or the ML models are +retrained. The recurrence of failures to detect emails that exhibit layout +similarities to previously undetected spam is concerning for customers and can +erode their trust in a company. Our observations show that threat actors reuse +email kits extensively and can bypass detection with little effort, for +example, by making changes to the content of emails. In this work, we propose +an email visual similarity detection approach, named Pisco, to improve the +detection capabilities of an email threat defense system. We apply our proof of +concept to some real-world samples received from different sources. Our results +show that email kits are being reused extensively and visually similar emails +are sent to our customers at various time intervals. Therefore, this method +could be very helpful in situations where detection features that rely on +contextual information and keywords are bypassed, an occurrence our +observations show happens frequently. + +
+
+ comment: To be published in the proceedings of the ACM Conference on Computer + and Communications Security (ACM CCS 2024) +
+
+
+
+
+ + ☆ FlowRetrieval: Flow-Guided Data Retrieval for Few-Shot Imitation + Learning + + +
+ Few-shot imitation learning relies on only a small number of task-specific +demonstrations to efficiently adapt a policy for a given downstream task. +Retrieval-based methods come with a promise of retrieving relevant past +experiences to augment this target data when learning policies. However, +existing data retrieval methods fall under two extremes: they either rely on +the existence of exact behaviors with visually similar scenes in the prior +data, which is impractical to assume; or they retrieve based on semantic +similarity of high-level language descriptions of the task, which might not be +that informative about the shared low-level behaviors or motions across tasks, +which are often a more important factor for retrieving relevant data for policy +learning. In this work, we investigate how we can leverage motion similarity in +the vast amount of cross-task data to improve few-shot imitation learning of +the target task. Our key insight is that motion-similar data carries rich +information about the effects of actions and object interactions that can be +leveraged during few-shot adaptation. We propose FlowRetrieval, an approach +that leverages optical flow representations for both extracting similar motions +to target tasks from prior data, and for guiding learning of a policy that can +maximally benefit from such data. Our results show FlowRetrieval significantly +outperforms prior methods across simulated and real-world domains, achieving on +average a 27% higher success rate than the best retrieval-based prior method. In +the Pen-in-Cup task with a real Franka Emika robot, FlowRetrieval achieves 3.7x +the performance of the baseline imitation learning technique that learns from +all prior and target data. Website: https://flow-retrieval.github.io + +
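The retrieval step can be pictured with a small sketch: given precomputed flow embeddings for the target demonstrations and the prior dataset (the embeddings and their dimensionality below are assumptions, not the paper's actual flow encoder), prior clips are ranked by cosine similarity to the target-task centroid.

```python
# Hedged retrieval sketch over assumed optical-flow embeddings.
import numpy as np

def retrieve_by_flow(target_emb, prior_emb, k=5):
    """Indices of the k prior clips closest (cosine) to the target centroid."""
    t = target_emb.mean(axis=0)
    t = t / np.linalg.norm(t)
    p = prior_emb / np.linalg.norm(prior_emb, axis=1, keepdims=True)
    return np.argsort(-(p @ t))[:k]

rng = np.random.default_rng(0)
print(retrieve_by_flow(rng.normal(size=(10, 64)), rng.normal(size=(1000, 64))))
```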
+
+
+
+
+ + ☆ Efficient Transonic Aeroelastic Model Reduction Using Optimized Sparse + Multi-Input Polynomial Functionals + + +
+ Nonlinear aeroelastic reduced-order models (ROMs) based on machine learning +or artificial intelligence algorithms can be complex and computationally +demanding to train, meaning that for practical aeroelastic applications, the +conservative nature of linearization is often favored. Therefore, there is a +requirement for novel nonlinear aeroelastic model reduction approaches that are +accurate, simple and, most importantly, efficient to generate. This paper +proposes a novel formulation for the identification of a compact multi-input +Volterra series, where Orthogonal Matching Pursuit is used to obtain a set of +optimally sparse nonlinear multi-input ROM coefficients from unsteady +aerodynamic training data. The framework is exemplified using the Benchmark +Supercritical Wing, considering forced response, flutter, and limit cycle +oscillation. The simple and efficient Optimal Sparsity Multi-Input ROM +(OSM-ROM) framework performs with high accuracy compared to the full-order +aeroelastic model, requiring only a fraction of the tens-of-thousands of +possible multi-input terms to be identified and allowing a 96% reduction in the +number of training samples. + +
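A minimal sketch of the sparse-identification idea with scikit-learn's Orthogonal Matching Pursuit is shown below; the lag depth, term library, and toy response are illustrative stand-ins, not the Benchmark Supercritical Wing model.

```python
# Hedged sketch: pick a sparse subset of Volterra-style regressors with OMP.
import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit

def volterra_library(u, lags=3):
    """Linear lag terms plus their pairwise products as candidate regressors."""
    cols = [np.roll(u, k) for k in range(lags)]
    cols += [cols[i] * cols[j] for i in range(lags) for j in range(i, lags)]
    return np.column_stack(cols)[lags:]      # drop rows affected by roll wrap-around

rng = np.random.default_rng(0)
u = rng.normal(size=500)
y = 0.8 * u + 0.3 * u**2                      # toy "aerodynamic" response
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=4).fit(volterra_library(u), y[3:])
print(np.nonzero(omp.coef_)[0])               # indices of the retained terms
```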
+
+ comment: 24 pages, preprint, under review +
+
+
+
+
+ + ☆ Theoretical Insights into Overparameterized Models in Multi-Task and + Replay-Based Continual Learning + + +
+ Multi-task learning (MTL) is a machine learning paradigm that aims to improve +the generalization performance of a model on multiple related tasks by training +it simultaneously on those tasks. Unlike MTL, where the model has instant +access to the training data of all tasks, continual learning (CL) involves +adapting to new sequentially arriving tasks over time without forgetting the +previously acquired knowledge. Despite the wide practical adoption of CL and +MTL and extensive literature on both areas, there remains a gap in the +theoretical understanding of these methods when used with overparameterized +models such as deep neural networks. This paper studies the overparameterized +linear models as a proxy for more complex models. We develop theoretical +results describing the effect of various system parameters on the model's +performance in an MTL setup. Specifically, we study the impact of model size, +dataset size, and task similarity on the generalization error and knowledge +transfer. Additionally, we present theoretical results to characterize the +performance of replay-based CL models. Our results reveal the impact of buffer +size and model capacity on the forgetting rate in a CL setup and help shed +light on some of the state-of-the-art CL methods. Finally, through extensive +empirical evaluations, we demonstrate that our theoretical findings are also +applicable to deep neural networks, offering valuable guidance for designing +MTL and CL models in practice. + +
+
+ comment: 41 pages, 21 figures +
+
+
+
+
+ + ☆ AI-driven Reverse Engineering of QML Models + + +
+ Quantum machine learning (QML) is a rapidly emerging area of research, driven +by the capabilities of Noisy Intermediate-Scale Quantum (NISQ) devices. With +the progress in the research of QML models, there is a rise in third-party +quantum cloud services to cater to the increasing demand for resources. New +security concerns surface, specifically regarding the protection of +intellectual property (IP) from untrustworthy service providers. One of the +most pressing risks is the potential for reverse engineering (RE) by malicious +actors who may steal proprietary quantum IPs such as trained parameters and QML +architecture, modify them to remove additional watermarks or signatures and +re-transpile them for other quantum hardware. Prior work presents a brute force +approach to RE the QML parameters which takes exponential time overhead. In +this paper, we introduce an autoencoder-based approach to extract the +parameters from transpiled QML models deployed on untrusted third-party +vendors. We experiment on multi-qubit classifiers and note that they can be +reverse-engineered under restricted conditions with a mean error of order +10^-1. The amount of time taken to prepare the dataset and train the model to +reverse engineer the QML circuit being of the order 10^3 seconds (which is +10^2x better than the previously reported value for 4-layered 4-qubit +classifiers) makes the threat of RE highly potent, underscoring the need for +continued development of effective defenses. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Analyzing Inference Privacy Risks Through Gradients in Machine Learning + + +
+ In distributed learning settings, models are iteratively updated with shared +gradients computed from potentially sensitive user data. While previous work +has studied various privacy risks of sharing gradients, our paper aims to +provide a systematic approach to analyze private information leakage from +gradients. We present a unified game-based framework that encompasses a broad +range of attacks including attribute, property, distributional, and user +disclosures. We investigate how different uncertainties of the adversary affect +their inferential power via extensive experiments on five datasets across +various data modalities. Our results demonstrate the inefficacy of solely +relying on data aggregation to achieve privacy against inference attacks in +distributed learning. We further evaluate five types of defenses, namely, +gradient pruning, signed gradient descent, adversarial perturbations, +variational information bottleneck, and differential privacy, under both static +and adaptive adversary settings. We provide an information-theoretic view for +analyzing the effectiveness of these defenses against inference from gradients. +Finally, we introduce a method for auditing attribute inference privacy, +improving the empirical estimation of worst-case privacy through crafting +adversarial canary records. + +
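Two of the defenses evaluated above are easy to sketch; the snippet below shows gradient pruning (keep only the largest-magnitude entries) and signed gradient descent (share only coordinate signs), with the pruning ratio as an illustrative choice.

```python
# Hedged sketch of two gradient-sharing defenses discussed in the abstract.
import numpy as np

def prune_gradient(g, keep_ratio=0.1):
    """Zero out all but the top keep_ratio fraction of entries by magnitude."""
    k = max(1, int(keep_ratio * g.size))
    thresh = np.partition(np.abs(g).ravel(), -k)[-k]
    return np.where(np.abs(g) >= thresh, g, 0.0)

def sign_gradient(g):
    """Signed gradient descent: only the sign of each coordinate is shared."""
    return np.sign(g)

g = np.random.default_rng(0).normal(size=(4, 5))
print(prune_gradient(g))
print(sign_gradient(g))
```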
+
+
+
+
+ + ☆ DLFormer: Enhancing Explainability in Multivariate Time Series + Forecasting using Distributed Lag Embedding + + +
+ Most real-world variables are multivariate time series influenced by past +values and explanatory factors. Consequently, research on predicting these time series +using artificial intelligence is ongoing. In particular, in fields such as +healthcare and finance, where reliability is crucial, having understandable +explanations for predictions is essential. However, achieving a balance between +high prediction accuracy and intuitive explainability has proven challenging. +Although attention-based models can capture temporal dependencies in time +series prediction, they have limitations in representing the individual +influence of each variable and the magnitude of that influence. To address this +issue, this study introduced DLFormer, an +attention-based architecture integrated with distributed lag embedding, to +temporally embed individual variables and capture their temporal influence. +Through validation against various real-world datasets, DLFormer showcased +superior performance improvements compared to existing attention-based +high-performance models. Furthermore, comparing the relationships between +variables enhanced the reliability of explainability. + +
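As a rough illustration of what a distributed-lag embedding might look like (the lag count and tensor layout below are assumptions, not the paper's architecture), each variable is expanded into its own block of lagged copies so a downstream attention model can attribute influence per variable and per lag.

```python
# Hedged sketch of a per-variable lag embedding for a multivariate series.
import numpy as np

def distributed_lag_embed(x, lags=3):
    """x: (T, V) series -> (T - lags, V, lags) tensor of lagged copies."""
    T, _ = x.shape
    out = np.stack([x[lags - k - 1:T - k - 1] for k in range(lags)], axis=-1)
    return out    # out[t, v, k] holds variable v lagged by k + 1 steps

x = np.arange(20, dtype=float).reshape(10, 2)
print(distributed_lag_embed(x).shape)         # (7, 2, 3)
```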
+
+
+
+
+ + ☆ Exploring Multiple Strategies to Improve Multilingual Coreference + Resolution in CorefUD + + +
+ Coreference resolution, the task of identifying expressions in text that +refer to the same entity, is a critical component in various natural language +processing (NLP) applications. This paper presents our end-to-end neural +coreference resolution system, utilizing the CorefUD 1.1 dataset, which spans +17 datasets across 12 languages. We first establish strong baseline models, +including monolingual and cross-lingual variations, and then propose several +extensions to enhance performance across diverse linguistic contexts. These +extensions include cross-lingual training, incorporation of syntactic +information, a Span2Head model for optimized headword prediction, and advanced +singleton modeling. We also experiment with headword span representation and +long-document modeling through overlapping segments. The proposed extensions, +particularly the heads-only approach, singleton modeling, and long-document +prediction, significantly improve performance across most datasets. We also +perform zero-shot cross-lingual experiments, highlighting the potential and +limitations of cross-lingual transfer in coreference resolution. Our findings +contribute to the development of robust and scalable coreference systems for +multilingual coreference resolution. Finally, we evaluate our model on the CorefUD +1.1 test set and surpass the best model of comparable size from the CRAC 2023 shared task +by a large margin. Our model is available on GitHub: +\url{https://github.com/ondfa/coref-multiling} + +
+
+
+
+
+ + ☆ Tex-ViT: A Generalizable, Robust, Texture-based dual-branch + cross-attention deepfake detector + + +
+ Deepfakes, which employ GANs to produce highly realistic facial modifications, +are widely regarded as the prevailing manipulation method. Traditional CNNs have been able to +identify bogus media, but they struggle to perform well on different datasets +and are vulnerable to adversarial attacks due to their lack of robustness. +Vision transformers have demonstrated potential in the realm of image +classification problems, but they require sufficient training data. Motivated by +these limitations, this publication introduces Tex-ViT (Texture-Vision +Transformer), which enhances CNN features by combining ResNet with a vision +transformer. The model combines traditional ResNet features with a texture +module that operates in parallel on sections of ResNet before each +down-sampling operation. The texture module then serves as an input to the dual +branch of the cross-attention vision transformer. It specifically focuses on +improving the global texture module, which extracts feature map correlations. +Empirical analysis reveals that fake images exhibit smooth textures that do not +remain consistent over long distances in manipulations. Experiments were +performed on different categories of FF++, such as DF, f2f, FS, and NT, +together with other types of GAN datasets in cross-domain scenarios. +Furthermore, experiments were also conducted on the FF++, DFDCPreview, and Celeb-DF +datasets under several post-processing conditions, such as blurring, +compression, and noise. The model surpassed the most advanced models in terms +of generalization, achieving 98% accuracy in cross-domain scenarios. This +demonstrates its ability to learn the shared distinguishing textural +characteristics of the manipulated samples. These experiments provide evidence +that the proposed model can be applied to various situations and +is resistant to many post-processing procedures. + +
+
+
+
+
+ + ☆ Robotic warehousing operations: a learn-then-optimize approach to + large-scale neighborhood search + + +
+ The rapid deployment of robotics technologies requires dedicated optimization +algorithms to manage large fleets of autonomous agents. This paper supports +robotic parts-to-picker operations in warehousing by optimizing +order-workstation assignments, item-pod assignments and the schedule of order +fulfillment at workstations. The model maximizes throughput, while managing +human workload at the workstations and congestion in the facility. We solve it +via large-scale neighborhood search, with a novel learn-then-optimize approach +to subproblem generation. The algorithm relies on an offline machine learning +procedure to predict objective improvements based on subproblem features, and +an online optimization model to generate a new subproblem at each iteration. In +collaboration with Amazon Robotics, we show that our model and algorithm +generate much stronger solutions for practical problems than state-of-the-art +approaches. In particular, our solution enhances the utilization of robotic +fleets by coordinating robotic tasks for human operators to pick multiple items +at once, and by coordinating robotic routes to avoid congestion in the +facility. + +
+
+
+
+
+ + ☆ LLaVA-Chef: A Multi-modal Generative Model for Food Recipes + + +
+ In the rapidly evolving landscape of online recipe sharing within a +globalized context, there has been a notable surge in research towards +comprehending and generating food recipes. Recent advancements in large +language models (LLMs) like GPT-2 and LLaVA have paved the way for Natural +Language Processing (NLP) approaches to delve deeper into various facets of +food-related tasks, encompassing ingredient recognition and comprehensive +recipe generation. Despite impressive performance and multi-modal adaptability +of LLMs, domain-specific training remains paramount for their effective +application. This work evaluates existing LLMs for recipe generation and +proposes LLaVA-Chef, a novel model trained on a curated dataset of diverse +recipe prompts in a multi-stage approach. First, we refine the mapping of +visual food image embeddings to the language space. Second, we adapt LLaVA to +the food domain by fine-tuning it on relevant recipe data. Third, we utilize +diverse prompts to enhance the model's recipe comprehension. Finally, we +improve the linguistic quality of generated recipes by penalizing the model +with a custom loss function. LLaVA-Chef demonstrates impressive improvements +over pretrained LLMs and prior works. A detailed qualitative analysis reveals +that LLaVA-Chef generates more detailed recipes with precise ingredient +mentions, compared to existing approaches. + +
+
+
+
+
+ + ☆ Revising Multimodal VAEs with Diffusion Decoders + + +
+ Multimodal VAEs often struggle with generating high-quality outputs, a +challenge that extends beyond the inherent limitations of the VAE framework. +The core issue lies in the restricted joint representation of the latent space, +particularly when complex modalities like images are involved. Feedforward +decoders, commonly used for these intricate modalities, inadvertently constrain +the joint latent space, leading to a degradation in the quality of the other +modalities as well. Although recent studies have shown improvement by +introducing modality-specific representations, the issue remains significant. +In this work, we demonstrate that incorporating a flexible diffusion decoder +specifically for the image modality not only enhances the generation quality of +the images but also positively impacts the performance of the other modalities +that rely on feedforward decoders. This approach addresses the limitations +imposed by conventional joint representations and opens up new possibilities +for improving multimodal generation tasks using the multimodal VAE framework. +Our model provides state-of-the-art results compared to other multimodal VAEs +in different datasets with higher coherence and superior quality in the +generated modalities + +
+
+
+
+
+ + ☆ Coverage Analysis of Multi-Environment Q-Learning Algorithms for + Wireless Network Optimization + + +
+ Q-learning is widely used to optimize wireless networks with unknown system +dynamics. Recent advancements include ensemble multi-environment hybrid +Q-learning algorithms, which utilize multiple Q-learning algorithms across +structurally related but distinct Markovian environments and outperform +existing Q-learning algorithms in terms of accuracy and complexity in +large-scale wireless networks. We herein conduct a comprehensive coverage +analysis to ensure optimal data coverage conditions for these algorithms. +Initially, we establish upper bounds on the expectation and variance of +different coverage coefficients. Leveraging these bounds, we present an +algorithm for efficient initialization of these algorithms. We test our +algorithm on two distinct real-world wireless networks. Numerical simulations +show that our algorithm can achieve 50% less policy error and 40% less runtime +complexity than state-of-the-art reinforcement learning algorithms. +Furthermore, our algorithm exhibits robustness to changes in network settings +and parameters. We also numerically validate our theoretical results. + +
+
+
+
+
+ + ☆ Longitudinal Modularity, a Modularity for Link Streams + + +
+ Temporal networks are commonly used to model real-life phenomena. When these +phenomena represent interactions and are captured at a fine-grained temporal +resolution, they are modeled as link streams. Community detection is an +essential network analysis task. Although many methods exist for static +networks, and some methods have been developed for temporal networks +represented as sequences of snapshots, few works can handle link streams. This +article introduces the first adaptation of the well-known Modularity quality +function to link streams. Unlike existing methods, it is independent of the +time scale of analysis. After introducing the quality function, and its +relation to existing static and dynamic definitions of Modularity, we show +experimentally its relevance for dynamic community evaluation. + +
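For reference, the static Modularity that the paper adapts to link streams can be computed with networkx as below; the longitudinal version additionally accounts for when memberships and interactions are active, which this toy snippet does not attempt to reproduce.

```python
# Static Modularity on a toy graph, as a baseline for the longitudinal variant.
import networkx as nx
from networkx.algorithms.community import modularity

G = nx.karate_club_graph()
communities = [
    {n for n in G if G.nodes[n]["club"] == "Mr. Hi"},
    {n for n in G if G.nodes[n]["club"] == "Officer"},
]
print(modularity(G, communities))
```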
+
+
+
+
+ + ☆ Learning Multi-agent Multi-machine Tending by Mobile Robots + + +
+ Robotics can help address the growing worker shortage challenge of the +manufacturing industry. As such, machine tending is a task collaborative robots +can tackle that can also highly boost productivity. Nevertheless, existing +robotics systems deployed in that sector rely on a fixed single-arm setup, +whereas mobile robots can provide more flexibility and scalability. In this +work, we introduce a multi-agent multi-machine tending learning framework by +mobile robots based on Multi-agent Reinforcement Learning (MARL) techniques +with the design of a suitable observation and reward. Moreover, an +attention-based encoding mechanism is developed and integrated into Multi-agent +Proximal Policy Optimization (MAPPO) algorithm to boost its performance for +machine tending scenarios. Our model (AB-MAPPO) outperformed MAPPO in this new +challenging scenario in terms of task success, safety, and resources +utilization. Furthermore, we provided an extensive ablation study to support +our various design decisions. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ GSTAM: Efficient Graph Distillation with Structural Attention-Matching ECCV + + +
+ Graph distillation has emerged as a solution for reducing large graph +datasets to smaller, more manageable, and informative ones. Existing methods +primarily target node classification, involve computationally intensive +processes, and fail to capture the true distribution of the full graph dataset. +To address these issues, we introduce Graph Distillation with Structural +Attention Matching (GSTAM), a novel method for condensing graph classification +datasets. GSTAM leverages the attention maps of GNNs to distill structural +information from the original dataset into synthetic graphs. The structural +attention-matching mechanism exploits the areas of the input graph that GNNs +prioritize for classification, effectively distilling such information into the +synthetic graphs and improving overall distillation performance. Comprehensive +experiments demonstrate GSTAM's superiority over existing methods, achieving +0.45% to 6.5% better performance in extreme condensation ratios, highlighting +its potential use in advancing distillation for graph classification tasks +(Code available at https://github.com/arashrasti96/GSTAM). + +
+
+ comment: Accepted at ECCV-DD 2024 +
+
+
+
+
+ + ☆ Characterization of point-source transient events with a rolling-shutter + compressed sensing system + + +
+ Point-source transient events (PSTEs) - optical events that are both +extremely fast and extremely small - pose several challenges to an imaging +system. Due to their speed, accurately characterizing such events often +requires detectors with very high frame rates. Due to their size, accurately +detecting such events requires maintaining coverage over an extended +field-of-view, often through the use of imaging focal plane arrays (FPA) with a +global shutter readout. Traditional imaging systems that meet these +requirements are costly in terms of price, size, weight, power consumption, and +data bandwidth, and there is a need for cheaper solutions with adequate +temporal and spatial coverage. To address these issues, we develop a novel +compressed sensing algorithm adapted to the rolling shutter readout of an +imaging system. This approach enables reconstruction of a PSTE signature at the +sampling rate of the rolling shutter, offering a 1-2 order of magnitude +temporal speedup and a proportional reduction in data bandwidth. We present +empirical results demonstrating accurate recovery of PSTEs using measurements +that are spatially undersampled by a factor of 25, and our simulations show +that, relative to other compressed sensing algorithms, our algorithm is both +faster and yields higher quality reconstructions. We also present theoretical +results characterizing our algorithm and corroborating simulations. The +potential impact of our work includes the development of much faster, cheaper +sensor solutions for PSTE detection and characterization. + +
+
+ comment: 20 pages, 11 figures +
+
+
+
+
+ + ☆ Probabilistic Decomposed Linear Dynamical Systems for Robust Discovery + of Latent Neural Dynamics + + +
+ Time-varying linear state-space models are powerful tools for obtaining +mathematically interpretable representations of neural signals. For example, +switching and decomposed models describe complex systems using latent variables +that evolve according to simple locally linear dynamics. However, existing +methods for latent variable estimation are not robust to dynamical noise and +system nonlinearity due to noise-sensitive inference procedures and limited +model formulations. This can lead to inconsistent results on signals with +similar dynamics, limiting the model's ability to provide scientific insight. +In this work, we address these limitations and propose a probabilistic approach +to latent variable estimation in decomposed models that improves robustness +against dynamical noise. Additionally, we introduce an extended latent dynamics +model to improve robustness against system nonlinearities. We evaluate our +approach on several synthetic dynamical systems, including an +empirically-derived brain-computer interface experiment, and demonstrate more +accurate latent variable inference in nonlinear systems with diverse noise +conditions. Furthermore, we apply our method to a real-world clinical +neurophysiology dataset, illustrating the ability to identify interpretable and +coherent structure where previous models cannot. + +
+
+
+
+
+ + ☆ The Star Geometry of Critic-Based Regularizer Learning + + +
+ Variational regularization is a classical technique to solve statistical +inference tasks and inverse problems, with modern data-driven approaches +parameterizing regularizers via deep neural networks showcasing impressive +empirical performance. Recent works along these lines learn task-dependent +regularizers. This is done by integrating information about the measurements +and ground-truth data in an unsupervised, critic-based loss function, where the +regularizer attributes low values to likely data and high values to unlikely +data. However, there is little theory about the structure of regularizers +learned via this process and how it relates to the two data distributions. To +make progress on this challenge, we initiate a study of optimizing critic-based +loss functions to learn regularizers over a particular family of regularizers: +gauges (or Minkowski functionals) of star-shaped bodies. This family contains +regularizers that are commonly employed in practice and shares properties with +regularizers parameterized by deep neural networks. We specifically investigate +critic-based losses derived from variational representations of statistical +distances between probability measures. By leveraging tools from star geometry +and dual Brunn-Minkowski theory, we illustrate how these losses can be +interpreted as dual mixed volumes that depend on the data distribution. This +allows us to derive exact expressions for the optimal regularizer in certain +cases. Finally, we identify which neural network architectures give rise to +such star body gauges and when such regularizers have favorable properties +for optimization. More broadly, this work highlights how the tools of star +geometry can aid in understanding the geometry of unsupervised regularizer +learning. + +
+
+
+
+
+ + ☆ Machine Learning-Based Research on the Adaptability of Adolescents to + Online Education + + +
+ With the rapid advancement of internet technology, the adaptability of +adolescents to online learning has emerged as a focal point of interest within +the educational sphere. However, the academic community's efforts to develop +predictive models for adolescent online learning adaptability require further +refinement and expansion. Utilizing data from the "Chinese Adolescent Online +Education Survey" spanning the years 2014 to 2016, this study implements five +machine learning algorithms - logistic regression, K-nearest neighbors, random +forest, XGBoost, and CatBoost - to analyze the factors influencing adolescent +online learning adaptability and to determine the model best suited for +prediction. The research reveals that the duration of courses, the financial +status of the family, and age are the primary factors affecting students' +adaptability in online learning environments. Additionally, age significantly +impacts students' adaptive capacities. Among the predictive models, the random +forest, XGBoost, and CatBoost algorithms demonstrate superior forecasting +capabilities, with the random forest model being particularly adept at +capturing the characteristics of students' adaptability. + +
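An illustrative scikit-learn version of this kind of model comparison is sketched below on synthetic stand-in data (the survey data itself is not reproduced here); XGBoost and CatBoost are omitted to keep the example dependency-light.

```python
# Hedged sketch: cross-validated comparison of three of the five classifiers.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
models = {
    "logistic": LogisticRegression(max_iter=1000),
    "knn": KNeighborsClassifier(),
    "random_forest": RandomForestClassifier(random_state=0),
}
for name, model in models.items():
    print(name, cross_val_score(model, X, y, cv=5).mean())
```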
+
+
+
+
+ + ♻ ☆ Batched Stochastic Bandit for Nondegenerate Functions + + +
+ This paper studies batched bandit learning problems for nondegenerate +functions. We introduce an algorithm that solves the batched bandit problem for +nondegenerate functions near-optimally. More specifically, we introduce an +algorithm, called Geometric Narrowing (GN), whose regret bound is of order +$\widetilde{{\mathcal{O}}} ( A_{+}^d \sqrt{T} )$. In addition, GN only needs +$\mathcal{O} (\log \log T)$ batches to achieve this regret. We also provide +lower bound analysis for this problem. More specifically, we prove that over +some (compact) doubling metric space of doubling dimension $d$: 1. For any +policy $\pi$, there exists a problem instance on which $\pi$ admits a regret of +order ${\Omega} ( A_-^d \sqrt{T})$; 2. No policy can achieve a regret of order +$ A_-^d \sqrt{T} $ over all problem instances, using less than $ \Omega ( \log +\log T ) $ rounds of communications. Our lower bound analysis shows that the GN +algorithm achieves near optimal regret with minimal number of batches. + +
+
+ comment: 34 pages, 14 colored figures +
+
+
+
+
+ + ♻ ☆ VGBench: Evaluating Large Language Models on Vector Graphics + Understanding and Generation + + +
+ In the realm of vision models, the primary mode of representation is using +pixels to rasterize the visual world. Yet this is not always the best or unique +way to represent visual content, especially for designers and artists who +depict the world using geometry primitives such as polygons. Vector graphics +(VG), on the other hand, offer a textual representation of visual content, +which can be more concise and powerful for content like cartoons, sketches and +scientific figures. Recent studies have shown promising results on processing +vector graphics with capable Large Language Models (LLMs). However, such works +focus solely on qualitative results, understanding, or a specific type of +vector graphics. We propose VGBench, a comprehensive benchmark for LLMs on +handling vector graphics through diverse aspects, including (a) both visual +understanding and generation, (b) evaluation of various vector graphics +formats, (c) diverse question types, (d) wide range of prompting techniques, +(e) under multiple LLMs and (f) comparison with VLMs on rasterized +representations. Evaluating on our collected 4279 understanding and 5845 +generation samples, we find that LLMs show strong capability on both aspects +while exhibiting less desirable performance on low-level formats (SVG). Both +data and evaluation pipeline will be open-sourced at https://vgbench.github.io. + +
+
+ comment: Project Page: https://vgbench.github.io +
+
+
+
+
+ + ♻ ☆ Conditional score-based diffusion models for solving inverse problems in + mechanics + + +
+ We propose a framework to perform Bayesian inference using conditional +score-based diffusion models to solve a class of inverse problems in mechanics +involving the inference of a specimen's spatially varying material properties +from noisy measurements of its mechanical response to loading. Conditional +score-based diffusion models are generative models that learn to approximate +the score function of a conditional distribution using samples from the joint +distribution. More specifically, the score functions corresponding to multiple +realizations of the measurement are approximated using a single neural network, +the so-called score network, which is subsequently used to sample the posterior +distribution using an appropriate Markov chain Monte Carlo scheme based on +Langevin dynamics. Training the score network only requires simulating the +forward model. Hence, the proposed approach can accommodate black-box forward +models and complex measurement noise. Moreover, once the score network has been +trained, it can be re-used to solve the inverse problem for different +realizations of the measurements. We demonstrate the efficacy of the proposed +approach on a suite of high-dimensional inverse problems in mechanics that +involve inferring heterogeneous material properties from noisy measurements. +Some examples we consider involve synthetic data, while others include data +collected from actual elastography experiments. Further, our applications +demonstrate that the proposed approach can handle different measurement +modalities, complex patterns in the inferred quantities, non-Gaussian and +non-additive noise models, and nonlinear black-box forward models. The results +show that the proposed framework can solve large-scale physics-based inverse +problems efficiently. + +
+
+
+
+
+ + ♻ ☆ FilFL: Client Filtering for Optimized Client Participation in Federated + Learning ECAI'24 + + +
+ Federated learning, an emerging machine learning paradigm, enables clients to +collaboratively train a model without exchanging local data. Clients +participating in the training process significantly impact the convergence +rate, learning efficiency, and model generalization. We propose a novel +approach, client filtering, to improve model generalization and optimize client +participation and training. The proposed method periodically filters available +clients to identify a subset that maximizes a combinatorial objective function +with an efficient greedy filtering algorithm. Thus, the clients are assessed as +a combination rather than individually. We theoretically analyze the +convergence of federated learning with client filtering in heterogeneous +settings and evaluate its performance across diverse vision and language tasks, +including realistic scenarios with time-varying client availability. Our +empirical results demonstrate several benefits of our approach, including +improved learning efficiency, faster convergence, and up to 10% higher test +accuracy than training without client filtering. + +
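The greedy filtering loop can be sketched schematically: repeatedly add the available client whose inclusion most increases a set objective. The objective used below (coverage of a label space) is a toy stand-in for the paper's combinatorial objective.

```python
# Hedged sketch of greedy client filtering with a toy coverage objective.
import numpy as np

def greedy_filter(client_label_sets, k):
    selected, covered = [], set()
    for _ in range(k):
        gains = [len(s - covered) if i not in selected else -1
                 for i, s in enumerate(client_label_sets)]
        best = int(np.argmax(gains))
        if gains[best] <= 0:
            break                      # no remaining client adds anything new
        selected.append(best)
        covered |= client_label_sets[best]
    return selected

clients = [{0, 1}, {1, 2, 3}, {3, 4}, {0, 4, 5}]
print(greedy_filter(clients, k=3))
```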
+
+ comment: Accepted at ECAI'24 +
+
+
+
+
+ + ♻ ☆ Learning to Prompt Your Domain for Vision-Language Models + + +
+ Prompt learning has recently become a very efficient transfer learning +paradigm for Contrastive Language Image Pretraining (CLIP) models. Compared +with fine-tuning the entire encoder, prompt learning can obtain highly +competitive results by optimizing only a small number of parameters, which +presents particularly attractive benefits for federated learning applications +that prioritize communication efficiency. However, in this work, we identify +that directly transferring prompt learning approaches into federated learning +does not yield favorable results since the model often suffers from +considerable domain gaps across different clients. To address this issue, we +propose ADAPT, a novel domain-aware prompt learning approach that facilitates +both intra- and inter-domain prompts across federated participants. The basic +idea of ADAPT is that the prompted CLIP should detect the input image's domain +correspondence before predicting its category. Extensive +experiments of ADAPT demonstrate its significant efficiency and effectiveness +in federated learning. For example, by learning and sharing only 0.08M +parameters, our ADAPT attains a 68.4% average accuracy over six domains in the +DomainNet dataset, which improves the original CLIP by a large margin of 14.8%. + +
+
+
+
+
+ + ♻ ☆ Evaluation Framework for Feedback Generation Methods in Skeletal + Movement Assessment ECCV 2024 + + +
+ The application of machine-learning solutions to movement assessment from +skeleton videos has attracted significant research attention in recent years. +This advancement has made rehabilitation at home more accessible, utilizing +movement assessment algorithms that can operate on affordable equipment for +human pose detection and analysis from 2D or 3D videos. While the primary +objective of automatic assessment tasks is to score movements, the automatic +generation of feedback highlighting key movement issues has the potential to +significantly enhance and accelerate the rehabilitation process. While numerous +research works exist in the field of automatic movement assessment, only a +handful address feedback generation. In this study, we propose terminology and +criteria for the classification, evaluation, and comparison of feedback +generation solutions. We discuss the challenges associated with each feedback +generation approach and use our proposed criteria to classify existing +solutions. To our knowledge, this is the first work that formulates feedback +generation in skeletal movement assessment. + +
+
+ comment: Accepted to xAI4Biometrics 2024 at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Adaptive Log-Euclidean Metrics for SPD Matrix Learning + + +
+ Symmetric Positive Definite (SPD) matrices have received wide attention in +machine learning due to their intrinsic capacity to encode underlying +structural correlation in data. Many successful Riemannian metrics have been +proposed to reflect the non-Euclidean geometry of SPD manifolds. However, most +existing metric tensors are fixed, which might lead to sub-optimal performance +for SPD matrix learning, especially for deep SPD neural networks. To remedy +this limitation, we leverage the commonly encountered pullback techniques and +propose Adaptive Log-Euclidean Metrics (ALEMs), which extend the widely used +Log-Euclidean Metric (LEM). Compared with the previous Riemannian metrics, our +metrics contain learnable parameters, which can better adapt to the complex +dynamics of Riemannian neural networks with minor extra computations. We also +present a complete theoretical analysis to support our ALEMs, including +algebraic and Riemannian properties. The experimental and theoretical results +demonstrate the merit of the proposed metrics in improving the performance of +SPD neural networks. The efficacy of our metrics is further showcased on a set +of recently developed Riemannian building blocks, including Riemannian batch +normalization, Riemannian Residual blocks, and Riemannian classifiers. + +
+
+ comment: Accepted by TIP 2024 +
+
+
+
+
+ + ♻ ☆ Wasserstein Gradient Boosting: A Framework for Distribution-Valued + Supervised Learning + + +
+ Gradient boosting is a sequential ensemble method that fits a new weaker +learner to pseudo residuals at each iteration. We propose Wasserstein gradient +boosting, a novel extension of gradient boosting that fits a new weak learner +to alternative pseudo residuals that are Wasserstein gradients of loss +functionals of probability distributions assigned at each input. It solves +distribution-valued supervised learning, where the output values of the +training dataset are probability distributions for each input. In +classification and regression, a model typically returns, for each input, a +point estimate of a parameter of a noise distribution specified for a response +variable, such as the class probability parameter of a categorical distribution +specified for a response label. A main application of Wasserstein gradient +boosting in this paper is tree-based evidential learning, which returns a +distributional estimate of the response parameter for each input. We +empirically demonstrate the superior performance of the probabilistic +prediction by Wasserstein gradient boosting in comparison with existing +uncertainty quantification methods. + +
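The boosting loop itself is standard and is sketched below with squared-loss pseudo-residuals; per the abstract, Wasserstein gradient boosting replaces these residuals with Wasserstein gradients of per-input loss functionals over distributions, a step only noted in a comment here rather than implemented.

```python
# Schematic gradient-boosting loop fitting trees to pseudo-residuals.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def boost(X, y, rounds=20, lr=0.1):
    pred = np.zeros_like(y)
    trees = []
    for _ in range(rounds):
        residual = y - pred     # squared-loss pseudo-residual; WGB would instead use
                                # a Wasserstein gradient of a distributional loss
        tree = DecisionTreeRegressor(max_depth=2).fit(X, residual)
        pred += lr * tree.predict(X)
        trees.append(tree)
    return trees, pred

rng = np.random.default_rng(0)
X = rng.uniform(-2, 2, size=(200, 1))
y = np.sin(X[:, 0])
_, pred = boost(X, y)
print(float(np.mean((y - pred) ** 2)))
```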
+
+
+
+
+ + ♻ ☆ GEAR: An Efficient KV Cache Compression Recipe for Near-Lossless + Generative Inference of LLM + + +
+ Key-value (KV) caching has become the de facto approach to accelerating generation +in large language model (LLM) inference. However, the growing cache demand +with increasing sequence length has turned LLM inference into a memory-bound +problem, significantly constraining the system throughput. Existing +methods rely on dropping unimportant tokens or quantizing all entries +uniformly. Such methods, however, often incur high approximation errors to +represent the compressed matrices. The autoregressive decoding process further +compounds the error of each step, resulting in critical deviation in model +generation and deterioration of performance. To tackle this challenge, we +propose GEAR, an efficient KV cache compression framework that achieves +near-lossless high-ratio compression. GEAR first applies quantization to the +majority of entries of similar magnitudes at ultra-low precision. It then +employs a low-rank matrix to approximate the quantization error, and a sparse +matrix to remedy individual errors from outlier entries. By adeptly integrating +the three techniques, GEAR is able to fully exploit their synergistic potential. +Our experiments demonstrate that compared to alternatives, GEAR achieves +near-lossless 4-bit KV cache compression with up to 2.38x throughput +improvement, while reducing peak-memory size by up to 2.29x. Our code is publicly +available at https://github.com/HaoKang-Timmy/GEAR. + +
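A toy numpy sketch of the three-part recipe described above is given below: uniform low-bit quantization of the entries, a low-rank approximation of the quantization residual, and a sparse matrix for outlier corrections. Bit width, rank, and outlier fraction are illustrative choices, and no claim is made that this matches GEAR's actual kernels.

```python
# Hedged sketch of quantization + low-rank residual + sparse outlier correction.
import numpy as np

def gear_like_compress(K, bits=4, rank=2, outlier_frac=0.01):
    lo, hi = K.min(), K.max()
    scale = (hi - lo) / (2**bits - 1)
    Kq = np.round((K - lo) / scale) * scale + lo         # quantized entries
    R = K - Kq                                           # quantization residual
    U, s, Vt = np.linalg.svd(R, full_matrices=False)
    L = (U[:, :rank] * s[:rank]) @ Vt[:rank]             # low-rank part of the residual
    E = R - L
    k = max(1, int(outlier_frac * E.size))
    thresh = np.partition(np.abs(E).ravel(), -k)[-k]
    S = np.where(np.abs(E) >= thresh, E, 0.0)            # sparse outlier correction
    return Kq + L + S

K = np.random.default_rng(0).normal(size=(64, 32))
print(np.abs(K - gear_like_compress(K)).max())           # reconstruction error
```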
+
+
+
+
+ + ♻ ☆ Misam: Using ML in Dataflow Selection of Sparse-Sparse Matrix + Multiplication ISCA 2024 + + +
+ Sparse matrix-matrix multiplication (SpGEMM) is a critical operation in +numerous fields, including scientific computing, graph analytics, and deep +learning. These applications exploit the sparsity of matrices to reduce storage +and computational demands. However, the irregular structure of sparse matrices +poses significant challenges for performance optimization. Traditional hardware +accelerators are tailored for specific sparsity patterns with fixed dataflow +schemes -- inner, outer, and row-wise -- but often perform suboptimally when the +actual sparsity deviates from these predetermined patterns. As the use of +SpGEMM expands across various domains, each with distinct sparsity +characteristics, the demand for hardware accelerators that can efficiently +handle a range of sparsity patterns is increasing. This paper presents a +machine-learning-based approach for adaptively selecting the most appropriate +dataflow scheme for SpGEMM tasks with diverse sparsity patterns. By employing +decision trees and deep reinforcement learning, we explore the potential of +these techniques to surpass heuristic-based methods in identifying optimal +dataflow schemes. We evaluate our models by comparing their performance with +that of a heuristic, highlighting the strengths and weaknesses of each +approach. Our findings suggest that using machine learning for dynamic dataflow +selection in hardware accelerators can provide gains of up to 28x. + +
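The decision-tree variant of the selector can be pictured with the sketch below, where simple sparsity features of the two operands are mapped to one of three dataflow schemes; the features, labels, and data are synthetic placeholders rather than profiled hardware measurements.

```python
# Hedged sketch of a learned dataflow selector on synthetic features.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
# features: [density_A, density_B, relative_rows, relative_cols]
# labels:   0 = inner, 1 = outer, 2 = row-wise
X = rng.uniform(size=(300, 4))
y = rng.integers(0, 3, size=300)     # stand-in labels; real ones come from profiling
selector = DecisionTreeClassifier(max_depth=4).fit(X, y)
print(selector.predict([[0.01, 0.05, 0.5, 0.5]]))
```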
+
+ comment: Accepted to ISCA 2024 MLArchSys workshop + https://openreview.net/forum?id=A1V9FaZRbV +
+
+
+
+
+ + ♻ ☆ Iterative Methods for Vecchia-Laplace Approximations for Latent Gaussian + Process Models + + +
+ Latent Gaussian process (GP) models are flexible probabilistic non-parametric +function models. Vecchia approximations are accurate approximations for GPs to +overcome computational bottlenecks for large data, and the Laplace +approximation is a fast method with asymptotic convergence guarantees to +approximate marginal likelihoods and posterior predictive distributions for +non-Gaussian likelihoods. Unfortunately, the computational complexity of +combined Vecchia-Laplace approximations grows faster than linearly in the +sample size when used in combination with direct solver methods such as the +Cholesky decomposition. Computations with Vecchia-Laplace approximations can +thus become prohibitively slow precisely when the approximations are usually +the most accurate, i.e., on large data sets. In this article, we present +iterative methods to overcome this drawback. Among other things, we introduce +and analyze several preconditioners, derive new convergence results, and +propose novel methods for accurately approximating predictive variances. We +analyze our proposed methods theoretically and in experiments with simulated +and real-world data. In particular, we obtain a speed-up of an order of +magnitude compared to Cholesky-based calculations and a threefold increase in +prediction accuracy in terms of the continuous ranked probability score +compared to a state-of-the-art method on a large satellite data set. All +methods are implemented in a free C++ software library with high-level Python +and R packages. + +
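The flavor of replacing a Cholesky solve with a preconditioned iterative method can be sketched with SciPy's conjugate gradients and a simple Jacobi (diagonal) preconditioner on a toy SPD system; the preconditioners actually proposed in the paper are more elaborate.

```python
# Hedged sketch: CG with a Jacobi preconditioner instead of a direct factorization.
import numpy as np
from scipy.sparse import diags, random as sparse_random
from scipy.sparse.linalg import LinearOperator, cg

n = 500
A = sparse_random(n, n, density=0.01, random_state=0)
A = A @ A.T + diags(np.full(n, 1.0))                    # make the system SPD
b = np.ones(n)
d = A.diagonal()
M = LinearOperator((n, n), matvec=lambda x: x / d)      # Jacobi preconditioner
x, info = cg(A, b, M=M, atol=1e-8)
print(info, np.linalg.norm(A @ x - b))
```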
+
+
+
+
+ + ♻ ☆ Methods for Recovering Conditional Independence Graphs: A Survey + + +
+ Conditional Independence (CI) graphs are a type of probabilistic graphical +model that is primarily used to gain insights about feature relationships. +Each edge represents the partial correlation between the connected features, +which gives information about their direct dependence. In this survey, we list +different methods and study the advances in techniques developed to recover +CI graphs. We cover traditional optimization methods as well as recently +developed deep learning architectures, along with their recommended +implementations. To facilitate wider adoption, we include preliminaries that +consolidate associated operations, for example techniques to obtain a covariance +matrix for mixed data types. + +
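As a minimal example of one traditional optimization method covered by the survey, the graphical lasso recovers a sparse precision matrix whose nonzero off-diagonal entries correspond to CI-graph edges; the data below are synthetic.

```python
# Minimal graphical-lasso example for CI-graph recovery on synthetic data.
import numpy as np
from sklearn.covariance import GraphicalLasso

rng = np.random.default_rng(0)
X = rng.multivariate_normal(np.zeros(4), np.eye(4), size=500)
model = GraphicalLasso(alpha=0.1).fit(X)
edges = np.abs(model.precision_) > 1e-4      # nonzero partial correlations
np.fill_diagonal(edges, False)
print(edges.astype(int))
```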
+
+
+
+
+ + ♻ ☆ Post-processing fairness with minimal changes + + +
+ In this paper, we introduce a novel post-processing algorithm that is +model-agnostic and does not require the sensitive attribute at test time. In +addition, our algorithm is explicitly designed to enforce minimal changes +between biased and debiased predictions, a property that, while highly +desirable, is rarely prioritized as an explicit objective in the fairness +literature. Our approach leverages a multiplicative factor applied to the logit +value of probability scores produced by a black-box classifier. We demonstrate +the efficacy of our method through empirical evaluations, comparing its +performance against four other debiasing algorithms on two widely used datasets +in fairness research. + +
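The core transformation is easy to sketch: scale the logit of the black-box probability score by a multiplicative factor and map it back to a probability. How the factor is chosen (e.g., on validation data under a fairness criterion) is the substantive part of the method and is not reproduced here; the factor below is an arbitrary placeholder.

```python
# Hedged sketch of a multiplicative logit adjustment for post-processing.
import numpy as np

def debias_scores(scores, factor):
    """Apply a multiplicative factor to the logit of each probability score."""
    logits = np.log(scores / (1.0 - scores))
    return 1.0 / (1.0 + np.exp(-factor * logits))

scores = np.clip(np.random.default_rng(0).beta(2, 5, size=10), 1e-6, 1 - 1e-6)
print(debias_scores(scores, factor=0.8))
```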
+
+
+
+
+ + ♻ ☆ Not (yet) the whole story: Evaluating Visual Storytelling Requires More + than Measuring Coherence, Grounding, and Repetition + + +
+ Visual storytelling consists in generating a natural language story given a +temporally ordered sequence of images. This task is not only challenging for +models, but also very difficult to evaluate with automatic metrics since there +is no consensus about what makes a story 'good'. In this paper, we introduce a +novel method that measures story quality in terms of human likeness regarding +three key aspects highlighted in previous work: visual grounding, coherence, +and repetitiveness. We then use this method to evaluate the stories generated +by several models, showing that the foundation model LLaVA obtains the best +result, but only slightly so compared to TAPM, a 50-times smaller visual +storytelling model. Upgrading the visual and language components of TAPM +results in a model that yields competitive performance with a relatively low +number of parameters. Finally, we carry out a human evaluation study, whose +results suggest that a 'good' story may require more than a human-like level of +visual grounding, coherence, and repetition. + +
+
+
+
+
+ + ♻ ☆ Gameplay Filters: Robust Zero-Shot Safety through Adversarial + Imagination + + +
+ Despite the impressive recent advances in learning-based robot control, +ensuring robustness to out-of-distribution conditions remains an open +challenge. Safety filters can, in principle, keep arbitrary control policies +from incurring catastrophic failures by overriding unsafe actions, but existing +solutions for complex (e.g., legged) robot dynamics do not span the full motion +envelope and instead rely on local, reduced-order models. These filters tend to +overly restrict agility and can still fail when perturbed away from nominal +conditions. This paper presents the gameplay filter, a new class of predictive +safety filter that continually plays out hypothetical matches between its +simulation-trained safety strategy and a virtual adversary co-trained to invoke +worst-case events and sim-to-real error, and precludes actions that would cause +it to fail down the line. We demonstrate the scalability and robustness of the +approach with a first-of-its-kind full-order safety filter for (36-D) +quadrupedal dynamics. Physical experiments on two different quadruped platforms +demonstrate the superior zero-shot effectiveness of the gameplay filter under +large perturbations such as tugging and unmodeled terrain. + +
+
+
+
+
+ + ♻ ☆ Generalization of Hamiltonian algorithms + + +
+ The paper proves generalization results for a class of stochastic learning +algorithms. The method applies whenever the algorithm generates an absolutely +continuous distribution relative to some a-priori measure and the Radon Nikodym +derivative has subgaussian concentration. Applications are bounds for the Gibbs +algorithm and randomizations of stable deterministic algorithms as well as +PAC-Bayesian bounds with data-dependent priors. + +
+
+
+
+
+ + ♻ ☆ Trajectory Forecasting through Low-Rank Adaptation of Discrete Latent + Codes + + +
+ Trajectory forecasting is crucial for video surveillance analytics, as it +enables the anticipation of future movements for a set of agents, e.g. +basketball players engaged in intricate interactions with long-term intentions. +Deep generative models offer a natural learning approach for trajectory +forecasting, yet they encounter difficulties in achieving an optimal balance +between sampling fidelity and diversity. We address this challenge by +leveraging Vector Quantized Variational Autoencoders (VQ-VAEs), which utilize a +discrete latent space to tackle the issue of posterior collapse. Specifically, +we introduce an instance-based codebook that allows tailored latent +representations for each example. In a nutshell, the rows of the codebook are +dynamically adjusted to reflect contextual information (i.e., past motion +patterns extracted from the observed trajectories). In this way, the +discretization process gains flexibility, leading to improved reconstructions. +Notably, instance-level dynamics are injected into the codebook through +low-rank updates, which restrict the customization of the codebook to a lower +dimension space. The resulting discrete space serves as the basis of the +subsequent step, which regards the training of a diffusion-based predictive +model. We show that such a two-fold framework, augmented with instance-level +discretization, leads to accurate and diverse forecasts, yielding +state-of-the-art performance on three established benchmarks. + +
+
+ comment: 15 pages, 3 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Verification of Geometric Robustness of Neural Networks via Piecewise + Linear Approximation and Lipschitz Optimisation ECAI 2024 + + +
+ We address the problem of verifying neural networks against geometric +transformations of the input image, including rotation, scaling, shearing, and +translation. The proposed method computes provably sound piecewise linear +constraints for the pixel values by using sampling and linear approximations in +combination with branch-and-bound Lipschitz optimisation. The method obtains +provably tighter over-approximations of the perturbation region than the +present state-of-the-art. We report results from experiments on a comprehensive +set of verification benchmarks on MNIST and CIFAR10. We show that our proposed +implementation resolves up to 32% more verification cases than present +approaches. + +
+
+ comment: ECAI 2024 +
+
+
+
+
+ + ♻ ☆ On the Efficacy of Text-Based Input Modalities for Action Anticipation + + +
+ Anticipating future actions is a highly challenging task due to the diversity
+and scale of potential future actions; yet, information from different
+modalities helps narrow down plausible action choices. Each modality can
+provide diverse and often complementary context for the model to learn from.
+While previous multi-modal methods leverage information from modalities such
+as video and audio, we primarily explore how text descriptions of actions and
+objects can also lead to more accurate action anticipation by providing
+additional contextual cues, e.g., about the environment and its contents. We
+propose a Multi-modal Contrastive Anticipative Transformer (M-CAT), a video
+transformer architecture that jointly learns from multi-modal features and
+text descriptions of actions and objects. We train our model in two stages,
+where the model first learns to align video clips with descriptions of future
+actions, and is subsequently fine-tuned to predict future actions. Compared to
+existing methods, M-CAT has the advantage of learning additional context from
+two types of text inputs: rich descriptions of future actions during
+pre-training, and text descriptions of detected objects and actions during
+modality feature fusion. Through extensive experimental evaluation, we
+demonstrate that our model outperforms previous methods on the EpicKitchens
+datasets, and show that using simple text descriptions of actions and objects
+aids in more effective action anticipation. In addition, we examine the impact
+of object and action information obtained via text, and perform extensive
+ablations.
+
+
+
+
+
+ + ♻ ☆ Standardized Interpretable Fairness Measures for Continuous Risk Scores + + +
+ We propose a standardized version of fairness measures for continuous scores +with a reasonable interpretation based on the Wasserstein distance. Our +measures are easily computable and well suited for quantifying and interpreting +the strength of group disparities as well as for comparing biases across +different models, datasets, or time points. We derive a link between the +different families of existing fairness measures for scores and show that the +proposed standardized fairness measures outperform ROC-based fairness measures +because they are more explicit and can quantify significant biases that +ROC-based fairness measures miss. + +
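+ As a rough illustration of the kind of score-distribution comparison described
+above (the paper's exact standardization may differ), a Wasserstein distance
+between two groups' risk scores can be computed as follows; scipy is assumed to
+be available:
+
+    # Scores are assumed to lie in [0, 1], so the 1-Wasserstein distance between
+    # the two group-wise score distributions is already on an interpretable scale.
+    import numpy as np
+    from scipy.stats import wasserstein_distance
+
+    rng = np.random.default_rng(0)
+    scores_a = rng.beta(2, 5, size=1000)   # illustrative risk scores, group A
+    scores_b = rng.beta(3, 4, size=1000)   # illustrative risk scores, group B
+
+    disparity = wasserstein_distance(scores_a, scores_b)
+    print(f"group disparity: {disparity:.3f}")
+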
+
+
+
+
+ + ♻ ☆ Follow-up Attention: An Empirical Study of Developer and Neural Model + Code Exploration + + +
+ Recent neural models of code, such as OpenAI Codex and AlphaCode, have
+demonstrated remarkable proficiency at code generation due to the underlying
+attention mechanism. However, it often remains unclear how the models actually
+process code, and to what extent their reasoning and the way their attention
+mechanism scans the code matches the patterns of developers. A poor
+understanding of the model reasoning process limits the way in which current
+neural models are leveraged today, so far mostly for their raw predictions. To
+fill this gap, this work studies how the processed attention signal of three
+open large language models - CodeGen, InCoder and GPT-J - agrees with how
+developers look at and explore code when each answers the same sensemaking
+questions about code. Furthermore, we contribute an open-source eye-tracking
+dataset comprising 92 manually-labeled sessions from 25 developers engaged in
+sensemaking tasks. We empirically evaluate five heuristics that do not use
+attention and ten post-processing approaches applied to the attention signal
+of CodeGen against our ground truth of developers exploring code, including
+the novel concept of follow-up attention, which exhibits the highest agreement
+between model and human attention. Our follow-up attention method can predict
+the next line a developer will look at with 47% accuracy. This outperforms the
+baseline prediction accuracy of 42.3%, which uses the session history of other
+developers to recommend the next line. These results demonstrate the potential
+of leveraging the attention signal of pre-trained models for effective code
+exploration.
+
+
+ comment: Published at IEEE Transactions on Software Engineering +
+
+
+
+
+ + ♻ ☆ Unified Convergence Theory of Stochastic and Variance-Reduced Cubic + Newton Methods + + +
+ We study stochastic Cubic Newton methods for solving general, possibly
+non-convex minimization problems. We propose a new framework, which we call
+the helper framework, that provides a unified view of the stochastic and
+variance-reduced second-order algorithms equipped with global complexity
+guarantees. It can also be applied to learning with auxiliary information. Our
+helper framework offers the algorithm designer high flexibility for
+constructing and analyzing the stochastic Cubic Newton methods, allowing
+arbitrary batch sizes and the use of noisy and possibly biased estimates of
+the gradients and Hessians, incorporating both variance reduction and lazy
+Hessian updates. We recover the best-known complexities for the stochastic and
+variance-reduced Cubic Newton under weak assumptions on the noise. A direct
+consequence of our theory is a new lazy stochastic second-order method, which
+significantly improves the arithmetic complexity for large-dimensional
+problems. We also establish complexity bounds for the classes of
+gradient-dominated objectives, which include convex and strongly convex
+problems. For Auxiliary Learning, we show that using a helper (auxiliary
+function) can outperform training alone if a given similarity measure is
+small.
+
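+ For reference (this is the classical cubic-regularized Newton update, not
+necessarily the paper's exact notation), each step with gradient estimate g_k,
+Hessian estimate H_k, and cubic regularization parameter M > 0 solves
+
+    x_{k+1} \in \arg\min_{y} \; \langle g_k, y - x_k \rangle
+        + \tfrac{1}{2} \langle H_k (y - x_k), y - x_k \rangle
+        + \tfrac{M}{6} \| y - x_k \|^3 ,
+
+ with the stochastic and variance-reduced variants differing in how g_k and
+H_k are estimated.
+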
+
+
+
+
+ + ♻ ☆ No Regrets: Investigating and Improving Regret Approximations for + Curriculum Discovery + + +
+ What data or environments to use for training to improve downstream +performance is a longstanding and very topical question in reinforcement +learning. In particular, Unsupervised Environment Design (UED) methods have +gained recent attention as their adaptive curricula enable agents to be robust +to in- and out-of-distribution tasks. We ask to what extent these methods are +themselves robust when applied to a novel setting, closely inspired by a +real-world robotics problem. Surprisingly, we find that the state-of-the-art +UED methods either do not improve upon the na\"{i}ve baseline of Domain +Randomisation (DR), or require substantial hyperparameter tuning to do so. Our +analysis shows that this is due to their underlying scoring functions failing +to predict intuitive measures of ``learnability'', i.e., in finding the +settings that the agent sometimes solves, but not always. Based on this, we +instead directly train on levels with high learnability and find that this +simple and intuitive approach outperforms UED methods and DR in several +binary-outcome environments, including on our domain and the standard UED +domain of Minigrid. We further introduce a new adversarial evaluation procedure +for directly measuring robustness, closely mirroring the conditional value at +risk (CVaR). We open-source all our code and present visualisations of final +policies here: https://github.com/amacrutherford/sampling-for-learnability. + +
+
+
+
+
+ + ♻ ☆ Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease + Classification: A Systematic Review + + +
+ Parkinson's disease (PD), the second most prevalent neurodegenerative +disorder worldwide, frequently presents with early-stage speech impairments. +Recent advancements in Artificial Intelligence (AI), particularly deep learning +(DL), have significantly enhanced PD diagnosis through the analysis of speech +data. Nevertheless, the progress of research is restricted by the limited +availability of publicly accessible speech-based PD datasets, primarily due to +privacy concerns. The goal of this systematic review is to explore the current +landscape of speech-based DL approaches for PD classification, based on 33 +scientific works published between 2020 and March 2024. We discuss their +available resources, capabilities, potential limitations, and issues related to +bias, explainability, and privacy. Furthermore, this review provides an +overview of publicly accessible speech-based datasets and open-source material +for PD. The DL approaches are categorized into end-to-end (E2E) learning, +transfer learning (TL) and deep acoustic features extraction (DAFE) approaches. +Among E2E approaches, Convolutional Neural Networks (CNNs) are prevalent, +though Transformers are increasingly popular. E2E approaches face challenges +such as limited data and computational resources, especially with Transformers. +TL addresses these issues by providing more robust PD diagnosis and better +generalizability across languages. DAFE aims to improve the explainability and +interpretability of results by examining the specific effects of deep features +on both other DL approaches and more traditional machine learning (ML) methods. +However, it often underperforms compared to E2E and TL approaches. + +
+
+ comment: Submitted in Applied Sciences - peer reviewed Open Access journal. + This research was funded by the NWO research programme AiNed Fellowship + Grants under the project Responsible AI for Voice Diagnostics (RAIVD) - grant + number NGF.1607.22.013 +
+
+
+
+
+ + ♻ ☆ GANs Conditioning Methods: A Survey + + +
+ In recent years, Generative Adversarial Networks (GANs) have seen significant
+advancements, leading to their widespread adoption across various fields. The
+original GAN architecture enables the generation of images without any
+specific control over the content, making it an unconditional generation
+process. However, many practical applications require precise control over the
+generated output, which has led to the development of conditional GANs (cGANs)
+that incorporate explicit conditioning to guide the generation process. cGANs
+extend the original framework by incorporating additional information
+(conditions), enabling the generation of samples that adhere to specific
+criteria. Various conditioning methods have been proposed, each differing in
+how they integrate the conditioning information into both the generator and
+the discriminator networks. In this work, we review the conditioning methods
+proposed for GANs, exploring the characteristics of each method and
+highlighting their unique mechanisms and theoretical foundations. Furthermore,
+we conduct a comparative analysis of these methods, evaluating their
+performance on various image datasets. Through these analyses, we aim to
+provide insights into the strengths and limitations of various conditioning
+techniques, guiding future research and applications in generative modeling.
+
+
+
+
+
+ + ♻ ☆ Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of + Peptides + + +
+ Molecular Dynamics (MD) simulations are irreplaceable and ubiquitous in fields
+such as materials science, chemistry, and pharmacology, to name just a few.
+Conventional MD simulations are plagued by numerical instability as well as
+long equilibration times, which limits broader applications of MD simulations.
+Recently, a surge of deep learning approaches has been devised for
+time-coarsened dynamics, which learn the state transition mechanism over much
+larger time scales to overcome these limitations. However, only a few methods
+target the underlying Boltzmann distribution by resampling techniques, where
+proposals are rarely accepted as new states, resulting in low efficiency. In
+this work, we propose a force-guided bridge matching model, FBM, the first
+framework to incorporate physical priors into bridge matching for full-atom
+time-coarsened dynamics. With the guidance of our well-designed intermediate
+force field, FBM can target the Boltzmann-like distribution by direct
+inference without extra steps. Experiments on small peptides verify our
+superiority in terms of comprehensive metrics and demonstrate transferability
+to unseen peptide systems.
+
+
+
+
+
+ + ♻ ☆ FRRI: a novel algorithm for fuzzy-rough rule induction + + +
+ Interpretability is the next frontier in machine learning research. In the
+search for white box models - as opposed to black box models, like random
+forests or neural networks - rule induction algorithms are a logical and
+promising option, since the rules can easily be understood by humans. Fuzzy
+and rough set theory have been successfully applied to this archetype, almost
+always separately. As both approaches to rule induction involve granular
+computing based on the concept of equivalence classes, it is natural to
+combine them. The QuickRules algorithm (Jensen and Cornelis, 2009) was a first
+attempt at using fuzzy rough set theory for rule induction. It is based on
+QuickReduct, a greedy algorithm for building decision reducts. QuickRules
+already showed an improvement over other rule induction methods. However, to
+evaluate the full potential of a fuzzy rough rule induction algorithm, one
+needs to start from the foundations. In this paper, we introduce a novel rule
+induction algorithm called Fuzzy Rough Rule Induction (FRRI). We provide
+background and explain the workings of our algorithm. Furthermore, we perform
+a computational experiment to evaluate the performance of our algorithm and
+compare it to other state-of-the-art rule induction approaches. We find that
+our algorithm is more accurate while creating small rulesets consisting of
+relatively short rules. We end the paper by outlining some directions for
+future work.
+
+
+
+
+
+ + ♻ ☆ A Guide to Feature Importance Methods for Scientific Inference + + +
+ While machine learning (ML) models are increasingly used due to their high +predictive power, their use in understanding the data-generating process (DGP) +is limited. Understanding the DGP requires insights into feature-target +associations, which many ML models cannot directly provide due to their opaque +internal mechanisms. Feature importance (FI) methods provide useful insights +into the DGP under certain conditions. Since the results of different FI +methods have different interpretations, selecting the correct FI method for a +concrete use case is crucial and still requires expert knowledge. This paper +serves as a comprehensive guide to help understand the different +interpretations of global FI methods. Through an extensive review of FI methods +and providing new proofs regarding their interpretation, we facilitate a +thorough understanding of these methods and formulate concrete recommendations +for scientific inference. We conclude by discussing options for FI uncertainty +estimation and point to directions for future research aiming at full +statistical inference from black-box ML models. + +
+
+
+
+
+ + ♻ ☆ Advances and Open Challenges in Federated Foundation Models + + +
+ The integration of Foundation Models (FMs) with Federated Learning (FL)
+presents a transformative paradigm in Artificial Intelligence (AI). This
+integration offers enhanced capabilities while addressing concerns of privacy,
+data decentralization, and computational efficiency. This paper provides a
+comprehensive survey of the emerging field of Federated Foundation Models
+(FedFM), elucidating their synergistic relationship and exploring novel
+methodologies, challenges, and future directions that the FL research field
+needs to focus on in order to thrive in the age of FMs. A systematic
+multi-tiered taxonomy is proposed, categorizing existing FedFM approaches for
+model training, aggregation, trustworthiness, and incentivization. Key
+challenges, including how to enable FL to deal with the high complexity of
+computational demands, privacy considerations, contribution evaluation, and
+communication efficiency, are thoroughly discussed. Moreover, the paper
+explores the intricate challenges of communication, scalability, and security
+inherent in training/fine-tuning FMs via FL. It highlights the potential of
+quantum computing to revolutionize the processes of training, inference,
+optimization, and data encryption. This survey also introduces the
+implementation requirements of FedFM and some practical FedFM applications,
+and then distils the lessons learned from our findings on FedFM. Finally, this
+survey not only provides insights into the current state and challenges of
+FedFM but also paves the way for future research directions, emphasizing the
+need for developing trustworthy solutions. It serves as a foundational guide
+for researchers and practitioners interested in contributing to this
+interdisciplinary and rapidly advancing field.
+
+
+ comment: Survey of Federated Foundation Models (FedFM) +
+
+
+
+
+ + ♻ ☆ Next Level Message-Passing with Hierarchical Support Graphs + + +
+ Message-Passing Neural Networks (MPNNs) are extensively employed in graph +learning tasks but suffer from limitations such as the restricted scope of +information exchange, by being confined to neighboring nodes during each round +of message passing. Various strategies have been proposed to address these +limitations, including incorporating virtual nodes to facilitate global +information exchange. In this study, we introduce the Hierarchical Support +Graph (HSG), an extension of the virtual node concept created through recursive +coarsening of the original graph. This approach provides a flexible framework +for enhancing information flow in graphs, independent of the specific MPNN +layers utilized. We present a theoretical analysis of HSGs, investigate their +empirical performance, and demonstrate that HSGs can surpass other methods +augmented with virtual nodes, achieving state-of-the-art results across +multiple datasets. + +
+
+
+
+
+ + ♻ ☆ A comparison between humans and AI at recognizing objects in unusual + poses + + +
+ Deep learning is closing the gap with human vision on several object +recognition benchmarks. Here we investigate this gap for challenging images +where objects are seen in unusual poses. We find that humans excel at +recognizing objects in such poses. In contrast, state-of-the-art deep networks +for vision (EfficientNet, SWAG, ViT, SWIN, BEiT, ConvNext) and state-of-the-art +large vision-language models (Claude 3.5, Gemini 1.5, GPT-4) are systematically +brittle on unusual poses, with the exception of Gemini showing excellent +robustness in that condition. As we limit image exposure time, human +performance degrades to the level of deep networks, suggesting that additional +mental processes (requiring additional time) are necessary to identify objects +in unusual poses. An analysis of error patterns of humans vs. networks reveals +that even time-limited humans are dissimilar to feed-forward deep networks. In +conclusion, our comparison reveals that humans and deep networks rely on +different mechanisms for recognizing objects in unusual poses. Understanding +the nature of the mental processes taking place during extra viewing time may +be key to reproduce the robustness of human vision in silico. + +
+
+
+
+
+ + ♻ ☆ Gradient Descent Fails to Learn High-frequency Functions and Modular + Arithmetic + + +
+ Classes of target functions containing a large number of approximately +orthogonal elements are known to be hard to learn by the Statistical Query +algorithms. Recently this classical fact re-emerged in a theory of +gradient-based optimization of neural networks. In the novel framework, the +hardness of a class is usually quantified by the variance of the gradient with +respect to a random choice of a target function. + A set of functions of the form $x\to ax \bmod p$, where $a$ is taken from +${\mathbb Z}_p$, has attracted some attention from deep learning theorists and +cryptographers recently. This class can be understood as a subset of +$p$-periodic functions on ${\mathbb Z}$ and is tightly connected with a class +of high-frequency periodic functions on the real line. + We present a mathematical analysis of limitations and challenges associated +with using gradient-based learning techniques to train a high-frequency +periodic function or modular multiplication from examples. We highlight that +the variance of the gradient is negligibly small in both cases when either a +frequency or the prime base $p$ is large. This in turn prevents such a learning +algorithm from being successful. + +
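+ A small numerical illustration (not taken from the paper) of the approximate
+orthogonality the abstract refers to: functions of the form x -> a*x mod p for
+distinct, generically chosen multipliers a tend to have small empirical
+correlation over Z_p, which is what drives the vanishing gradient variance for
+this class.
+
+    # Compare centred, normalised versions of x -> a*x mod p for a few
+    # multiplier pairs; the printed inner products are typically small in
+    # magnitude for generic pairs (they can be large for special pairs such
+    # as b = -a mod p).
+    import numpy as np
+
+    p = 97
+    x = np.arange(p)
+
+    def unit(a):
+        v = (a * x) % p
+        v = v - v.mean()
+        return v / np.linalg.norm(v)
+
+    for a, b in [(3, 5), (7, 11), (13, 29)]:
+        print(a, b, round(float(unit(a) @ unit(b)), 3))
+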
+
+
+
+
+ + ♻ ☆ The $μ\mathcal{G}$ Language for Programming Graph Neural Networks + + +
+ Graph neural networks form a class of deep learning architectures +specifically designed to work with graph-structured data. As such, they share +the inherent limitations and problems of deep learning, especially regarding +the issues of explainability and trustworthiness. We propose $\mu\mathcal{G}$, +an original domain-specific language for the specification of graph neural +networks that aims to overcome these issues. The language's syntax is +introduced, and its meaning is rigorously defined by a denotational semantics. +An equivalent characterization in the form of an operational semantics is also +provided and, together with a type system, is used to prove the type soundness +of $\mu\mathcal{G}$. We show how $\mu\mathcal{G}$ programs can be represented +in a more user-friendly graphical visualization, and provide examples of its +generality by showing how it can be used to define some of the most popular +graph neural network models, or to develop any custom graph processing +application. + +
+
+
+
+
+ + ♻ ☆ Evaluating the Predictive Features of Person-Centric Knowledge Graph + Embeddings: Unfolding Ablation Studies + + +
+ Developing novel predictive models with complex biomedical information is +challenging due to various idiosyncrasies related to heterogeneity, +standardization or sparseness of the data. We previously introduced a +person-centric ontology to organize information about individual patients, and +a representation learning framework to extract person-centric knowledge graphs +(PKGs) and to train Graph Neural Networks (GNNs). In this paper, we propose a +systematic approach to examine the results of GNN models trained with both +structured and unstructured information from the MIMIC-III dataset. Through +ablation studies on different clinical, demographic, and social data, we show +the robustness of this approach in identifying predictive features in PKGs for +the task of readmission prediction. + +
+
+ comment: Published in the 34th Medical Informatics Europe Conference +
+
+
+
+
+ + ♻ ☆ Can Synthetic Audio From Generative Foundation Models Assist Audio + Recognition and Speech Modeling? INTERSPEECH + + +
+ Recent advances in foundation models have enabled audio-generative models +that produce high-fidelity sounds associated with music, events, and human +actions. Despite the success achieved in modern audio-generative models, the +conventional approach to assessing the quality of the audio generation relies +heavily on distance metrics like Frechet Audio Distance. In contrast, we aim to +evaluate the quality of audio generation by examining the effectiveness of +using them as training data. Specifically, we conduct studies to explore the +use of synthetic audio for audio recognition. Moreover, we investigate whether +synthetic audio can serve as a resource for data augmentation in speech-related +modeling. Our comprehensive experiments demonstrate the potential of using +synthetic audio for audio recognition and speech-related modeling. Our code is +available at https://github.com/usc-sail/SynthAudio. + +
+
+ comment: Accepted to 2024 INTERSPEECH; corrections to ActivityNet labels +
+
+
+
+
+ + ♻ ☆ Efficient Topology-aware Data Augmentation for High-Degree Graph Neural + Networks KDD 2024 + + +
+ In recent years, graph neural networks (GNNs) have emerged as a potent tool +for learning on graph-structured data and won fruitful successes in varied +fields. The majority of GNNs follow the message-passing paradigm, where +representations of each node are learned by recursively aggregating features of +its neighbors. However, this mechanism brings severe over-smoothing and +efficiency issues over high-degree graphs (HDGs), wherein most nodes have +dozens (or even hundreds) of neighbors, such as social networks, transaction +graphs, power grids, etc. Additionally, such graphs usually encompass rich and +complex structure semantics, which are hard to capture merely by feature +aggregations in GNNs. Motivated by the above limitations, we propose TADA, an +efficient and effective front-mounted data augmentation framework for GNNs on +HDGs. Under the hood, TADA includes two key modules: (i) feature expansion with +structure embeddings, and (ii) topology- and attribute-aware graph +sparsification. The former obtains augmented node features and enhanced model +capacity by encoding the graph structure into high-quality structure embeddings +with our highly-efficient sketching method. Further, by exploiting +task-relevant features extracted from graph structures and attributes, the +second module enables the accurate identification and reduction of numerous +redundant/noisy edges from the input graph, thereby alleviating over-smoothing +and facilitating faster feature aggregations over HDGs. Empirically, TADA +considerably improves the predictive performance of mainstream GNN models on 8 +real homophilic/heterophilic HDGs in terms of node classification, while +achieving efficient training and inference processes. + +
+
+ comment: This is the technical report for the paper accepted to KDD 2024. 16 + pages +
+
+
+
+
+ + ♻ ☆ PsychoGAT: A Novel Psychological Measurement Paradigm through + Interactive Fiction Games with LLM Agents ACL 2024 + + +
+ Psychological measurement is essential for mental health, self-understanding, +and personal development. Traditional methods, such as self-report scales and +psychologist interviews, often face challenges with engagement and +accessibility. While game-based and LLM-based tools have been explored to +improve user interest and automate assessment, they struggle to balance +engagement with generalizability. In this work, we propose PsychoGAT +(Psychological Game AgenTs) to achieve a generic gamification of psychological +assessment. The main insight is that powerful LLMs can function both as adept +psychologists and innovative game designers. By incorporating LLM agents into +designated roles and carefully managing their interactions, PsychoGAT can +transform any standardized scales into personalized and engaging interactive +fiction games. To validate the proposed method, we conduct psychometric +evaluations to assess its effectiveness and employ human evaluators to examine +the generated content across various psychological constructs, including +depression, cognitive distortions, and personality traits. Results demonstrate +that PsychoGAT serves as an effective assessment tool, achieving statistically +significant excellence in psychometric metrics such as reliability, convergent +validity, and discriminant validity. Moreover, human evaluations confirm +PsychoGAT's enhancements in content coherence, interactivity, interest, +immersion, and satisfaction. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Neighborhood and Global Perturbations Supported SAM in Federated + Learning: From Local Tweaks To Global Awareness + + +
+ Federated Learning (FL) can be coordinated under the orchestration of a
+central server to collaboratively build a privacy-preserving model without the
+need for data exchange. However, participant data heterogeneity leads to local
+optima divergence, subsequently affecting convergence outcomes. Recent
+research has focused on global sharpness-aware minimization (SAM) and dynamic
+regularization techniques to enhance consistency between global and local
+generalization and optimization objectives. Nonetheless, the estimation of
+global SAM introduces additional computational and memory overhead, while
+dynamic regularization suffers from bias in the local and global dual
+variables due to training isolation. In this paper, we propose a novel FL
+algorithm, FedTOGA, designed to consider optimization and generalization
+objectives while maintaining minimal uplink communication overhead. By linking
+local perturbations to global updates, global generalization consistency is
+improved. Additionally, global updates are used to correct local dynamic
+regularizers, reducing dual-variable bias and enhancing optimization
+consistency. Global updates are passively received by clients, reducing
+overhead. We also propose neighborhood perturbation to approximate local
+perturbation, analyzing its strengths and limitations. Theoretical analysis
+shows that FedTOGA achieves a faster convergence rate of $O(1/T)$ under
+non-convex functions. Empirical studies demonstrate that FedTOGA outperforms
+state-of-the-art algorithms, achieving state-of-the-art performance with a
+1\% accuracy increase and 30\% faster convergence.
+
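+ For context, the client-side perturbation that SAM-based FL methods such as
+the one described above build on is the standard sharpness-aware ascent step,
+shown below as a generic PyTorch sketch (this is plain SAM, not FedTOGA's
+neighborhood or globally-linked variant):
+
+    # Move the local parameters to the sharpness-ascent point
+    # w + rho * g / ||g|| before computing the update gradient.
+    import torch
+
+    def sam_perturb(params, grads, rho=0.05):
+        norm = torch.sqrt(sum((g ** 2).sum() for g in grads)) + 1e-12
+        return [p + rho * g / norm for p, g in zip(params, grads)]
+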
+
+
+
+
+ + ♻ ☆ Uncertainty-based Fairness Measures + + +
+ Unfair predictions of machine learning (ML) models impede their broad +acceptance in real-world settings. Tackling this arduous challenge first +necessitates defining what it means for an ML model to be fair. This has been +addressed by the ML community with various measures of fairness that depend on +the prediction outcomes of the ML models, either at the group level or the +individual level. These fairness measures are limited in that they utilize +point predictions, neglecting their variances, or uncertainties, making them +susceptible to noise, missingness and shifts in data. In this paper, we first +show that an ML model may appear to be fair with existing point-based fairness +measures but biased against a demographic group in terms of prediction +uncertainties. Then, we introduce new fairness measures based on different +types of uncertainties, namely, aleatoric uncertainty and epistemic +uncertainty. We demonstrate on many datasets that (i) our uncertainty-based +measures are complementary to existing measures of fairness, and (ii) they +provide more insights about the underlying issues leading to bias. + +
+
+
+
+
+ + ♻ ☆ LaMAGIC: Language-Model-based Topology Generation for Analog Integrated + Circuits + + +
+ In the realm of electronic and electrical engineering, automation of analog
+circuit design is increasingly vital given the complexity and customized
+requirements of modern applications. However, existing methods only develop
+search-based algorithms that require many simulation iterations to design a
+custom circuit topology, which is usually a time-consuming process. To this
+end, we introduce LaMAGIC, a pioneering language model-based topology
+generation model that leverages supervised finetuning for automated analog
+circuit design. LaMAGIC can efficiently generate an optimized circuit design
+from the custom specification in a single pass. Our approach involves a
+meticulous development and analysis of various input and output formulations
+for circuits. These formulations can ensure canonical representations of
+circuits and align with the autoregressive nature of LMs to effectively
+address the challenges of representing analog circuits as graphs. The
+experimental results show that LaMAGIC achieves a success rate of up to 96\%
+under a strict tolerance of 0.01. We also examine the scalability and
+adaptability of LaMAGIC, specifically testing its performance on more complex
+circuits. Our findings reveal the enhanced effectiveness of our adjacency
+matrix-based circuit formulation with floating-point input, suggesting its
+suitability for handling intricate circuit designs. This research not only
+demonstrates the potential of language models in graph generation, but also
+builds a foundational framework for future explorations in automated analog
+circuit design.
+
+
+ comment: Proceedings of the 41st International Conference on Machine Learning, + PMLR 235:6253-6262 https://proceedings.mlr.press/v235/chang24c.html +
+
+
+
+
+ + ♻ ☆ 1 From the Pursuit of Universal AGI Architecture to Systematic Approach + to Heterogenous AGI: Addressing Alignment, Energy, & AGI Grand Challenges + + +
+ AI faces a trifecta of grand challenges: the Energy Wall, the Alignment
+Problem, and the Leap from Narrow AI to AGI. Contemporary AI solutions consume
+unsustainable amounts of energy during model training and daily operations.
+Making things worse, the amount of computation required to train each new AI
+model has been doubling every 2 months since 2020, directly translating to
+unprecedented increases in energy consumption.
+ The leap from AI to AGI requires multiple functional subsystems operating in
+a balanced manner, which requires a system architecture. However, the current
+approach to artificial intelligence lacks system design, even though system
+characteristics play a key role in the human brain, from the way it processes
+information to how it makes decisions. System design is the key to alignment,
+one of the most challenging goals in AI. This difficulty stems from the fact
+that the complexity of the human moral system requires a similarly
+sophisticated system for alignment. Without accurately reflecting the
+complexity of these core moral subsystems and systems, aligning AI with human
+values becomes significantly more challenging.
+ In this paper, we posit that system design is the missing piece in
+overcoming the grand challenges. We present a Systematic Approach to AGI that
+applies system design principles to AGI, while providing ways to overcome the
+energy wall and the alignment challenges. This paper asserts that artificial
+intelligence can be realized through a multiplicity of design-specific
+pathways, rather than a singular, overarching AGI architecture. AGI systems
+may exhibit diverse architectural configurations and capabilities, contingent
+upon their intended use cases. It advocates for a focus on employing system
+design principles as a guiding framework, rather than solely concentrating on
+a universal AGI architecture.
+
+
+ comment: International Journal on Semantic Computing (2024) Categories: + Artificial Intelligence; AI; Artificial General Intelligence; AGI; System + Design; System Architecture +
+
+
+
+
+ + ♻ ☆ Erasing Concepts from Text-to-Image Diffusion Models with Few-shot + Unlearning BMVC2024 + + +
+ Generating images from text has become easier because of the scaling of +diffusion models and advancements in the field of vision and language. These +models are trained using vast amounts of data from the Internet. Hence, they +often contain undesirable content such as copyrighted material. As it is +challenging to remove such data and retrain the models, methods for erasing +specific concepts from pre-trained models have been investigated. We propose a +novel concept-erasure method that updates the text encoder using few-shot +unlearning in which a few real images are used. The discussion regarding the +generated images after erasing a concept has been lacking. While there are +methods for specifying the transition destination for concepts, the validity of +the specified concepts is unclear. Our method implicitly achieves this by +transitioning to the latent concepts inherent in the model or the images. Our +method can erase a concept within 10 s, making concept erasure more accessible +than ever before. Implicitly transitioning to related concepts leads to more +natural concept erasure. We applied the proposed method to various concepts and +confirmed that concept erasure can be achieved tens to hundreds of times faster +than with current methods. By varying the parameters to be updated, we obtained +results suggesting that, like previous research, knowledge is primarily +accumulated in the feed-forward networks of the text encoder. Our code is +available at \url{https://github.com/fmp453/few-shot-erasing} + +
+
+ comment: 25 pages, 28 figures, accepted by BMVC2024 +
+
+
+
+
+ + ♻ ☆ A Best-of-Both-Worlds Algorithm for Constrained MDPs with Long-Term + Constraints + + +
+ We study online learning in episodic constrained Markov decision processes
+(CMDPs), where the learner aims at collecting as much reward as possible over
+the episodes, while satisfying some long-term constraints during the learning
+process. Rewards and constraints can be selected either stochastically or
+adversarially, and the transition function is not known to the learner. While
+online learning in classical (unconstrained) MDPs has received considerable
+attention over the last years, the setting of CMDPs is still largely
+unexplored. This is surprising, since in real-world applications, such as
+autonomous driving, automated bidding, and recommender systems, there are
+usually additional constraints and specifications that an agent has to obey
+during the learning process. In this paper, we provide the first
+best-of-both-worlds algorithm for CMDPs with long-term constraints, in the
+flavor of Balseiro et al. (2023). Our algorithm is capable of handling
+settings in which rewards and constraints are selected either stochastically
+or adversarially, without requiring any knowledge of the underlying process.
+Moreover, our algorithm matches state-of-the-art regret and constraint
+violation bounds for settings in which constraints are selected
+stochastically, while it is the first to provide guarantees in the case in
+which they are chosen adversarially.
+
+
+
+
+
+ + ♻ ☆ Category-Theoretical and Topos-Theoretical Frameworks in Machine + Learning: A Survey + + +
+ In this survey, we provide an overview of category theory-derived machine +learning from four mainstream perspectives: gradient-based learning, +probability-based learning, invariance and equivalence-based learning, and +topos-based learning. For the first three topics, we primarily review research +in the past five years, updating and expanding on the previous survey by +Shiebler et al.. The fourth topic, which delves into higher category theory, +particularly topos theory, is surveyed for the first time in this paper. In +certain machine learning methods, the compositionality of functors plays a +vital role, prompting the development of specific categorical frameworks. +However, when considering how the global properties of a network reflect in +local structures and how geometric properties are expressed with logic, the +topos structure becomes particularly significant and profound. + +
+
+
+
+
+ + ♻ ☆ CAST: Cluster-Aware Self-Training for Tabular Data via Reliable + Confidence + + +
+ Tabular data is one of the most widely used data modalities, encompassing
+numerous datasets with substantial amounts of unlabeled data. Despite this
+prevalence, there is a notable lack of simple and versatile methods for
+utilizing unlabeled data in the tabular domain, where both gradient-boosting
+decision trees and neural networks are employed. In this context,
+self-training has gained traction due to its simplicity and versatility, yet
+it is vulnerable to noisy pseudo-labels caused by erroneous confidence.
+Several solutions have been proposed to handle this problem, but they often
+compromise the inherent advantages of self-training, resulting in limited
+applicability in the tabular domain. To address this issue, we explore a novel
+direction of reliable confidence in self-training contexts and conclude that
+self-training can be improved by ensuring that the confidence, which
+represents the value of the pseudo-label, aligns with the cluster assumption.
+In this regard, we propose Cluster-Aware Self-Training (CAST) for tabular
+data, which enhances existing self-training algorithms at a negligible cost
+while maintaining simplicity and versatility. Concretely, CAST calibrates
+confidence by regularizing the classifier's confidence based on local density
+for each class in the labeled training data, resulting in lower confidence for
+pseudo-labels in low-density regions. Extensive empirical evaluations on up to
+21 real-world datasets confirm not only the superior performance of CAST but
+also its robustness in various setups in self-training contexts.
+
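+ One rough, hypothetical instantiation of the density-aware calibration idea
+sketched above (the names and the density proxy are illustrative; the paper's
+formulation differs in its details): pseudo-label confidence is down-weighted
+where the labeled data of the predicted class is sparse. scikit-learn is
+assumed to be available.
+
+    # Down-weight class-c confidence where labeled examples of class c are
+    # sparse, so that pseudo-labels in low-density regions are trusted less.
+    import numpy as np
+    from sklearn.neighbors import NearestNeighbors
+
+    def density_calibrated_confidence(x_unlabeled, proba, x_labeled, y_labeled, k=5):
+        calibrated = proba.copy()
+        for c in np.unique(y_labeled):
+            nn = NearestNeighbors(n_neighbors=k).fit(x_labeled[y_labeled == c])
+            dist, _ = nn.kneighbors(x_unlabeled)
+            density = 1.0 / (1.0 + dist.mean(axis=1))   # crude local-density proxy
+            calibrated[:, c] = proba[:, c] * density
+        return calibrated
+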
+
+ comment: 11 pages for main body, and 10 additional pages for appendix +
+
+
+
+
+ + ♻ ☆ Mitigating Label Noise on Graph via Topological Sample Selection ICML 2024 + + +
+ Despite the success of the carefully-annotated benchmarks, the effectiveness +of existing graph neural networks (GNNs) can be considerably impaired in +practice when the real-world graph data is noisily labeled. Previous +explorations in sample selection have been demonstrated as an effective way for +robust learning with noisy labels, however, the conventional studies focus on +i.i.d data, and when moving to non-iid graph data and GNNs, two notable +challenges remain: (1) nodes located near topological class boundaries are very +informative for classification but cannot be successfully distinguished by the +heuristic sample selection. (2) there is no available measure that considers +the graph topological information to promote sample selection in a graph. To +address this dilemma, we propose a $\textit{Topological Sample Selection}$ +(TSS) method that boosts the informative sample selection process in a graph by +utilising topological information. We theoretically prove that our procedure +minimizes an upper bound of the expected risk under target clean distribution, +and experimentally show the superiority of our method compared with +state-of-the-art baselines. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Satellite Sunroof: High-res Digital Surface Models and Roof Segmentation + for Global Solar Mapping + + +
+ The transition to renewable energy, particularly solar, is key to mitigating +climate change. Google's Solar API aids this transition by estimating solar +potential from aerial imagery, but its impact is constrained by geographical +coverage. This paper proposes expanding the API's reach using satellite +imagery, enabling global solar potential assessment. We tackle challenges +involved in building a Digital Surface Model (DSM) and roof instance +segmentation from lower resolution and single oblique views using deep learning +models. Our models, trained on aligned satellite and aerial datasets, produce +25cm DSMs and roof segments. With ~1m DSM MAE on buildings, ~5deg roof pitch +error and ~56% IOU on roof segmentation, they significantly enhance the Solar +API's potential to promote solar adoption. + +
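+ For reference, the roof-segmentation IOU reported above is the standard
+intersection-over-union between predicted and ground-truth binary masks, e.g.:
+
+    # IoU between two binary roof masks (illustrative helper, not the
+    # evaluation code used for the paper).
+    import numpy as np
+
+    def iou(pred, target):
+        pred, target = pred.astype(bool), target.astype(bool)
+        inter = np.logical_and(pred, target).sum()
+        union = np.logical_or(pred, target).sum()
+        return inter / union if union else 1.0
+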
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ DiffiT: Diffusion Vision Transformers for Image Generation ECCV'24 + + +
+ Diffusion models with their powerful expressivity and high sample quality
+have achieved State-Of-The-Art (SOTA) performance in the generative domain.
+The pioneering Vision Transformer (ViT) has also demonstrated strong modeling
+capabilities and scalability, especially for recognition tasks. In this paper,
+we study the effectiveness of ViTs in diffusion-based generative learning and
+propose a new model denoted as Diffusion Vision Transformers (DiffiT).
+Specifically, we propose a methodology for fine-grained control of the
+denoising process and introduce the Time-dependent Multihead Self Attention
+(TMSA) mechanism. DiffiT is surprisingly effective in generating high-fidelity
+images with significantly better parameter efficiency. We also propose latent
+and image space DiffiT models and show SOTA performance on a variety of
+class-conditional and unconditional synthesis tasks at different resolutions.
+The Latent DiffiT model achieves a new SOTA FID score of 1.73 on the
+ImageNet256 dataset while having 19.85% and 16.88% fewer parameters than other
+Transformer-based diffusion models such as MDT and DiT, respectively. Code:
+https://github.com/NVlabs/DiffiT
+
+
+ comment: Accepted to ECCV'24 +
+
+
+
+
+ + ♻ ☆ Learning from Heterogeneity: A Dynamic Learning Framework for + Hypergraphs + + +
+ Graph neural network (GNN) has gained increasing popularity in recent years +owing to its capability and flexibility in modeling complex graph structure +data. Among all graph learning methods, hypergraph learning is a technique for +exploring the implicit higher-order correlations when training the embedding +space of the graph. In this paper, we propose a hypergraph learning framework +named LFH that is capable of dynamic hyperedge construction and attentive +embedding update utilizing the heterogeneity attributes of the graph. +Specifically, in our framework, the high-quality features are first generated +by the pairwise fusion strategy that utilizes explicit graph structure +information when generating initial node embedding. Afterwards, a hypergraph is +constructed through the dynamic grouping of implicit hyperedges, followed by +the type-specific hypergraph learning process. To evaluate the effectiveness of +our proposed framework, we conduct comprehensive experiments on several popular +datasets with eleven state-of-the-art models on both node classification and +link prediction tasks, which fall into categories of homogeneous pairwise graph +learning, heterogeneous pairwise graph learning, and hypergraph learning. The +experiment results demonstrate a significant performance gain (average 12.5% in +node classification and 13.3% in link prediction) compared with recent +state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Communication Optimization for Distributed Training: Architecture, + Advances, and Opportunities + + +
+ The past few years have witnessed the flourishing of large-scale deep neural +network models with ever-growing parameter numbers. Training such large-scale +models typically requires massive memory and computing resources, necessitating +distributed training. As GPU performance has rapidly evolved in recent years, +computation time has shrunk, making communication a larger portion of the +overall training time. Consequently, optimizing communication for distributed +training has become crucial. In this article, we briefly introduce the general +architecture of distributed deep neural network training and analyze +relationships among Parallelization Strategy, Collective Communication Library, +and Network from the perspective of communication optimization, which forms a +three-layer paradigm. We then review current representative research advances +within this three-layer paradigm. We find that layers in the current +three-layer paradigm are relatively independent and there is a rich design +space for cross-layer collaborative optimization in distributed training +scenarios. Therefore, we advocate "Vertical" and "Horizontal" co-designs which +extend the three-layer paradigm to a five-layer paradigm. We also advocate +"Intra-Inter" and "Host-Net" co-designs to further utilize the potential of +heterogeneous resources. We hope this article can shed some light on future +research on communication optimization for distributed training. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ GenRec: Generative Sequential Recommendation with Large Language Models + + +
+ Sequential recommendation is the task of capturing hidden user preferences
+from historical user-item interaction data and recommending the next items for
+the user. Significant progress has been made in this domain by leveraging
+classification-based learning methods. Inspired by the recent paradigm of
+'pretrain, prompt and predict' in NLP, we consider sequential recommendation
+as a sequence-to-sequence generation task and propose a novel model named
+Generative Recommendation (GenRec). Unlike classification-based models that
+learn explicit user and item representations, GenRec utilizes the sequence
+modeling capability of Transformer and adopts the masked item prediction
+objective to effectively learn the hidden bidirectional sequential patterns.
+Different from existing generative sequential recommendation models, GenRec
+does not rely on manually designed hard prompts. The input to GenRec is a
+textual user-item sequence and the output is the top-ranked next items.
+Moreover, GenRec is lightweight and requires only a few hours to train
+effectively in low-resource settings, making it highly applicable to
+real-world scenarios and helping to democratize large language models in the
+sequential recommendation domain. Our extensive experiments have demonstrated
+that GenRec generalizes on various public real-world datasets and achieves
+state-of-the-art results. Our experiments also validate the effectiveness of
+the proposed masked item prediction objective that improves the model
+performance by a large margin.
+
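+ A toy illustration (not the authors' code) of the masked-item-prediction
+objective mentioned above: random items in a textual user-item sequence are
+replaced by a mask token, and the model is trained to recover them.
+
+    # Replace ~15% of the items with [MASK]; the masked-out items become the
+    # prediction targets during training.
+    import random
+
+    def mask_sequence(items, mask_token="[MASK]", mask_prob=0.15, seed=0):
+        random.seed(seed)
+        masked, targets = [], []
+        for it in items:
+            if random.random() < mask_prob:
+                masked.append(mask_token)
+                targets.append(it)
+            else:
+                masked.append(it)
+        return masked, targets
+
+    print(mask_sequence(["item_12", "item_7", "item_98", "item_3", "item_55"]))
+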
+
+
+
+
+ + ♻ ☆ Enhancing Data-Limited Graph Neural Networks by Actively Distilling + Knowledge from Large Language Models + + +
+ Graphs are pervasive in the real-world, such as social network analysis, +bioinformatics, and knowledge graphs. Graph neural networks (GNNs) have great +ability in node classification, a fundamental task on graphs. Unfortunately, +conventional GNNs still face challenges in scenarios with few labeled nodes, +despite the prevalence of few-shot node classification tasks in real-world +applications. To address this challenge, various approaches have been proposed, +including graph meta-learning, transfer learning, and methods based on Large +Language Models (LLMs). However, traditional meta-learning and transfer +learning methods often require prior knowledge from base classes or fail to +exploit the potential advantages of unlabeled nodes. Meanwhile, LLM-based +methods may overlook the zero-shot capabilities of LLMs and rely heavily on the +quality of generated contexts. In this paper, we propose a novel approach that +integrates LLMs and GNNs, leveraging the zero-shot inference and reasoning +capabilities of LLMs and employing a Graph-LLM-based active learning paradigm +to enhance GNNs' performance. Extensive experiments demonstrate the +effectiveness of our model in improving node classification accuracy with +considerably limited labeled data, surpassing state-of-the-art baselines by +significant margins. + +
+
+ comment: 10 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ VFLIP: A Backdoor Defense for Vertical Federated Learning via + Identification and Purification ESORICS 2024 + + +
+ Vertical Federated Learning (VFL) focuses on handling vertically partitioned +data over FL participants. Recent studies have discovered a significant +vulnerability in VFL to backdoor attacks which specifically target the distinct +characteristics of VFL. Therefore, these attacks may neutralize existing +defense mechanisms designed primarily for Horizontal Federated Learning (HFL) +and deep neural networks. In this paper, we present the first backdoor defense, +called VFLIP, specialized for VFL. VFLIP employs the identification and +purification techniques that operate at the inference stage, consequently +improving the robustness against backdoor attacks to a great extent. VFLIP +first identifies backdoor-triggered embeddings by adopting a participant-wise +anomaly detection approach. Subsequently, VFLIP conducts purification which +removes the embeddings identified as malicious and reconstructs all the +embeddings based on the remaining embeddings. We conduct extensive experiments +on CIFAR10, CINIC10, Imagenette, NUS-WIDE, and BankMarketing to demonstrate +that VFLIP can effectively mitigate backdoor attacks in VFL. +https://github.com/blingcho/VFLIP-esorics24 + +
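+ A much-simplified, hypothetical sketch of the identify-then-purify idea
+described above (the actual VFLIP mechanisms are more sophisticated; names and
+thresholds here are illustrative): embeddings that look anomalous for a
+participant are replaced by a reconstruction from the remaining participants'
+embeddings, here just their mean.
+
+    # embeddings: (num_participants, dim) array for one inference request;
+    # history_mean / history_std summarize each participant's past embeddings.
+    import numpy as np
+
+    def purify(embeddings, history_mean, history_std, z_thresh=3.0):
+        z = np.abs(embeddings - history_mean) / (history_std + 1e-8)
+        suspicious = z.mean(axis=1) > z_thresh      # participant-wise anomaly score
+        if suspicious.any() and not suspicious.all():
+            clean_mean = embeddings[~suspicious].mean(axis=0)
+            embeddings = embeddings.copy()
+            embeddings[suspicious] = clean_mean     # crude "reconstruction"
+        return embeddings
+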
+
+ comment: Accepted by 29th European Symposium on Research in Computer Security + (ESORICS 2024) +
+
+
+
+
+ + ♻ ☆ CityLight: A Universal Model for Coordinated Traffic Signal Control in + City-scale Heterogeneous Intersections + + +
+ The increasingly severe congestion problem in modern cities strengthens the
+significance of developing city-scale traffic signal control (TSC) methods for
+traffic efficiency enhancement. While reinforcement learning has been widely
+explored in TSC, most existing methods still target small-scale optimization
+and cannot directly scale to the city level due to unbearable resource demand.
+Only a few of them manage to tackle city-level optimization, namely a
+thousand-scale optimization, by incorporating parameter-sharing mechanisms,
+but hardly have they fully tackled the heterogeneity of intersections and
+intricate between-intersection interactions inherent in real-world city road
+networks. To fill this gap, we target two important challenges in adopting
+parameter-sharing paradigms to solve TSC: inconsistency of inner state
+representations for intersections heterogeneous in configuration, scale, and
+orders of available traffic phases; and intricacy of impacts from neighborhood
+intersections that have various relative traffic relationships due to
+inconsistent phase orders and diverse relative positioning. Our method,
+CityLight, features a universal representation module that not only aligns the
+state representations of intersections by reindexing their phases based on
+their semantics and designing heterogeneity-preserving observations, but also
+encodes the narrowed relative traffic relation types to project the
+neighborhood intersections onto a uniform relative traffic impact space. We
+further attentively fuse neighborhood representations based on their competing
+relations and incorporate neighborhood-integrated rewards to boost
+coordination. Extensive experiments with hundreds to tens of thousands of
+intersections validate the surprising effectiveness and generalizability of
+CityLight, with an overall throughput gain of 11.68% and a 22.59% improvement
+in transfer scenarios.
+
+
+
+
+
+ + ♻ ☆ Non-Stationary Bandit Learning via Predictive Sampling + + +
+ Thompson sampling has proven effective across a wide range of stationary +bandit environments. However, as we demonstrate in this paper, it can perform +poorly when applied to non-stationary environments. We attribute such failures +to the fact that, when exploring, the algorithm does not differentiate actions +based on how quickly the information acquired loses its usefulness due to +non-stationarity. Building upon this insight, we propose predictive sampling, +an algorithm that deprioritizes acquiring information that quickly loses +usefulness. A theoretical guarantee on the performance of predictive sampling +is established through a Bayesian regret bound. We provide versions of +predictive sampling for which computations tractably scale to complex bandit +environments of practical interest. Through numerical simulations, we +demonstrate that predictive sampling outperforms Thompson sampling in all +non-stationary environments examined. + +
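+ The paper's predictive-sampling algorithm is not reproduced here, but the
+underlying intuition, namely that information should lose weight as the
+environment drifts, can be illustrated with a simple discounted Beta-Bernoulli
+Thompson sampler (a generic sketch, not the proposed method):
+
+    # Discounting old counts by gamma < 1 deprioritizes stale information,
+    # which plain Thompson sampling fails to do in non-stationary settings.
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    alpha, beta, gamma = np.ones(3), np.ones(3), 0.98
+
+    def step(true_means):
+        global alpha, beta
+        arm = int(np.argmax(rng.beta(alpha, beta)))   # posterior sample per arm
+        reward = float(rng.random() < true_means[arm])
+        alpha, beta = gamma * alpha, gamma * beta     # decay old evidence
+        alpha[arm] += reward
+        beta[arm] += 1.0 - reward
+        return arm, reward
+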
+
+
+
+
+ + ♻ ☆ A Normalized Bottleneck Distance on Persistence Diagrams and Homology + Preservation under Dimension Reduction + + +
+ Persistence diagrams (PDs) are used as signatures of point cloud data. Two +clouds of points can be compared using the bottleneck distance d_B between +their PDs. A potential drawback of this pipeline is that point clouds sampled +from topologically similar manifolds can have arbitrarily large d_B when there +is a large scaling between them. This situation is typical in dimension +reduction frameworks. + We define, and study properties of, a new scale-invariant distance between +PDs termed normalized bottleneck distance, d_N. In defining d_N, we develop a +broader framework called metric decomposition for comparing finite metric +spaces of equal cardinality with a bijection. We utilize metric decomposition +to prove a stability result for d_N by deriving an explicit bound on the +distortion of the bijective map. We then study two popular dimension reduction +techniques, Johnson-Lindenstrauss (JL) projections and metric multidimensional +scaling (mMDS), and a third class of general biLipschitz mappings. We provide +new bounds on how well these dimension reduction techniques preserve homology +with respect to d_N. For a JL map f that transforms input X to f(X), we show +that d_N(dgm(X),dgm(f(X))) < e, where dgm(X) is the Vietoris-Rips PD of X, and +pairwise distances are preserved by f up to the tolerance 0 < \epsilon < 1. For +mMDS, we present new bounds for d_B and d_N between PDs of X and its projection +in terms of the eigenvalues of the covariance matrix. And for k-biLipschitz +maps, we show that d_N is bounded by the product of (k^2-1)/k and the ratio of +diameters of X and f(X). Finally, we use computational experiments to +demonstrate the increased effectiveness of using the normalized bottleneck +distance for clustering sets of point clouds sampled from different shapes. + +
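+ A small sketch of the scale sensitivity that motivates a normalized distance,
+assuming the ripser and persim packages; dividing diagrams by the point-cloud
+diameter is only a crude stand-in for the paper's d_N, which is built on metric
+decompositions:
+
+ import numpy as np
+ from ripser import ripser
+ from persim import bottleneck
+ from scipy.spatial.distance import pdist
+
+ rng = np.random.default_rng(1)
+ X = rng.normal(size=(100, 3))
+ Y = 10.0 * X                                   # same shape, large rescaling
+
+ dgm_X = ripser(X)['dgms'][1]                   # H1 persistence diagrams
+ dgm_Y = ripser(Y)['dgms'][1]
+ print("raw d_B:", bottleneck(dgm_X, dgm_Y))    # inflated by the scaling alone
+ print("diameter-normalized d_B:",
+       bottleneck(dgm_X / pdist(X).max(), dgm_Y / pdist(Y).max()))
+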
+
+ comment: Added computational experiments; published in La Matematica +
+
+
+
+
+ + ♻ ☆ LlamaDuo: LLMOps Pipeline for Seamless Migration from Service LLMs to + Small-Scale Local LLMs + + +
+ The widespread adoption of cloud-based proprietary large language models +(LLMs) has introduced significant challenges, including operational +dependencies, privacy concerns, and the necessity of continuous internet +connectivity. In this work, we introduce an LLMOps pipeline, "LlamaDuo", for +the seamless migration of knowledge and abilities from service-oriented LLMs to +smaller, locally manageable models. This pipeline is crucial for ensuring +service continuity in the presence of operational failures, strict privacy +policies, or offline requirements. Our LlamaDuo involves fine-tuning a small +language model against the service LLM using a synthetic dataset generated by +the latter. If the performance of the fine-tuned model falls short of +expectations, it is enhanced by further fine-tuning with additional similar +data created by the service LLM. This iterative process guarantees that the +smaller model can eventually match or even surpass the service LLM's +capabilities in specific downstream tasks, offering a practical and scalable +solution for managing AI deployments in constrained environments. Extensive +experiments with leading edge LLMs are conducted to demonstrate the +effectiveness, adaptability, and affordability of LlamaDuo across various +downstream tasks. Our pipeline implementation is available at +https://github.com/deep-diver/llamaduo. + +
+
+ comment: 28 pages, 18 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Scalable Variational Causal Discovery Unconstrained by Acyclicity ECAI 2024 + + +
+ Bayesian causal discovery offers the power to quantify epistemic +uncertainties among a broad range of structurally diverse causal theories +potentially explaining the data, represented in forms of directed acyclic +graphs (DAGs). However, existing methods struggle with efficient DAG sampling +due to the complex acyclicity constraint. In this study, we propose a scalable +Bayesian approach to effectively learn the posterior distribution over causal +graphs given observational data thanks to the ability to generate DAGs without +explicitly enforcing acyclicity. Specifically, we introduce a novel +differentiable DAG sampling method that can generate a valid acyclic causal +graph by mapping an unconstrained distribution of implicit topological orders +to a distribution over DAGs. Given this efficient DAG sampling scheme, we are +able to model the posterior distribution over causal graphs using a simple +variational distribution over a continuous domain, which can be learned via the +variational inference framework. Extensive empirical experiments on both +simulated and real datasets demonstrate the superior performance of the +proposed model compared to several state-of-the-art baselines. + +
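+ The order-based sampler can be sketched directly: draw an unconstrained score per
+node, read off a topological order, and keep only order-respecting edges, so every
+sample is acyclic by construction. The paper makes this differentiable inside
+variational inference; the NumPy version below is the plain, non-differentiable
+analogue:
+
+ import numpy as np
+
+ def sample_dag(edge_probs, rng):
+     """edge_probs: (d, d) matrix of Bernoulli edge probabilities."""
+     d = edge_probs.shape[0]
+     scores = rng.gumbel(size=d)                 # implicit topological order
+     order = np.argsort(scores)
+     rank = np.empty(d, dtype=int)
+     rank[order] = np.arange(d)
+     allowed = rank[:, None] < rank[None, :]     # edge i -> j only if i precedes j
+     edges = rng.random((d, d)) < edge_probs
+     return (edges & allowed).astype(int)        # adjacency matrix of a DAG
+
+ rng = np.random.default_rng(0)
+ A = sample_dag(np.full((4, 4), 0.5), rng)
+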
+
+ comment: Accepted at ECAI 2024 +
+
+
+
+
+ + ♻ ☆ Enabling Causal Discovery in Post-Nonlinear Models with Normalizing + Flows ECAI 2024 + + +
+ Post-nonlinear (PNL) causal models stand out as a versatile and adaptable +framework for modeling intricate causal relationships. However, accurately +capturing the invertibility constraint required in PNL models remains +challenging in existing studies. To address this problem, we introduce CAF-PoNo +(Causal discovery via Normalizing Flows for Post-Nonlinear models), harnessing +the power of the normalizing flows architecture to enforce the crucial +invertibility constraint in PNL models. Through normalizing flows, our method +precisely reconstructs the hidden noise, which plays a vital role in +cause-effect identification through statistical independence testing. +Furthermore, the proposed approach exhibits remarkable extensibility, as it can +be seamlessly expanded to facilitate multivariate causal discovery via causal +order identification, empowering us to efficiently unravel complex causal +relationships. Extensive experimental evaluations on both simulated and real +datasets consistently demonstrate that the proposed method outperforms several +state-of-the-art approaches in both bivariate and multivariate causal discovery +tasks. + +
+
+ comment: Accepted at ECAI 2024
+
+
+
+
+
+ + ♻ ☆ How to avoid machine learning pitfalls: a guide for academic researchers + + +
+ Mistakes in machine learning practice are commonplace, and can result in a +loss of confidence in the findings and products of machine learning. This guide +outlines common mistakes that occur when using machine learning, and what can +be done to avoid them. Whilst it should be accessible to anyone with a basic +understanding of machine learning techniques, it focuses on issues that are of +particular concern within academic research, such as the need to do rigorous +comparisons and reach valid conclusions. It covers five stages of the machine +learning process: what to do before model building, how to reliably build +models, how to robustly evaluate models, how to compare models fairly, and how +to report results. + +
+
+
+
+
+ + ♻ ☆ Estimating Direct and Indirect Causal Effects of Spatiotemporal + Interventions in Presence of Spatial Interference + + +
+ Spatial interference (SI) occurs when the treatment at one location affects +the outcomes at other locations. Accounting for spatial interference in +spatiotemporal settings poses further challenges as interference violates the +stable unit treatment value assumption, making it infeasible for standard +causal inference methods to quantify the effects of time-varying treatment at +spatially varying outcomes. In this paper, we first formalize the concept of +spatial interference in case of time-varying treatment assignments by extending +the potential outcome framework under the assumption of no unmeasured +confounding. We then propose our deep learning based potential outcome model +for spatiotemporal causal inference. We utilize latent factor modeling to +reduce the bias due to time-varying confounding while leveraging the power of +U-Net architecture to capture global and local spatial interference in data +over time. Our causal estimators are an extension of average treatment effect +(ATE) for estimating direct (DATE) and indirect effects (IATE) of spatial +interference on treated and untreated data. Being the first of its kind deep +learning based spatiotemporal causal inference technique, our approach shows +advantages over several baseline methods based on the experiment results on two +synthetic datasets, with and without spatial interference. Our results on +real-world climate dataset also align with domain knowledge, further +demonstrating the effectiveness of our proposed method. + +
+
+
+
+
+ + ♻ ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
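+ For context, the sketch below shows the standard DPO loss that contrastive
+alignment objectives of this kind start from; it is not the APO objective itself,
+whose anchoring terms are defined in the paper:
+
+ import torch
+ import torch.nn.functional as F
+
+ def dpo_loss(policy_chosen_logp, policy_rejected_logp,
+              ref_chosen_logp, ref_rejected_logp, beta=0.1):
+     """All inputs are per-example summed log-probs of full responses."""
+     chosen_ratio = policy_chosen_logp - ref_chosen_logp
+     rejected_ratio = policy_rejected_logp - ref_rejected_logp
+     return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()
+
+ loss = dpo_loss(torch.tensor([-5.0]), torch.tensor([-9.0]),
+                 torch.tensor([-6.0]), torch.tensor([-8.0]))
+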
+
+
+
+
+ + ♻ ☆ Joint Optimization of Piecewise Linear Ensembles SP 2024 + + +
+ Tree ensembles achieve state-of-the-art performance on numerous prediction +tasks. We propose $\textbf{J}$oint $\textbf{O}$ptimization of +$\textbf{P}$iecewise $\textbf{L}$inear $\textbf{En}$sembles (JOPLEn), which +jointly fits piecewise linear models at all leaf nodes of an existing tree +ensemble. In addition to enhancing the ensemble expressiveness, JOPLEn allows +several common penalties, including sparsity-promoting and subspace-norms, to +be applied to nonlinear prediction. For example, JOPLEn with a nuclear norm +penalty learns subspace-aligned functions. Additionally, JOPLEn (combined with +a Dirty LASSO penalty) is an effective feature selection method for nonlinear +prediction in multitask learning. Finally, we demonstrate the performance of +JOPLEn on 153 regression and classification datasets and with a variety of +penalties. JOPLEn leads to improved prediction performance relative to not only +standard random forest and boosted tree ensembles, but also other methods for +enhancing tree ensembles. + +
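+ The core construction can be sketched with scikit-learn: gate local affine features
+by leaf membership of a fitted forest and solve one joint regression over all leaves.
+Plain ridge regression stands in for the sparsity and subspace penalties studied in
+the paper:
+
+ import numpy as np
+ from sklearn.datasets import make_regression
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.linear_model import Ridge
+
+ X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=0)
+ forest = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=0).fit(X, y)
+
+ def leaf_linear_features(forest, X):
+     leaves = forest.apply(X)                              # (n_samples, n_trees) leaf ids
+     blocks = []
+     for t in range(leaves.shape[1]):
+         for leaf_id in np.unique(leaves[:, t]):           # leaf ids must be fixed from training data
+             gate = (leaves[:, t] == leaf_id)[:, None]     # leaf membership indicator
+             blocks.append(gate * np.hstack([X, np.ones((len(X), 1))]))  # local affine terms
+     return np.hstack(blocks)
+
+ Z = leaf_linear_features(forest, X)
+ joint_model = Ridge(alpha=1.0).fit(Z, y)                  # one joint fit over all leaves
+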
+
+ comment: 7 pages, 4 figures, accepted to IEEE MLSP 2024 While preparing the + code release, we found minor bugs in the penalty gradient computation and the + validation set preprocessing. Fixing these bugs provides the updated results + shown in Figure 1 and Section 3.1. The conclusions of the paper remain the + same +
+
+
+
+
+ + ♻ ☆ DAISY: Data Adaptive Self-Supervised Early Exit for Speech + Representation Models + + +
+ Self-supervised speech models have shown to be useful for various tasks, but +their large size limits the use in devices with low computing power and memory. +In this work, we explore early exit, an approach for reducing latency by +exiting the forward process of a network early. Most approaches of early exit +need a separate early exit model for each task, with some even requiring +fine-tuning of the entire pretrained model. We introduce Data Adaptive +Self-Supervised Early Exit (DAISY), an approach that decides when to exit based +on the self-supervised loss, eliminating the need for multiple round of +training and fine-tuning. DAISY matches the performance of HuBERT on the +MiniSUPERB benchmark, but with much faster inference times. Our analysis on the +adaptivity of DAISY shows that the model exits early (using fewer layers) on +clean data while exits late (using more layers) on noisy data, dynamically +adjusting the computational cost of inference based on the noise level of each +sample. + +
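+ A toy sketch of loss-driven early exit in this spirit: run layers one at a time and
+stop once a per-layer loss estimate drops below a threshold. The loss heads below are
+random stand-ins, not HuBERT-style self-supervised objectives:
+
+ import torch
+ import torch.nn as nn
+
+ torch.manual_seed(0)
+ layers = nn.ModuleList([nn.Linear(64, 64) for _ in range(12)])
+ loss_heads = nn.ModuleList([nn.Linear(64, 1) for _ in range(12)])  # stand-in loss estimators
+
+ def forward_with_early_exit(x, threshold=0.5):
+     h = x
+     for i, (layer, head) in enumerate(zip(layers, loss_heads)):
+         h = torch.relu(layer(h))
+         est_loss = head(h).mean().abs()        # proxy for a self-supervised loss
+         if est_loss < threshold:               # clean input -> exit early, noisy -> go deeper
+             return h, i + 1
+     return h, len(layers)
+
+ out, n_layers_used = forward_with_early_exit(torch.randn(1, 100, 64))
+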
+
+ comment: Accepted by Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Attribute Graphs Underlying Molecular Generative Models: Path to + Learning with Limited Data + + +
+ Training generative models that capture rich semantics of the data and +interpreting the latent representations encoded by such models are very +important problems in un-/self-supervised learning. In this work, we provide a +simple algorithm that relies on perturbation experiments on latent codes of a +pre-trained generative autoencoder to uncover an attribute graph that is +implied by the generative model. We perform perturbation experiments to check +for influence of a given latent variable on a subset of attributes. Given this, +we show that one can fit an effective graphical model that models a structural +equation model between latent codes taken as exogenous variables and attributes +taken as observed variables. One interesting aspect is that a single latent +variable controls multiple overlapping subsets of attributes unlike +conventional approaches that try to impose full independence. Using a +pre-trained generative autoencoder trained on a large dataset of small +molecules, we demonstrate that the graphical model between various molecular +attributes and latent codes learned by our algorithm can be used to predict a +specific property for molecules which are drawn from a different distribution. +We compare prediction models trained on various feature subsets chosen by +simple baselines, as well as existing causal discovery and sparse +learning/feature selection methods, with the ones in the derived Markov blanket +from our method. Results show empirically that the predictor that relies on our +Markov blanket attributes is robust to distribution shifts when transferred or +fine-tuned with a few samples from the new distribution, especially when +training data is limited. + +
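+ The perturbation probe can be sketched in a few lines: nudge one latent coordinate
+at a time, re-decode, and record which attributes flip, giving a latent-to-attribute
+bipartite graph. The decoder and attribute extractor below are toy stand-ins:
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ W = rng.normal(size=(8, 20))                     # toy "decoder": latent (8) -> features (20)
+ def decode(z): return z @ W
+ def attributes(x): return x[:, :5] > 0.0         # 5 toy binary attributes
+
+ def attribute_graph(z_batch, delta=1.0, flip_rate=0.1):
+     base = attributes(decode(z_batch))
+     edges = np.zeros((z_batch.shape[1], base.shape[1]), dtype=bool)
+     for j in range(z_batch.shape[1]):            # perturb each latent dimension in turn
+         z_pert = z_batch.copy()
+         z_pert[:, j] += delta
+         flipped = attributes(decode(z_pert)) != base
+         edges[j] = flipped.mean(axis=0) > flip_rate   # latent j influences attribute k
+     return edges
+
+ G = attribute_graph(rng.normal(size=(256, 8)))
+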
+
+ comment: New experiments; reframed contributions +
+
+
+
+
+ + ♻ ☆ MelHuBERT: A simplified HuBERT on Mel spectrograms + + +
+ Self-supervised models have had great success in learning speech +representations that can generalize to various downstream tasks. However, most +self-supervised models require a large amount of compute and multiple GPUs to +train, significantly hampering the development of self-supervised learning. In +an attempt to reduce the computation of training, we revisit the training of +HuBERT, a highly successful self-supervised model. We improve and simplify +several key components, including the loss function, input representation, and +training in multiple stages. Our model, MelHuBERT, is able to achieve favorable +performance on phone recognition, speaker identification, and automatic speech +recognition against HuBERT, while saving 31.2% of the pre-training time, or +equivalently 33.5% MACs per one second speech. The code and pre-trained models +are available in https://github.com/nervjack2/MelHuBERT. + +
+
+ comment: ASRU 2023 +
+
+
+
+
+ + ♻ ☆ Are Small Language Models Ready to Compete with Large Language Models + for Practical Applications? + + +
+ The rapid rise of Language Models (LMs) has expanded their use in several +applications. Yet, due to constraints of model size, associated cost, or +proprietary restrictions, utilizing state-of-the-art (SOTA) LLMs is not always +feasible. With open, smaller LMs emerging, more applications can leverage their +capabilities, but selecting the right LM can be challenging as smaller LMs +don't perform well universally. This work tries to bridge this gap by proposing +a framework to experimentally evaluate small, open LMs in practical settings +through measuring semantic correctness of outputs across three practical +aspects: task types, application domains and reasoning types, using diverse +prompt styles. It also conducts an in-depth comparison of 10 small, open LMs to +identify best LM and prompt style depending on specific application requirement +using the proposed framework. We also show that if selected appropriately, they +can outperform SOTA LLMs like DeepSeek-v2, GPT-4o-mini, Gemini-1.5-Pro, and +even compete with GPT-4o. + +
+
+ comment: Submitted to ARR +
+
+
+
+
+ + ♻ ☆ BiomedBench: A benchmark suite of TinyML biomedical applications for + low-power wearables + + +
+ The design of low-power wearables for the biomedical domain has received a +lot of attention in recent decades, as technological advances in chip +manufacturing have allowed real-time monitoring of patients using +low-complexity ML within the mW range. Despite advances in application and +hardware design research, the domain lacks a systematic approach to hardware +evaluation. In this work, we propose BiomedBench, a new benchmark suite +composed of complete end-to-end TinyML biomedical applications for real-time +monitoring of patients using wearable devices. Each application presents +different requirements during typical signal acquisition and processing phases, +including varying computational workloads and relations between active and idle +times. Furthermore, our evaluation of five state-of-the-art low-power platforms +in terms of energy efficiency shows that modern platforms cannot effectively +target all types of biomedical applications. BiomedBench is released as an +open-source suite to standardize hardware evaluation and guide hardware and +application design in the TinyML wearable domain. + +
+
+ comment: 7 pages, 5 figures. Submitted to Design & Test Special Issue TinyML
+
+
+
+
+
+ + ♻ ☆ Loop Copilot: Conducting AI Ensembles for Music Generation and Iterative + Editing + + +
+ Creating music is iterative, requiring varied methods at each stage. However, +existing AI music systems fall short in orchestrating multiple subsystems for +diverse needs. To address this gap, we introduce Loop Copilot, a novel system +that enables users to generate and iteratively refine music through an +interactive, multi-round dialogue interface. The system uses a large language +model to interpret user intentions and select appropriate AI models for task +execution. Each backend model is specialized for a specific task, and their +outputs are aggregated to meet the user's requirements. To ensure musical +coherence, essential attributes are maintained in a centralized table. We +evaluate the effectiveness of the proposed system through semi-structured +interviews and questionnaires, highlighting its utility not only in +facilitating music creation but also its potential for broader applications. + +
+
+ comment: Source code and demo video are available at
+ https://sites.google.com/view/loop-copilot
+
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ MultiMediate'24: Multi-Domain Engagement Estimation + + +
+ Estimating the momentary level of participant's engagement is an important +prerequisite for assistive systems that support human interactions. Previous +work has addressed this task in within-domain evaluation scenarios, i.e. +training and testing on the same dataset. This is in contrast to real-life +scenarios where domain shifts between training and testing data frequently +occur. With MultiMediate'24, we present the first challenge addressing +multi-domain engagement estimation. As training data, we utilise the NOXI +database of dyadic novice-expert interactions. In addition to within-domain +test data, we add two new test domains. First, we introduce recordings +following the NOXI protocol but covering languages that are not present in the +NOXI training data. Second, we collected novel engagement annotations on the +MPIIGroupInteraction dataset which consists of group discussions between three +to four people. In this way, MultiMediate'24 evaluates the ability of +approaches to generalise across factors such as language and cultural +background, group size, task, and screen-mediated vs. face-to-face interaction. +This paper describes the MultiMediate'24 challenge and presents baseline +results. In addition, we discuss selected challenge solutions. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2308.08256 +
+
+
+
+
+ + ☆ Human-Inspired Audio-Visual Speech Recognition: Spike Activity, Cueing + Interaction and Causal Processing + + +
+ Humans naturally perform audiovisual speech recognition (AVSR), enhancing the +accuracy and robustness by integrating auditory and visual information. Spiking +neural networks (SNNs), which mimic the brain's information-processing +mechanisms, are well-suited for emulating the human capability of AVSR. Despite +their potential, research on SNNs for AVSR is scarce, with most existing +audio-visual multimodal methods focused on object or digit recognition. These +models simply integrate features from both modalities, neglecting their unique +characteristics and interactions. Additionally, they often rely on future +information for current processing, which increases recognition latency and +limits real-time applicability. Inspired by human speech perception, this paper +proposes a novel human-inspired SNN named HI-AVSNN for AVSR, incorporating +three key characteristics: cueing interaction, causal processing and spike +activity. For cueing interaction, we propose a visual-cued auditory attention +module (VCA2M) that leverages visual cues to guide attention to auditory +features. We achieve causal processing by aligning the SNN's temporal dimension +with that of visual and auditory features and applying temporal masking to +utilize only past and current information. To implement spike activity, in +addition to using SNNs, we leverage the event camera to capture lip movement as +spikes, mimicking the human retina and providing efficient visual data. We +evaluate HI-AVSNN on an audiovisual speech recognition dataset combining the +DVS-Lip dataset with its corresponding audio samples. Experimental results +demonstrate the superiority of our proposed fusion method, outperforming +existing audio-visual SNN fusion methods and achieving a 2.27% improvement in +accuracy over the only existing SNN-based AVSR method. + +
+
+
+
+
+ + ☆ WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio + Language Modeling + + +
+ Language models have been effectively applied to modeling natural signals, +such as images, video, speech, and audio. A crucial component of these models +is the codec tokenizer, which compresses high-dimensional natural signals into +lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer, +which offers several advantages over previous SOTA acoustic codec models in the +audio domain: 1)extreme compression. By compressing the layers of quantizers +and the temporal dimension of the discrete codec, one-second audio of 24kHz +sampling rate requires only a single quantizer with 40 or 75 tokens. 2)improved +subjective quality. Despite the reduced number of tokens, WavTokenizer achieves +state-of-the-art reconstruction quality with outstanding UTMOS scores and +inherently contains richer semantic information. Specifically, we achieve these +results by designing a broader VQ space, extended contextual windows, and +improved attention networks, as well as introducing a powerful multi-scale +discriminator and an inverse Fourier transform structure. We conducted +extensive reconstruction experiments in the domains of speech, audio, and +music. WavTokenizer exhibited strong performance across various objective and +subjective metrics compared to state-of-the-art models. We also tested semantic +information, VQ utilization, and adaptability to generative models. +Comprehensive ablation studies confirm the necessity of each module in +WavTokenizer. The related code, demos, and pre-trained models are available at +https://github.com/jishengpeng/WavTokenizer. + +
+
+ comment: Working in progress. arXiv admin note: text overlap with + arXiv:2402.12208 +
+
+
+
+
+ + ☆ MSLIQA: Enhancing Learning Representations for Image Quality Assessment + through Multi-Scale Learning + + +
+ No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due +to the diversity of distortions and the lack of large annotated datasets. Many +studies have attempted to tackle these challenges by developing more accurate +NR-IQA models, often employing complex and computationally expensive networks, +or by bridging the domain gap between various distortions to enhance +performance on test datasets. In our work, we improve the performance of a +generic lightweight NR-IQA model by introducing a novel augmentation strategy +that boosts its performance by almost 28\%. This augmentation strategy enables +the network to better discriminate between different distortions in various +parts of the image by zooming in and out. Additionally, the inclusion of +test-time augmentation further enhances performance, making our lightweight +network's results comparable to the current state-of-the-art models, simply +through the use of augmentations. + +
+
+
+
+
+ + ☆ See or Guess: Counterfactually Regularized Image Captioning ACM MM 2024 + + +
+ Image captioning, which generates natural language descriptions of the visual +information in an image, is a crucial task in vision-language research. +Previous models have typically addressed this task by aligning the generative +capabilities of machines with human intelligence through statistical fitting of +existing datasets. While effective for normal images, they may struggle to +accurately describe those where certain parts of the image are obscured or +edited, unlike humans who excel in such cases. These weaknesses they exhibit, +including hallucinations and limited interpretability, often hinder performance +in scenarios with shifted association patterns. In this paper, we present a +generic image captioning framework that employs causal inference to make +existing models more capable of interventional tasks, and counterfactually +explainable. Our approach includes two variants leveraging either total effect +or natural direct effect. Integrating them into the training process enables +models to handle counterfactual scenarios, increasing their generalizability. +Extensive experiments on various datasets show that our method effectively +reduces hallucinations and improves the model's faithfulness to images, +demonstrating high portability across both small-scale and large-scale +image-to-text models. The code is available at +https://github.com/Aman-4-Real/See-or-Guess. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 71 + +
+
+
+ + ☆ CoGen: Learning from Feedback with Coupled Comprehension and Generation + + +
+ Systems with both language comprehension and generation capabilities can +benefit from the tight connection between the two. This work studies coupling +comprehension and generation with focus on continually learning from +interaction with users. We propose techniques to tightly integrate the two +capabilities for both learning and inference. We situate our studies in +two-player reference games, and deploy various models for thousands of +interactions with human users, while learning from interaction feedback +signals. We show dramatic improvements in performance over time, with +comprehension-generation coupling leading to performance improvements up to 26% +in absolute terms and up to 17% higher accuracies compared to a non-coupled +system. Our analysis also shows coupling has substantial qualitative impact on +the system's language, making it significantly more human-like. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ BattleAgentBench: A Benchmark for Evaluating Cooperation and Competition + Capabilities of Language Models in Multi-Agent Systems + + +
+ Large Language Models (LLMs) are becoming increasingly powerful and capable +of handling complex tasks, e.g., building single agents and multi-agent +systems. Compared to single agents, multi-agent systems have higher +requirements for the collaboration capabilities of language models. Many +benchmarks are proposed to evaluate their collaborative abilities. However, +these benchmarks lack fine-grained evaluations of LLM collaborative +capabilities. Additionally, multi-agent collaborative and competitive scenarios +are ignored in existing works. To address these two problems, we propose a +benchmark, called BattleAgentBench, which defines seven sub-stages of three +varying difficulty levels and conducts a fine-grained evaluation of language +models in terms of single-agent scenario navigation capabilities, paired-agent +task execution abilities, and multi-agent collaboration and competition +capabilities. We conducted extensive evaluations on leading four closed-source +and seven open-source models. Experimental results indicate that API-based +models perform excellently on simple tasks but open-source small models +struggle with simple tasks. Regarding difficult tasks that require +collaborative and competitive abilities, although API-based models have +demonstrated some collaborative capabilities, there is still enormous room for +improvement. + +
+
+
+
+
+ + ☆ More Text, Less Point: Towards 3D Data-Efficient Point-Language + Understanding + + +
+ Enabling Large Language Models (LLMs) to comprehend the 3D physical world +remains a significant challenge. Due to the lack of large-scale 3D-text pair +datasets, the success of LLMs has yet to be replicated in 3D understanding. In +this paper, we rethink this issue and propose a new task: 3D Data-Efficient +Point-Language Understanding. The goal is to enable LLMs to achieve robust 3D +object understanding with minimal 3D point cloud and text data pairs. To +address this task, we introduce GreenPLM, which leverages more text data to +compensate for the lack of 3D data. First, inspired by using CLIP to align +images and text, we utilize a pre-trained point cloud-text encoder to map the +3D point cloud space to the text space. This mapping leaves us to seamlessly +connect the text space with LLMs. Once the point-text-LLM connection is +established, we further enhance text-LLM alignment by expanding the +intermediate text space, thereby reducing the reliance on 3D point cloud data. +Specifically, we generate 6M free-text descriptions of 3D objects, and design a +three-stage training strategy to help LLMs better explore the intrinsic +connections between different modalities. To achieve efficient modality +alignment, we design a zero-parameter cross-attention module for token pooling. +Extensive experimental results show that GreenPLM requires only 12% of the 3D +training data used by existing state-of-the-art models to achieve superior 3D +understanding. Remarkably, GreenPLM also achieves competitive performance using +text-only data. The code and weights are available at: +https://github.com/TangYuan96/GreenPLM. + +
+
+
+
+
+ + ☆ Leveraging Open Knowledge for Advancing Task Expertise in Large Language + Models + + +
+ The cultivation of expertise for large language models (LLMs) to solve tasks +of specific areas often requires special-purpose tuning with calibrated +behaviors on the expected stable outputs. To avoid huge cost brought by manual +preparation of instruction datasets and training resources up to hundreds of +hours, the exploitation of open knowledge including a wealth of low rank +adaptation (LoRA) models and instruction datasets serves as a good starting +point. However, existing methods on model and data selection focus on the +performance of general-purpose capabilities while neglecting the knowledge gap +exposed in domain-specific deployment. In the present study, we propose to +bridge such gap by introducing few human-annotated samples (i.e., K-shot) for +advancing task expertise of LLMs with open knowledge. Specifically, we develop +an efficient and scalable pipeline to cost-efficiently produce task experts +where K-shot data intervene in selecting the most promising expert candidates +and the task-relevant instructions. A mixture-of-expert (MoE) system is built +to make the best use of individual-yet-complementary knowledge between multiple +experts. We unveil the two keys to the success of a MoE system, 1) the abidance +by K-shot, and 2) the insistence on diversity. For the former, we ensure that +models that truly possess problem-solving abilities on K-shot are selected +rather than those blind guessers. Besides, during data selection, instructions +that share task-relevant contexts with K-shot are prioritized. For the latter, +we highlight the diversity of constituting experts and that of the fine-tuning +instructions throughout the model and data selection process. Extensive +experimental results confirm the superiority of our approach over existing +methods on utilization of open knowledge across various tasks. Codes and models +will be released later. + +
+
+ comment: 28 pages, 12 tables, 10 figures +
+
+
+
+
+ + ☆ LLM-Based Multi-Hop Question Answering with Knowledge Graph Integration + in Evolving Environments + + +
+ The rapid obsolescence of information in Large Language Models (LLMs) has
+driven the development of various techniques to incorporate new facts. However,
+existing methods for knowledge editing still face difficulties with multi-hop
+questions that require accurate fact identification and sequential logical
+reasoning, particularly among numerous fact updates. To tackle these
+challenges, this paper introduces Graph Memory-based Editing for Large Language
+Models (GMeLLo), a straightforward and effective method that merges the explicit
+knowledge representation of Knowledge Graphs (KGs) with the linguistic
+flexibility of LLMs. Beyond merely leveraging LLMs for question answering,
+GMeLLo employs these models to convert free-form language into structured
+queries and fact triples, facilitating seamless interaction with KGs for rapid
+updates and precise multi-hop reasoning. Our results show that GMeLLo
+significantly surpasses current state-of-the-art knowledge editing methods in
+the multi-hop question answering benchmark, MQuAKE, especially in scenarios
+with extensive knowledge edits.
+
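+ A minimal sketch of the knowledge-graph side of this pipeline: edits arrive as fact
+triples (extracted by an LLM in the paper, hard-coded here) and multi-hop questions
+become chains of relation look-ups over the edited graph:
+
+ kg = {("Eiffel Tower", "located_in"): "Paris", ("Paris", "capital_of"): "France"}
+
+ def apply_edit(kg, subj, rel, new_obj):
+     kg[(subj, rel)] = new_obj                      # overwrite the stale fact
+
+ def multi_hop(kg, start, relations):
+     entity = start
+     for rel in relations:
+         entity = kg.get((entity, rel))
+         if entity is None:
+             return None
+     return entity
+
+ apply_edit(kg, "Eiffel Tower", "located_in", "Rome")                 # a knowledge edit
+ print(multi_hop(kg, "Eiffel Tower", ["located_in", "capital_of"]))   # None: Rome not in KG yet
+ apply_edit(kg, "Rome", "capital_of", "Italy")
+ print(multi_hop(kg, "Eiffel Tower", ["located_in", "capital_of"]))   # "Italy"
+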
+
+
+
+
+ + ☆ Nexus: Specialization meets Adaptability for Efficiently Training + Mixture of Experts + + +
+ Efficiency, specialization, and adaptability to new data distributions are
+qualities that are hard to combine in current Large Language Models. The
+Mixture of Experts (MoE) architecture has been the focus of significant
+research because its inherent conditional computation enables such desirable
+properties. In this work, we focus on "upcycling" dense expert models into an
+MoE, aiming to improve specialization while also adding the ability to adapt to
+new tasks easily. We introduce Nexus, an enhanced MoE architecture with
+adaptive routing where the model learns to project expert embeddings from
+domain representations. This approach allows Nexus to flexibly add new experts
+after the initial upcycling through separately trained dense models, without
+requiring large-scale MoE training for unseen data domains. Our experiments
+show that Nexus achieves a relative gain of up to 2.1% over the baseline for
+initial upcycling, and an 18.8% relative gain for extending the MoE with a new
+expert by using limited finetuning data. This flexibility of Nexus is crucial
+to enable an open-source ecosystem where every user continuously assembles
+their own MoE-mix according to their needs.
+
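+ The adaptive-routing idea can be sketched as follows: expert embeddings are
+projected from per-domain representations, so extending the MoE with a new expert
+only appends one domain row. Dimensions and the projection are illustrative, not the
+paper's exact architecture:
+
+ import torch
+ import torch.nn as nn
+
+ hidden, domain_dim, n_experts = 512, 128, 4
+ domain_reprs = torch.randn(n_experts, domain_dim)        # one representation per data domain
+ project = nn.Linear(domain_dim, hidden)                  # learned projection -> expert embeddings
+
+ def route(tokens, domain_reprs, top_k=2):
+     expert_emb = project(domain_reprs)                    # (n_experts, hidden)
+     logits = tokens @ expert_emb.T                        # (n_tokens, n_experts)
+     weights, experts = torch.topk(logits.softmax(-1), top_k, dim=-1)
+     return weights, experts
+
+ tokens = torch.randn(10, hidden)
+ weights, experts = route(tokens, domain_reprs)
+ # Adding a new expert later only requires appending a new domain representation:
+ # domain_reprs = torch.cat([domain_reprs, new_domain_repr[None]])
+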
+
+
+
+
+ + ☆ A New Method for Cross-Lingual-based Semantic Role Labeling + + +
+ Semantic role labeling is a crucial task in natural language processing, +enabling better comprehension of natural language. However, the lack of +annotated data in multiple languages has posed a challenge for researchers. To +address this, a deep learning algorithm based on model transfer has been +proposed. The algorithm utilizes a dataset consisting of the English portion of +CoNLL2009 and a corpus of semantic roles in Persian. To optimize the efficiency +of training, only ten percent of the educational data from each language is +used. The results of the proposed model demonstrate significant improvements +compared to Niksirt et al.'s model. In monolingual mode, the proposed model +achieved a 2.05 percent improvement on F1-score, while in cross-lingual mode, +the improvement was even more substantial, reaching 6.23 percent. Worth noting +is that the compared model only trained two of the four stages of semantic role +labeling and employed golden data for the remaining two stages. This suggests +that the actual superiority of the proposed model surpasses the reported +numbers by a significant margin. The development of cross-lingual methods for +semantic role labeling holds promise, particularly in addressing the scarcity +of annotated data for various languages. These advancements pave the way for +further research in understanding and processing natural language across +different linguistic contexts. + +
+
+
+
+
+ + ☆ Bias in LLMs as Annotators: The Effect of Party Cues on Labelling + Decision by Large Language Models + + +
+ Human coders are biased. We test similar biases in Large Language Models +(LLMs) as annotators. By replicating an experiment run by Ennser-Jedenastik and +Meyer (2018), we find evidence that LLMs use political information, and +specifically party cues, to judge political statements. Not only do LLMs use +relevant information to contextualize whether a statement is positive, +negative, or neutral based on the party cue, they also reflect the biases of +the human-generated data upon which they have been trained. We also find that +unlike humans, who are only biased when faced with statements from extreme +parties, LLMs exhibit significant bias even when prompted with statements from +center-left and center-right parties. The implications of our findings are +discussed in the conclusion. + +
+
+
+
+
+ + ☆ Persuasion Games using Large Language Models + + +
+ Large Language Models (LLMs) have emerged as formidable instruments capable +of comprehending and producing human-like text. This paper explores the +potential of LLMs, to shape human perspectives and subsequently influence their +decisions on particular tasks. This capability finds applications in diverse +domains such as Investment, Credit cards and Insurance, wherein they assist +users in selecting appropriate insurance policies, investment plans, Credit +cards, Retail, as well as in Behavioral Change Support Systems (BCSS). + We present a sophisticated multi-agent framework wherein a consortium of +agents operate in collaborative manner. The primary agent engages directly with +users through persuasive dialogue, while the auxiliary agents perform tasks +such as information retrieval, response analysis, development of persuasion +strategies, and validation of facts. Empirical evidence from our experiments +demonstrates that this collaborative methodology significantly enhances the +persuasive efficacy of the LLM. We analyze user resistance to persuasive +efforts continuously and counteract it by employing a combination of rule-based +and LLM-based resistance-persuasion mapping techniques. + We employ simulated personas and generate conversations in insurance, +banking, and retail domains to evaluate the proficiency of large language +models (LLMs) in recognizing, adjusting to, and influencing various personality +types. Concurrently, we examine the resistance mechanisms employed by LLM +simulated personas. Persuasion is quantified via measurable surveys before and +after interaction, LLM-generated scores on conversation, and user decisions +(purchase or non-purchase). + +
+
+
+
+
+ + ☆ Knowledge Navigator: LLM-guided Browsing Framework for Exploratory + Search in Scientific Literature + + +
+ The exponential growth of scientific literature necessitates advanced tools +for effective knowledge exploration. We present Knowledge Navigator, a system +designed to enhance exploratory search abilities by organizing and structuring +the retrieved documents from broad topical queries into a navigable, two-level +hierarchy of named and descriptive scientific topics and subtopics. This +structured organization provides an overall view of the research themes in a +domain, while also enabling iterative search and deeper knowledge discovery +within specific subtopics by allowing users to refine their focus and retrieve +additional relevant documents. Knowledge Navigator combines LLM capabilities +with cluster-based methods to enable an effective browsing method. We +demonstrate our approach's effectiveness through automatic and manual +evaluations on two novel benchmarks, CLUSTREC-COVID and SCITOC. Our code, +prompts, and benchmarks are made publicly available. + +
+
+
+
+
+ + ☆ Automatic Differential Diagnosis using Transformer-Based Multi-Label + Sequence Classification + + +
+ As the field of artificial intelligence progresses, assistive technologies +are becoming more widely used across all industries. The healthcare industry is +no different, with numerous studies being done to develop assistive tools for +healthcare professionals. Automatic diagnostic systems are one such beneficial +tool that can assist with a variety of tasks, including collecting patient +information, analyzing test results, and diagnosing patients. However, the idea +of developing systems that can provide a differential diagnosis has been +largely overlooked in most of these research studies. In this study, we propose +a transformer-based approach for providing differential diagnoses based on a +patient's age, sex, medical history, and symptoms. We use the DDXPlus dataset, +which provides differential diagnosis information for patients based on 49 +disease types. Firstly, we propose a method to process the tabular patient data +from the dataset and engineer them into patient reports to make them suitable +for our research. In addition, we introduce two data modification modules to +diversify the training data and consequently improve the robustness of the +models. We approach the task as a multi-label classification problem and +conduct extensive experiments using four transformer models. All the models +displayed promising results by achieving over 97% F1 score on the held-out test +set. Moreover, we design additional behavioral tests to get a broader +understanding of the models. In particular, for one of our test cases, we +prepared a custom test set of 100 samples with the assistance of a doctor. The +results on the custom set showed that our proposed data modification modules +improved the model's generalization capabilities. We hope our findings will +provide future researchers with valuable insights and inspire them to develop +reliable systems for automatic differential diagnosis. + +
+
+ comment: 25 pages, 7 figures +
+
+
+
+
+ + ☆ Scaling Up Summarization: Leveraging Large Language Models for Long Text + Extractive Summarization + + +
+ In an era where digital text is proliferating at an unprecedented rate, +efficient summarization tools are becoming indispensable. While Large Language +Models (LLMs) have been successfully applied in various NLP tasks, their role +in extractive text summarization remains underexplored. This paper introduces +EYEGLAXS (Easy Yet Efficient larGe LAnguage model for eXtractive +Summarization), a framework that leverages LLMs, specifically LLAMA2-7B and +ChatGLM2-6B, for extractive summarization of lengthy text documents. Instead of +abstractive methods, which often suffer from issues like factual inaccuracies +and hallucinations, EYEGLAXS focuses on extractive summarization to ensure +factual and grammatical integrity. Utilizing state-of-the-art techniques such +as Flash Attention and Parameter-Efficient Fine-Tuning (PEFT), EYEGLAXS +addresses the computational and resource challenges typically associated with +LLMs. The system sets new performance benchmarks on well-known datasets like +PubMed and ArXiv. Furthermore, we extend our research through additional +analyses that explore the adaptability of LLMs in handling different sequence +lengths and their efficiency in training on smaller datasets. These +contributions not only set a new standard in the field but also open up +promising avenues for future research in extractive text summarization. + +
+
+
+
+
+ + ☆ Language Adaptation on a Tight Academic Compute Budget: Tokenizer + Swapping Works and Pure bfloat16 Is Enough ICML 2024 + + +
+ We investigate continued pretraining of LLMs for language adaptation on a
+tight academic budget: a setting in which only a few GPUs can be used in
+parallel, for a heavily constrained duration. We focus on adapting Mistral-7B
+to German or Arabic and evaluate several techniques to improve efficiency and
+effectiveness in this setting. Our German models adapted on this tight compute
+budget underperform compared to the base Mistral-7B, while our Arabic models
+outperform several baselines, showing that for sufficiently well-represented
+languages, continued pretraining for specialization is not always helpful. Our
+main findings focus on training precision and tokenizer swapping. Our results
+show that pure bfloat16 training is a viable alternative to mixed-precision
+training, while being much faster when only using a few GPUs. Swapping the
+tokenizer for a specialized one yields more efficient tokenization and is
+competitive with the original tokenizer, which already contains some German
+tokens, but did not significantly increase performance for German. Code and
+model weights are available on GitHub.
+
+
+ comment: WANT@ICML 2024 +
+
+
+
+
+ + ☆ Interactive Agents: Simulating Counselor-Client Psychological Counseling + via Role-Playing LLM-to-LLM Interactions + + +
+ Virtual counselors powered by large language models (LLMs) aim to create +interactive support systems that effectively assist clients struggling with +mental health challenges. To replicate counselor-client conversations, +researchers have built an online mental health platform that allows +professional counselors to provide clients with text-based counseling services +for about an hour per session. Notwithstanding its effectiveness, challenges +exist as human annotation is time-consuming, cost-intensive, privacy-protected, +and not scalable. To address this issue and investigate the applicability of +LLMs in psychological counseling conversation simulation, we propose a +framework that employs two LLMs via role-playing for simulating +counselor-client interactions. Our framework involves two LLMs, one acting as a +client equipped with a specific and real-life user profile and the other +playing the role of an experienced counselor, generating professional responses +using integrative therapy techniques. We implement both the counselor and the +client by zero-shot prompting the GPT-4 model. In order to assess the +effectiveness of LLMs in simulating counselor-client interactions and +understand the disparities between LLM- and human-generated conversations, we +evaluate the synthetic data from various perspectives. We begin by assessing +the client's performance through automatic evaluations. Next, we analyze and +compare the disparities between dialogues generated by the LLM and those +generated by professional counselors. Furthermore, we conduct extensive +experiments to thoroughly examine the performance of our LLM-based counselor +trained with synthetic interactive dialogues by benchmarking against +state-of-the-art models for mental health. + +
+
+
+
+
+ + ☆ LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language + Models + + +
+ Large Language Models (LLMs) have demonstrated notable capabilities across +various tasks, showcasing complex problem-solving abilities. Understanding and +executing complex rules, along with multi-step planning, are fundamental to +logical reasoning and critical for practical LLM agents and decision-making +systems. However, evaluating LLMs as effective rule-based executors and +planners remains underexplored. In this paper, we introduce LogicGame, a novel +benchmark designed to evaluate the comprehensive rule understanding, execution, +and planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame +provides diverse games that contain a series of rules with an initial state, +requiring models to comprehend and apply predefined regulations to solve +problems. We create simulated scenarios in which models execute or plan +operations to achieve specific outcomes. These game scenarios are specifically +designed to distinguish logical reasoning from mere knowledge by relying +exclusively on predefined rules. This separation allows for a pure assessment +of rule-based reasoning capabilities. The evaluation considers not only final +outcomes but also intermediate steps, providing a comprehensive assessment of +model performance. Moreover, these intermediate steps are deterministic and can +be automatically verified. LogicGame defines game scenarios with varying +difficulty levels, from simple rule applications to complex reasoning chains, +in order to offer a precise evaluation of model performance on rule +understanding and multi-step execution. Utilizing LogicGame, we test various +LLMs and identify notable shortcomings in their rule-based logical reasoning +abilities. + +
+
+
+
+
+ + ☆ A Survey on Evaluation of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) mimic human perception and reasoning +system by integrating powerful Large Language Models (LLMs) with various +modality encoders (e.g., vision, audio), positioning LLMs as the "brain" and +various modality encoders as sensory organs. This framework endows MLLMs with +human-like capabilities, and suggests a potential pathway towards achieving +artificial general intelligence (AGI). With the emergence of all-round MLLMs +like GPT-4V and Gemini, a multitude of evaluation methods have been developed +to assess their capabilities across different dimensions. This paper presents a +systematic and comprehensive review of MLLM evaluation methods, covering the +following key aspects: (1) the background of MLLMs and their evaluation; (2) +"what to evaluate" that reviews and categorizes existing MLLM evaluation tasks +based on the capabilities assessed, including general multimodal recognition, +perception, reasoning and trustworthiness, and domain-specific applications +such as socioeconomic, natural sciences and engineering, medical usage, AI +agent, remote sensing, video and audio processing, 3D point cloud analysis, and +others; (3) "where to evaluate" that summarizes MLLM evaluation benchmarks into +general and specific benchmarks; (4) "how to evaluate" that reviews and +illustrates MLLM evaluation steps and metrics; Our overarching goal is to +provide valuable insights for researchers in the field of MLLM evaluation, +thereby facilitating the development of more capable and reliable MLLMs. We +emphasize that evaluation should be regarded as a critical discipline, +essential for advancing the field of MLLMs. + +
+
+
+
+
+ + ☆ Harmonized Speculative Sampling + + +
+ Speculative sampling has proven to be an effective solution to accelerate +decoding from large language models, where the acceptance rate significantly +determines the performance. Most previous works on improving the acceptance +rate focus on aligned training and efficient decoding, implicitly paying less +attention to the linkage of training and decoding. In this work, we first +investigate the linkage of training and decoding for speculative sampling and +then propose a solution named HArmonized Speculative Sampling (HASS). HASS +improves the acceptance rate without extra inference overhead by harmonizing +training and decoding on their objectives and contexts. Experiments on three +LLaMA models demonstrate that HASS achieves 2.81x-3.65x wall-clock time speedup +ratio averaging across three datasets, which is 8%-15% faster than EAGLE-2. + +
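+ For background, the sketch below is the standard speculative-sampling accept/reject
+loop whose acceptance rate HASS aims to raise; toy categorical distributions stand in
+for the draft and target LLMs (this is not HASS itself):
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ V = 8
+ def draft_probs(ctx):  return np.full(V, 1.0 / V)                 # cheap proposal model
+ def target_probs(ctx): return np.array([0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05])
+
+ def speculative_step(ctx, n_draft=4):
+     out = list(ctx)
+     for _ in range(n_draft):
+         q, p = draft_probs(out), target_probs(out)
+         tok = rng.choice(V, p=q)                                   # draft proposes a token
+         if rng.random() < min(1.0, p[tok] / q[tok]):               # target verifies
+             out.append(tok)
+         else:
+             residual = np.clip(p - q, 0, None)                     # resample from the residual
+             out.append(rng.choice(V, p=residual / residual.sum()))
+             break
+     return out
+
+ print(speculative_step([0]))
+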
+
+
+
+
+ + ☆ Form and meaning co-determine the realization of tone in Taiwan Mandarin + spontaneous speech: the case of Tone 3 sandhi + + +
+ In Standard Chinese, Tone 3 (the dipping tone) becomes Tone 2 (rising tone)
+when followed by another Tone 3. Previous studies have noted that this sandhi
+process may be incomplete, in the sense that the assimilated Tone 3 is still
+distinct from a true Tone 2. While Mandarin Tone 3 sandhi is widely studied
+using carefully controlled laboratory speech (Xu, 1997) and more formal
+registers of Beijing Mandarin (Yuan and Chen, 2014), less is known about its
+realization in spontaneous speech, and about the effect of contextual factors
+on tonal realization. The present study investigates the pitch contours of
+two-character words with T2-T3 and T3-T3 tone patterns in spontaneous Taiwan
+Mandarin conversations. Our analysis makes use of the Generalized Additive Mixed
+Model (GAMM, Wood, 2017) to examine fundamental frequency (f0) contours as a
+function of normalized time. We consider various factors known to influence
+pitch contours, including gender, speaking rate, speaker, neighboring tones,
+word position, bigram probability, and also novel predictors, word and word
+sense (Chuang et al., 2024). Our analyses revealed that in spontaneous Taiwan
+Mandarin, T3-T3 words become indistinguishable from T2-T3 words, indicating
+complete sandhi, once the strong effect of word (or word sense) is taken into
+account. For our data, the shape of f0 contours is not co-determined by word
+frequency. In contrast, the effect of word meaning on f0 contours is robust, as
+strong as the effect of adjacent tones, and is present for both T2-T3 and T3-T3
+words.
+
+
+
+
+
+ + ☆ LM-PUB-QUIZ: A Comprehensive Framework for Zero-Shot Evaluation of + Relational Knowledge in Language Models + + +
+ Knowledge probing evaluates the extent to which a language model (LM) has
+acquired relational knowledge during its pre-training phase. It provides a
+cost-effective means of comparing LMs of different sizes and training setups
+and is useful for monitoring knowledge gained or lost during continual learning
+(CL). In prior work, we presented an improved knowledge probe called BEAR
+(Wiland et al., 2024), which enables the comparison of LMs trained with
+different pre-training objectives (causal and masked LMs) and addresses issues
+of skewed distributions in previous probes to deliver a more unbiased reading
+of LM knowledge. With this paper, we present LM-PUB-QUIZ, a Python framework
+and leaderboard built around the BEAR probing mechanism that enables
+researchers and practitioners to apply it in their work. It provides options
+for standalone evaluation and direct integration into the widely-used training
+pipeline of the Hugging Face TRANSFORMERS library. Further, it provides a
+fine-grained analysis of different knowledge types to assist users in better
+understanding the knowledge in each evaluated LM. We publicly release
+LM-PUB-QUIZ as an open-source project.
+
+
+
+
+
+ + ☆ An Evaluation of Sindhi Word Embedding in Semantic Analogies and + Downstream Tasks + + +
+ In this paper, we propose a new word embedding based corpus consisting of +more than 61 million words crawled from multiple web resources. We design a +preprocessing pipeline for the filtration of unwanted text from crawled data. +Afterwards, the cleaned vocabulary is fed to state-of-the-art +continuous-bag-of-words, skip-gram, and GloVe word embedding algorithms. For +the evaluation of pretrained embeddings, we use popular intrinsic and extrinsic +evaluation approaches. The evaluation results reveal that +continuous-bag-of-words and skip-gram perform better than GloVe and existing +Sindhi fastText word embedding on both intrinsic and extrinsic evaluation +approaches + +
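+ A short gensim sketch of the two word2vec variants compared above; the toy sentence
+list stands in for the crawled Sindhi corpus:
+
+ from gensim.models import Word2Vec
+
+ sentences = [["this", "is", "a", "toy", "corpus"],
+              ["swap", "in", "the", "cleaned", "sindhi", "text"]]
+ cbow = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)      # continuous bag of words
+ skipgram = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)  # skip-gram
+ print(cbow.wv.most_similar("toy", topn=3))
+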
+
+ comment: arXiv admin note: substantial text overlap with arXiv:1911.12579 +
+
+
+
+
+ + ☆ Conan-embedding: General Text Embedding with More and Better Negative + Samples + + +
+ With the growing popularity of RAG, the capabilities of embedding models are +gaining increasing attention. Embedding models are primarily trained through +contrastive loss learning, with negative examples being a key component. +Previous work has proposed various hard negative mining strategies, but these +strategies are typically employed as preprocessing steps. In this paper, we +propose the conan-embedding model, which maximizes the utilization of more and +higher-quality negative examples. Specifically, since the model's ability to +handle preprocessed negative examples evolves during training, we propose +dynamic hard negative mining method to expose the model to more challenging +negative examples throughout the training process. Secondly, contrastive +learning requires as many negative examples as possible but is limited by GPU +memory constraints. Therefore, we use a Cross-GPU balancing Loss to provide +more negative examples for embedding training and balance the batch size across +multiple tasks. Moreover, we also discovered that the prompt-response pairs +from LLMs can be used for embedding training. Our approach effectively enhances +the capabilities of embedding models, currently ranking first on the Chinese +leaderboard of Massive text embedding benchmark + +
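+ The training signal can be sketched as an InfoNCE-style loss where each query sees
+in-batch negatives plus explicitly mined hard negatives; the dynamic re-mining and
+Cross-GPU balancing from the paper are omitted:
+
+ import torch
+ import torch.nn.functional as F
+
+ def contrastive_loss(q, pos, hard_negs, temperature=0.05):
+     """q, pos: (B, D); hard_negs: (B, N, D) mined negatives per query."""
+     q, pos, hard_negs = (F.normalize(t, dim=-1) for t in (q, pos, hard_negs))
+     in_batch = q @ pos.T                                  # (B, B): diagonal holds the positives
+     mined = torch.einsum('bd,bnd->bn', q, hard_negs)      # (B, N): mined hard negatives
+     logits = torch.cat([in_batch, mined], dim=1) / temperature
+     labels = torch.arange(q.size(0))                      # i-th positive sits at column i
+     return F.cross_entropy(logits, labels)
+
+ loss = contrastive_loss(torch.randn(4, 32), torch.randn(4, 32), torch.randn(4, 8, 32))
+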
+
+
+
+
+ + ☆ TempoFormer: A Transformer for Temporally-aware Representations in + Change Detection + + +
+ Dynamic representation learning plays a pivotal role in understanding the evolution of linguistic content over time. On this front, both context and time dynamics, as well as their interplay, are of prime importance. Current approaches model context via pre-trained representations, which are typically temporally agnostic. Previous work on modeling context and temporal dynamics has used recurrent methods, which are slow and prone to overfitting. Here we introduce TempoFormer, the first task-agnostic, transformer-based and temporally-aware model for dynamic representation learning. Our approach is jointly trained on inter- and intra-context dynamics and introduces a novel temporal variation of rotary positional embeddings. The architecture is flexible and can be used as the temporal representation foundation of other models or applied to different transformer-based architectures. We show new SOTA performance on three different real-time change detection tasks.
+
+
+
+
+ + ☆ StyleRemix: Interpretable Authorship Obfuscation via Distillation and + Perturbation of Style Elements + + +
+ Authorship obfuscation, rewriting a text to intentionally obscure the +identity of the author, is an important but challenging task. Current methods +using large language models (LLMs) lack interpretability and controllability, +often ignoring author-specific stylistic features, resulting in less robust +performance overall. + To address this, we develop StyleRemix, an adaptive and interpretable +obfuscation method that perturbs specific, fine-grained style elements of the +original input text. StyleRemix uses pre-trained Low Rank Adaptation (LoRA) +modules to rewrite an input specifically along various stylistic axes (e.g., +formality and length) while maintaining low computational cost. StyleRemix +outperforms state-of-the-art baselines and much larger LLMs in a variety of +domains as assessed by both automatic and human evaluation. + Additionally, we release AuthorMix, a large set of 30K high-quality, +long-form texts from a diverse set of 14 authors and 4 domains, and DiSC, a +parallel corpus of 1,500 texts spanning seven style axes in 16 unique +directions + +
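+ As a rough illustration of the underlying recipe (one lightweight LoRA adapter per style axis on a shared base model), the sketch below attaches a single adapter with the PEFT library; the axis, target modules, and ranks are placeholders, and this is not the released StyleRemix code.

```python
# Sketch of the general recipe: one LoRA adapter per style axis on a shared base model.
# Axis names, target modules, and ranks are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("gpt2")
tok = AutoTokenizer.from_pretrained("gpt2")

formality_adapter = LoraConfig(r=8, lora_alpha=16, target_modules=["c_attn"], lora_dropout=0.05)
model = get_peft_model(base, formality_adapter)  # this adapter would be trained on formal<->informal pairs
model.print_trainable_parameters()

# At obfuscation time, one would select or mix the per-axis adapters that move the text
# away from the author's measured style profile, then rewrite with the adapted model.
```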
+
+
+
+
+ + ☆ Auxiliary-Loss-Free Load Balancing Strategy for Mixture-of-Experts + + +
+ For Mixture-of-Experts (MoE) models, an unbalanced expert load will lead to routing collapse or increased computational overhead. Existing methods commonly employ an auxiliary loss to encourage load balance, but a large auxiliary loss will introduce non-negligible interference gradients into training and thus impair the model performance. In order to control load balance while not producing undesired gradients during training, we propose Loss-Free Balancing, an auxiliary-loss-free load balancing strategy. To be specific, before the top-K routing decision, Loss-Free Balancing first applies an expert-wise bias to the routing scores of each expert. By dynamically updating the bias of each expert according to its recent load, Loss-Free Balancing can consistently maintain a balanced distribution of expert load. In addition, since Loss-Free Balancing does not produce any interference gradients, it also elevates the upper bound of model performance gained from MoE training. We validate the performance of Loss-Free Balancing on MoE models with up to 3B parameters trained on up to 200B tokens. Experimental results show that Loss-Free Balancing achieves both better performance and better load balance compared with traditional auxiliary-loss-controlled load balancing strategies.
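+ As a concrete illustration of the bias-corrected routing described above, the minimal sketch below adds a persistent per-expert bias to the router scores only when selecting the top-K experts, then nudges the bias against each expert's recent load after the batch. The sign-based update step and the use of unbiased scores for gating are illustrative assumptions; the paper's exact update rule may differ.

```python
# Sketch of expert-wise bias-corrected top-K routing with a load-driven bias update.
import torch

num_experts, top_k, step = 8, 2, 1e-3
bias = torch.zeros(num_experts)  # persistent expert-wise bias

def route(scores: torch.Tensor):
    """scores: (tokens, num_experts) affinity scores from the router."""
    global bias
    biased = scores + bias                       # bias influences expert selection only
    topk_idx = biased.topk(top_k, dim=-1).indices
    # gating weights still come from the unbiased scores, so no interference gradients
    gates = torch.softmax(scores, dim=-1).gather(-1, topk_idx)

    load = torch.zeros(num_experts).index_add_(0, topk_idx.flatten(),
                                               torch.ones(topk_idx.numel()))
    bias = bias - step * torch.sign(load - load.mean())  # push over-loaded experts down
    return topk_idx, gates, load

idx, gates, load = route(torch.randn(1024, num_experts))
print(load.tolist())
```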
+
+
+
+
+ + ☆ Harnessing the Intrinsic Knowledge of Pretrained Language Models for + Challenging Text Classification Settings + + +
+ Text classification is crucial for applications such as sentiment analysis +and toxic text filtering, but it still faces challenges due to the complexity +and ambiguity of natural language. Recent advancements in deep learning, +particularly transformer architectures and large-scale pretraining, have +achieved inspiring success in NLP fields. Building on these advancements, this +thesis explores three challenging settings in text classification by leveraging +the intrinsic knowledge of pretrained language models (PLMs). Firstly, to +address the challenge of selecting misleading yet incorrect distractors for +cloze questions, we develop models that utilize features based on +contextualized word representations from PLMs, achieving performance that +rivals or surpasses human accuracy. Secondly, to enhance model generalization +to unseen labels, we create small finetuning datasets with domain-independent +task label descriptions, improving model performance and robustness. Lastly, we +tackle the sensitivity of large language models to in-context learning prompts +by selecting effective demonstrations, focusing on misclassified examples and +resolving model ambiguity regarding test example labels. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ CBF-LLM: Safe Control for LLM Alignment + + +
+ This paper proposes a control-based framework for aligning large language models (LLMs) by leveraging a control barrier function (CBF) to ensure user-desirable text generation. The presented framework applies a safety filter, designed based on the CBF, to the output generation of the baseline LLM, i.e., the token sequence, with the aim of intervening in the generated text. The overall text-generation system is implemented with Llama 3 and a RoBERTa model, and the source code is available at https://github.com/Mya-Mya/CBF-LLM. The experiments demonstrate its control ability and effectiveness in reducing the number of interventions needed for user-specified alignment tasks.
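+ The sketch below conveys only the general idea of filtering candidate next tokens with an external classifier before sampling; it uses an off-the-shelf sentiment model as a stand-in "safety" scorer, and it provides none of the formal guarantees that the CBF construction in the paper gives. Model names and the threshold are illustrative placeholders.

```python
# Conceptual sketch of re-scoring candidate next tokens with an external classifier
# and masking unsafe continuations before sampling (heuristic stand-in, not a CBF).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
scorer = pipeline("text-classification",
                  model="distilbert-base-uncased-finetuned-sst-2-english")

prompt = "The review said the movie was"
ids = tok(prompt, return_tensors="pt").input_ids
logits = lm(ids).logits[0, -1]
candidates = logits.topk(20).indices            # shortlist proposed by the base LM

def safe(text: str, threshold: float = 0.5) -> bool:
    out = scorer(text)[0]                        # stand-in "safety" score (sentiment here)
    return not (out["label"] == "NEGATIVE" and out["score"] > threshold)

allowed = [t for t in candidates.tolist() if safe(prompt + tok.decode([t]))]
next_token = allowed[0] if allowed else candidates[0].item()
print(tok.decode([next_token]))
```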
+
+
+
+
+ + ☆ Beyond Levenshtein: Leveraging Multiple Algorithms for Robust Word Error + Rate Computations And Granular Error Classifications INTERSPEECH 2024 + + +
+ The Word Error Rate (WER) is the common measure of accuracy for Automatic +Speech Recognition (ASR). Transcripts are usually pre-processed by substituting +specific characters to account for non-semantic differences. As a result of +this normalisation, information on the accuracy of punctuation or +capitalisation is lost. We present a non-destructive, token-based approach +using an extended Levenshtein distance algorithm to compute a robust WER and +additional orthographic metrics. Transcription errors are also classified more +granularly by existing string similarity and phonetic algorithms. An evaluation +on several datasets demonstrates the practical equivalence of our approach +compared to common WER computations. We also provide an exemplary analysis of +derived use cases, such as a punctuation error rate, and a web application for +interactive use and visualisation of our implementation. The code is available +open-source. + +
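+ For reference, the quantity being extended is the standard token-level WER obtained from a Levenshtein alignment, as in the minimal sketch below; the paper's additional orthographic metrics and phonetic error classification are not reproduced here.

```python
# Standard token-level WER via Levenshtein edit distance (dynamic programming).
def word_error_rate(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = d[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)  # substitution, deletion, insertion
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

print(word_error_rate("the cat sat on the mat", "the cat sit on mat"))  # 2 errors / 6 words
```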
+
+ comment: Accepted in INTERSPEECH 2024 +
+
+
+
+
+ + ☆ SIaM: Self-Improving Code-Assisted Mathematical Reasoning of Large + Language Models + + +
+ There is a growing trend of teaching large language models (LLMs) to solve +mathematical problems through coding. Existing studies primarily focus on +prompting powerful, closed-source models to generate seed training data +followed by in-domain data augmentation, equipping LLMs with considerable +capabilities for code-aided mathematical reasoning. However, continually +training these models on augmented data derived from a few datasets such as +GSM8K may impair their generalization abilities and restrict their +effectiveness to a narrow range of question types. Conversely, the potential of +improving such LLMs by leveraging large-scale, expert-written, diverse math +question-answer pairs remains unexplored. To utilize these resources and tackle +unique challenges such as code response assessment, we propose a novel paradigm +that uses a code-based critic model to guide steps including question-code data +construction, quality control, and complementary evaluation. We also explore +different alignment algorithms with self-generated instruction/preference data +to foster continuous improvement. Experiments across both in-domain (up to ++5.7%) and out-of-domain (+4.4%) benchmarks in English and Chinese demonstrate +the effectiveness of the proposed paradigm. + +
+
+
+
+
+ + ☆ Boosting Lossless Speculative Decoding via Feature Sampling and Partial + Alignment Distillation AAAI 2025 + + +
+ Lossless speculative decoding accelerates target large language model (LLM) +inference by employing a lightweight draft model for generating tree-structured +candidates, which are subsequently verified in parallel by the target LLM. +Currently, effective approaches leverage feature-level rather than token-level +autoregression within the draft model to facilitate more straightforward +predictions and enhanced knowledge distillation. In this paper, we reassess +these approaches and propose FSPAD (Feature Sampling and Partial Alignment +Distillation for Lossless Speculative Decoding), which introduces two +straightforward and effective components within the existing framework to boost +lossless speculative decoding. Firstly, FSPAD utilizes token embeddings to +sample features of the target LLM in high-dimensional space before feeding them +into the draft model, due to the inherent uncertainty of the features +preventing the draft model from obtaining the specific token output by the +target LLM. Secondly, FSPAD introduces partial alignment distillation to weaken +the draft model's connection between features and logits, aiming to reduce the +conflict between feature alignment and logit confidence during training. Our +experiments include both greedy and non-greedy decoding on the largest and +smallest models from the Vicuna and LLaMA3-Instruct series, as well as tasks in +multi-turn conversation, translation, summarization, question answering, +mathematical reasoning, and retrieval-augmented generation. The results show +that FSPAD outperforms the state-of-the-art method across all the +aforementioned tasks and target LLMs. + +
+
+ comment: The work was not submitted to AAAI 2025 +
+
+
+
+
+ + ☆ WildFeedback: Aligning LLMs With In-situ User Interactions And Feedback + + +
+ As large language models (LLMs) continue to advance, aligning these models +with human preferences has emerged as a critical challenge. Traditional +alignment methods, relying on human or LLM annotated datasets, are limited by +their resource-intensive nature, inherent subjectivity, and the risk of +feedback loops that amplify model biases. To overcome these limitations, we +introduce WildFeedback, a novel framework that leverages real-time, in-situ +user interactions to create preference datasets that more accurately reflect +authentic human values. WildFeedback operates through a three-step process: +feedback signal identification, preference data construction, and user-guided +evaluation. We applied this framework to a large corpus of user-LLM +conversations, resulting in a rich preference dataset that reflects genuine +user preferences. This dataset captures the nuances of user preferences by +identifying and classifying feedback signals within natural conversations, +thereby enabling the construction of more representative and context-sensitive +alignment data. Our extensive experiments demonstrate that LLMs fine-tuned on +WildFeedback exhibit significantly improved alignment with user preferences, as +evidenced by both traditional benchmarks and our proposed user-guided +evaluation. By incorporating real-time feedback from actual users, WildFeedback +addresses the scalability, subjectivity, and bias challenges that plague +existing approaches, marking a significant step toward developing LLMs that are +more responsive to the diverse and evolving needs of their users. In summary, +WildFeedback offers a robust, scalable solution for aligning LLMs with true +human values, setting a new standard for the development and evaluation of +user-centric language models. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding + + +
+ Scientific literature understanding is crucial for extracting targeted information and garnering insights, thereby significantly advancing scientific discovery. Despite the remarkable success of Large Language Models (LLMs), they face challenges in scientific literature understanding, primarily due to (1) a lack of scientific knowledge and (2) unfamiliarity with specialized scientific tasks. To develop an LLM specialized in scientific literature understanding, we propose a hybrid strategy that integrates continual pre-training (CPT) and supervised fine-tuning (SFT) to simultaneously infuse scientific domain knowledge and enhance instruction-following capabilities for domain-specific tasks. In this process, we identify two key challenges: (1) constructing high-quality CPT corpora, and (2) generating diverse SFT instructions. We address these challenges through a meticulous pipeline, including PDF text extraction, parsing error correction, quality filtering, and synthetic instruction creation. Applying this strategy, we present a suite of LLMs: SciLitLLM, specialized in scientific literature understanding. These models demonstrate promising performance on scientific literature understanding benchmarks. Our contributions are threefold: (1) We present an effective framework that integrates CPT and SFT to adapt LLMs to scientific literature understanding, which can also be easily adapted to other domains. (2) We propose an LLM-based synthesis method to generate diverse and high-quality scientific instructions, resulting in a new instruction set -- SciLitIns -- for supervised fine-tuning in less-represented scientific domains. (3) SciLitLLM achieves promising performance improvements on scientific literature understanding benchmarks.
+
+
+
+
+ + ☆ An Investigation of Warning Erroneous Chat Translations in Cross-lingual + Communication + + +
+ The complexities of chats pose significant challenges for machine translation models. Recognizing the need for a precise evaluation metric to address the issues of chat translation, this study introduces Multidimensional Quality Metrics for Chat Translation (MQM-Chat). Through experiments with five models using MQM-Chat, we observed that all models produced certain fundamental errors, while each had its own shortcomings, such as omission, overly correcting ambiguous source content, and buzzword issues, resulting in the loss of stylized information. Our findings underscore the effectiveness of MQM-Chat in evaluating chat translation, emphasizing the importance of stylized content and dialogue consistency for future studies.
+
+
+
+
+ + ☆ LRP4RAG: Detecting Hallucinations in Retrieval-Augmented Generation via + Layer-wise Relevance Propagation + + +
+ Retrieval-Augmented Generation (RAG) has become a primary technique for +mitigating hallucinations in large language models (LLMs). However, incomplete +knowledge extraction and insufficient understanding can still mislead LLMs to +produce irrelevant or even contradictory responses, which means hallucinations +persist in RAG. In this paper, we propose LRP4RAG, a method based on the +Layer-wise Relevance Propagation (LRP) algorithm for detecting hallucinations +in RAG. Specifically, we first utilize LRP to compute the relevance between the +input and output of the RAG generator. We then apply further extraction and +resampling to the relevance matrix. The processed relevance data are input into +multiple classifiers to determine whether the output contains hallucinations. +To the best of our knowledge, this is the first time that LRP has been used for +detecting RAG hallucinations, and extensive experiments demonstrate that +LRP4RAG outperforms existing baselines. + +
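+ A small, hypothetical sketch of the downstream stage described above (relevance matrix, resampling, then a binary classifier) is given below; synthetic random matrices stand in for the LRP relevance scores the paper actually computes, so the numbers are meaningless and only the plumbing is shown.

```python
# Sketch: resample variable-size relevance matrices to a fixed grid and train a
# hallucination classifier on the flattened features. Data here are synthetic stand-ins.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)

def resample(rel: np.ndarray, size=(32, 32)) -> np.ndarray:
    """Nearest-neighbour resampling of a variable-size relevance matrix to a fixed grid."""
    rows = np.linspace(0, rel.shape[0] - 1, size[0]).round().astype(int)
    cols = np.linspace(0, rel.shape[1] - 1, size[1]).round().astype(int)
    return rel[np.ix_(rows, cols)]

# Synthetic stand-in data: 200 relevance matrices with binary hallucination labels.
X = np.stack([resample(rng.random((rng.integers(20, 80), rng.integers(20, 80)))).ravel()
              for _ in range(200)])
y = rng.integers(0, 2, size=200)

clf = LogisticRegression(max_iter=1000).fit(X[:150], y[:150])
print("held-out accuracy:", clf.score(X[150:], y[150:]))
```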
+
+
+
+
+ + ☆ Dolphin: Long Context as a New Modality for Energy-Efficient On-Device + Language Models + + +
+ This paper presents Dolphin, a novel decoder-decoder architecture for +energy-efficient processing of long contexts in language models. Our approach +addresses the significant energy consumption and latency challenges inherent in +on-device models. Dolphin employs a compact 0.5B parameter decoder to distill +extensive contextual information into a memory embedding, substantially +reducing the input length for the primary 7B parameter decoder model. Inspired +by vision-language models, we repurpose the image embedding projector to encode +long textual contexts, effectively treating extended context as a distinct +modality. This innovative method enables processing of substantially longer +contexts without the typical computational overhead associated with extended +input sequences. Empirical evaluations demonstrate a 10-fold improvement in +energy efficiency and a 5-fold reduction in latency compared to conventional +full-length context processing methods without losing quality of the response. +Our work contributes to the development of more sustainable and scalable +language models for on-device applications, addressing the critical need for +energy-efficient and responsive AI technologies in resource-constrained +environments while maintaining the accuracy to understand long contexts. This +research has implications for the broader field of natural language processing, +particularly in the domain of efficient model design for resource-limited +settings. By enabling more sophisticated AI capabilities on edge devices, +Dolphin paves the way for advanced language processing in a wide range of +applications where computational resources are at a premium. The Dolphin model +is publicly available at https://huggingface.co/NexaAIDev/Dolphin. + +
+
+
+
+
+ + ☆ Towards Fully Autonomous Research Powered by LLMs: Case Study on + Simulations + + +
+ The advent of Large Language Models (LLMs) has created new opportunities for +the automation of scientific research, spanning both experimental processes and +computational simulations. This study explores the feasibility of constructing +an autonomous simulation agent (ASA) powered by LLM, through sophisticated API +integration, to automate the entire research process, from experimental design, +remote upload and simulation execution, data analysis, to report compilation. +Using a simulation problem of polymer chain conformations as a case study, we +assessed the performance of ASAs powered by different LLMs including +GPT-4-Turbo. Our findings revealed that ASA-GPT-4o achieved near-flawless +execution on designated research missions, underscoring the potential of LLMs +to manage complete scientific investigations autonomously. The outlined +automation can be iteratively performed up to twenty cycles without human +intervention, illustrating the potential of LLMs for large-scale autonomous +research endeavors. Additionally, we discussed the intrinsic traits of ASAs in +managing extensive tasks, focusing on self-validation mechanisms and the +balance between local attention and global oversight. + +
+
+ comment: For additional code and data, please visit our GitHub repository: + https://github.com/zokaraa/autonomous_simulation_agent +
+
+
+
+
+ + ☆ Measuring the Reliability of Causal Probing Methods: Tradeoffs, + Limitations, and the Plight of Nullifying Interventions + + +
+ Causal probing is an approach to interpreting foundation models, such as +large language models, by training probes to recognize latent properties of +interest from embeddings, intervening on probes to modify this representation, +and analyzing the resulting changes in the model's behavior. While some recent +works have cast doubt on the theoretical basis of several leading causal +probing intervention methods, it has been unclear how to systematically and +empirically evaluate their effectiveness in practice. To address this problem, +we propose a general empirical analysis framework to evaluate the reliability +of causal probing interventions, formally defining and quantifying two key +causal probing desiderata: completeness (fully transforming the representation +of the target property) and selectivity (minimally impacting other properties). +Our formalism allows us to make the first direct comparisons between different +families of causal probing methods (e.g., linear vs. nonlinear or +counterfactual vs. nullifying interventions). We conduct extensive experiments +across several leading methods, finding that (1) there is an inherent tradeoff +between these criteria, and no method is able to consistently satisfy both at +once; and (2) across the board, nullifying interventions are always far less +complete than counterfactual interventions, indicating that nullifying methods +may not be an effective approach to causal probing. + +
+
+
+
+
+ + ☆ ReMamba: Equip Mamba with Effective Long-Sequence Modeling + + +
+ While the Mamba architecture demonstrates superior inference efficiency and competitive performance on short-context natural language processing (NLP) tasks, empirical evidence suggests its capacity to comprehend long contexts is limited compared to transformer-based models. In this study, we investigate the long-context efficiency issues of the Mamba models and propose ReMamba, which enhances Mamba's ability to comprehend long contexts. ReMamba incorporates selective compression and adaptation techniques within a two-stage re-forward process, incurring minimal additional inference overhead. Experimental results on the LongBench and L-Eval benchmarks demonstrate ReMamba's efficacy, improving over the baselines by 3.2 and 1.6 points, respectively, and attaining performance almost on par with same-size transformer models.
+
+
+
+
+ + ☆ Enhancing and Accelerating Large Language Models via Instruction-Aware + Contextual Compression + + +
+ Large Language Models (LLMs) have garnered widespread attention due to their remarkable performance across various tasks. However, to mitigate the issue of hallucinations, LLMs often incorporate a retrieval-augmented pipeline that provides them with rich external knowledge and context. Nevertheless, challenges stem from inaccurate and coarse-grained context retrieved from the retriever. Supplying irrelevant context to the LLMs can result in poorer responses, increased inference latency, and higher costs. This paper introduces a method called Instruction-Aware Contextual Compression, which filters out less informative content, thereby accelerating and enhancing the use of LLMs. The experimental results demonstrate that Instruction-Aware Contextual Compression notably reduces memory consumption and minimizes generation latency while maintaining performance levels comparable to those achieved with the use of the full context. Specifically, we achieved a 50% reduction in context-related costs, resulting in a 5% reduction in inference memory usage and a 2.2-fold increase in inference speed, with only a minor drop of 0.047 in Rouge-1. These findings suggest that our method strikes an effective balance between efficiency and performance.
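+ For intuition only, the sketch below filters retrieved context by keeping the sentences most similar to the instruction before they are handed to the LLM; this generic embedding-similarity baseline is an assumption for illustration and is not the trained compressor proposed in the paper.

```python
# Illustrative instruction-aware filtering of retrieved context via embedding similarity.
from sentence_transformers import SentenceTransformer, util

encoder = SentenceTransformer("all-MiniLM-L6-v2")

def compress(instruction: str, context_sentences: list, keep_ratio: float = 0.5) -> str:
    emb_i = encoder.encode(instruction, convert_to_tensor=True)
    emb_c = encoder.encode(context_sentences, convert_to_tensor=True)
    scores = util.cos_sim(emb_i, emb_c)[0]
    k = max(1, int(len(context_sentences) * keep_ratio))
    keep = scores.topk(k).indices.sort().values          # preserve original sentence order
    return " ".join(context_sentences[i] for i in keep.tolist())

context = ["Paris is the capital of France.",
           "The Eiffel Tower was completed in 1889.",
           "France uses the euro as its currency.",
           "The Louvre is the world's most visited museum."]
print(compress("When was the Eiffel Tower built?", context))
```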
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Legilimens: Practical and Unified Content Moderation for Large Language + Model Services CCS + + +
+ Given the societal impact of unsafe content generated by large language +models (LLMs), ensuring that LLM services comply with safety standards is a +crucial concern for LLM service providers. Common content moderation methods +are limited by an effectiveness-and-efficiency dilemma, where simple models are +fragile while sophisticated models consume excessive computational resources. +In this paper, we reveal for the first time that effective and efficient +content moderation can be achieved by extracting conceptual features from +chat-oriented LLMs, despite their initial fine-tuning for conversation rather +than content moderation. We propose a practical and unified content moderation +framework for LLM services, named Legilimens, which features both effectiveness +and efficiency. Our red-team model-based data augmentation enhances the +robustness of Legilimens against state-of-the-art jailbreaking. Additionally, +we develop a framework to theoretically analyze the cost-effectiveness of +Legilimens compared to other methods. We have conducted extensive experiments +on five host LLMs, seventeen datasets, and nine jailbreaking methods to verify +the effectiveness, efficiency, and robustness of Legilimens against normal and +adaptive adversaries. A comparison of Legilimens with both commercial and +academic baselines demonstrates the superior performance of Legilimens. +Furthermore, we confirm that Legilimens can be applied to few-shot scenarios +and extended to multi-label classification tasks. + +
+
+ comment: Accepted by ACM Conference on Computer and Communications Security + (CCS) 2024 +
+
+
+
+
+ + ☆ FRACTURED-SORRY-Bench: Framework for Revealing Attacks in Conversational + Turns Undermining Refusal Efficacy and Defenses over SORRY-Bench + + +
+ This paper introduces FRACTURED-SORRY-Bench, a framework for evaluating the +safety of Large Language Models (LLMs) against multi-turn conversational +attacks. Building upon the SORRY-Bench dataset, we propose a simple yet +effective method for generating adversarial prompts by breaking down harmful +queries into seemingly innocuous sub-questions. Our approach achieves a maximum +increase of +46.22\% in Attack Success Rates (ASRs) across GPT-4, GPT-4o, +GPT-4o-mini, and GPT-3.5-Turbo models compared to baseline methods. We +demonstrate that this technique poses a challenge to current LLM safety +measures and highlights the need for more robust defenses against subtle, +multi-turn attacks. + +
+
+ comment: 4 pages, 2 tables +
+
+
+
+
+ + ☆ Evaluating Computational Representations of Character: An Austen + Character Similarity Benchmark + + +
+ Several systems have been developed to extract information about characters +to aid computational analysis of English literature. We propose character +similarity grouping as a holistic evaluation task for these pipelines. We +present AustenAlike, a benchmark suite of character similarities in Jane +Austen's novels. Our benchmark draws on three notions of character similarity: +a structurally defined notion of similarity; a socially defined notion of +similarity; and an expert defined set extracted from literary criticism. + We use AustenAlike to evaluate character features extracted using two +pipelines, BookNLP and FanfictionNLP. We build character representations from +four kinds of features and compare them to the three AustenAlike benchmarks and +to GPT-4 similarity rankings. We find that though computational representations +capture some broad similarities based on shared social and narrative roles, the +expert pairings in our third benchmark are challenging for all systems, +highlighting the subtler aspects of similarity noted by human readers. + +
+
+
+
+
+ + ☆ Structured Event Reasoning with Large Language Models + + +
+ Reasoning about real-life events is a unifying challenge in AI and NLP that has profound utility in a variety of domains, while faulty reasoning in high-stakes applications could be catastrophic. Able to work with diverse text in these domains, large language models (LLMs) have proven capable of answering questions and solving problems. However, I show that end-to-end LLMs still systematically fail to reason about complex events, and they lack interpretability due to their black-box nature. To address these issues, I propose three general approaches to use LLMs in conjunction with a structured representation of events. The first is a language-based representation involving relations of sub-events that can be learned by LLMs via fine-tuning. The second is a semi-symbolic representation involving states of entities that can be predicted and leveraged by LLMs via few-shot prompting. The third is a fully symbolic representation that can be predicted by LLMs trained with structured data and be executed by symbolic solvers. On a suite of event reasoning tasks spanning common-sense inference and planning, I show that each approach greatly outperforms end-to-end LLMs with more interpretability. These results suggest manners of synergy between LLMs and structured representations for event reasoning and beyond.
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ Is Personality Prediction Possible Based on Reddit Comments? + + +
+ In this assignment, we examine whether there is a correlation between the +personality type of a person and the texts they wrote. In order to do this, we +aggregated datasets of Reddit comments labeled with the Myers-Briggs Type +Indicator (MBTI) of the author and built different supervised classifiers based +on BERT to try to predict the personality of an author given a text. Despite +experiencing issues with the unfiltered character of the dataset, we can +observe potential in the classification. + +
+
+
+
+
+ + ☆ Logic-Enhanced Language Model Agents for Trustworthy Social Simulations + + +
+ We introduce the Logic-Enhanced Language Model Agents (LELMA) framework, a +novel approach to enhance the trustworthiness of social simulations that +utilize large language models (LLMs). While LLMs have gained attention as +agents for simulating human behaviour, their applicability in this role is +limited by issues such as inherent hallucinations and logical inconsistencies. +LELMA addresses these challenges by integrating LLMs with symbolic AI, enabling +logical verification of the reasoning generated by LLMs. This verification +process provides corrective feedback, refining the reasoning output. The +framework consists of three main components: an LLM-Reasoner for producing +strategic reasoning, an LLM-Translator for mapping natural language reasoning +to logic queries, and a Solver for evaluating these queries. This study focuses +on decision-making in game-theoretic scenarios as a model of human interaction. +Experiments involving the Hawk-Dove game, Prisoner's Dilemma, and Stag Hunt +highlight the limitations of state-of-the-art LLMs, GPT-4 Omni and Gemini 1.0 +Pro, in producing correct reasoning in these contexts. LELMA demonstrates high +accuracy in error detection and improves the reasoning correctness of LLMs via +self-refinement, particularly in GPT-4 Omni. + +
+
+ comment: Source code: https://github.com/dicelab-rhul/LELMA +
+
+
+
+
+ + ☆ Using Large Language Models to Create AI Personas for Replication and + Prediction of Media Effects: An Empirical Test of 133 Published Experimental + Research Findings + + +
+ This report analyzes the potential for large language models (LLMs) to +expedite accurate replication of published message effects studies. We tested +LLM-powered participants (personas) by replicating 133 experimental findings +from 14 papers containing 45 recent studies in the Journal of Marketing +(January 2023-May 2024). We used a new software tool, Viewpoints AI +(https://viewpoints.ai/), that takes study designs, stimuli, and measures as +input, automatically generates prompts for LLMs to act as a specified sample of +unique personas, and collects their responses to produce a final output in the +form of a complete dataset and statistical analysis. The underlying LLM used +was Anthropic's Claude Sonnet 3.5. We generated 19,447 AI personas to replicate +these studies with the exact same sample attributes, study designs, stimuli, +and measures reported in the original human research. Our LLM replications +successfully reproduced 76% of the original main effects (84 out of 111), +demonstrating strong potential for AI-assisted replication of studies in which +people respond to media stimuli. When including interaction effects, the +overall replication rate was 68% (90 out of 133). The use of LLMs to replicate +and accelerate marketing research on media effects is discussed with respect to +the replication crisis in social science, potential solutions to +generalizability problems in sampling subjects and experimental conditions, and +the ability to rapidly test consumer responses to various media stimuli. We +also address the limitations of this approach, particularly in replicating +complex interaction effects in media response studies, and suggest areas for +future research and improvement in AI-assisted experimental replication of +media effects. + +
+
+ comment: 24 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Flextron: Many-in-One Flexible Large Language Model + + +
+ Training modern LLMs is extremely resource intensive, and customizing them +for various deployment scenarios characterized by limited compute and memory +resources through repeated training is impractical. In this paper, we introduce +Flextron, a network architecture and post-training model optimization framework +supporting flexible model deployment. The Flextron architecture utilizes a +nested elastic structure to rapidly adapt to specific user-defined latency and +accuracy targets during inference with no additional fine-tuning required. It +is also input-adaptive, and can automatically route tokens through its +sub-networks for improved performance and efficiency. We present a +sample-efficient training method and associated routing algorithms for +systematically transforming an existing trained LLM into a Flextron model. We +evaluate Flextron on the GPT-3 and LLama-2 family of LLMs, and demonstrate +superior performance over multiple end-to-end trained variants and other +state-of-the-art elastic networks, all with a single pretraining run that +consumes a mere 7.63% tokens compared to original pretraining. + +
+
+
+
+
+ + ♻ ☆ Towards Human-Level Text Coding with LLMs: The Case of Fatherhood Roles + in Public Policy Documents + + +
+ Recent advances in large language models (LLMs) like GPT-3.5 and GPT-4 promise automation with better results and less programming, opening up new opportunities for text analysis in political science. In this study, we evaluate LLMs on three original coding tasks involving typical complexities encountered in political science settings: a non-English language, legal and political jargon, and complex labels based on abstract constructs. Throughout the paper, we propose a practical workflow to optimize the choice of the model and the prompt. We find that the best prompting strategy consists of providing the LLMs with a detailed codebook, like the one provided to human coders. In this setting, an LLM can be as good as or possibly better than a human annotator while being much faster, considerably cheaper, and much easier to scale to large amounts of text. We also provide a comparison of GPT and popular open-source LLMs, discussing the trade-offs in the model's choice. Our software allows LLMs to be easily used as annotators and is publicly available: https://github.com/lorelupo/pappa.
+
+
+
+
+ + ♻ ☆ HC3 Plus: A Semantic-Invariant Human ChatGPT Comparison Corpus CIKM2023 + + +
+ ChatGPT has garnered significant interest due to its impressive performance; +however, there is growing concern about its potential risks, particularly in +the detection of AI-generated content (AIGC), which is often challenging for +untrained individuals to identify. Current datasets used for detecting +ChatGPT-generated text primarily focus on question-answering tasks, often +overlooking tasks with semantic-invariant properties, such as summarization, +translation, and paraphrasing. In this paper, we demonstrate that detecting +model-generated text in semantic-invariant tasks is more challenging. To +address this gap, we introduce a more extensive and comprehensive dataset that +incorporates a wider range of tasks than previous work, including those with +semantic-invariant properties. + +
+
+ comment: This paper has been accepted by CIKM2023 workshop +
+
+
+
+
+ + ♻ ☆ From Complexity to Clarity: How AI Enhances Perceptions of Scientists + and the Public's Understanding of Science + + +
+ This paper evaluated the effectiveness of using generative AI to simplify +science communication and enhance the public's understanding of science. By +comparing lay summaries of journal articles from PNAS, yoked to those generated +by AI, this work first assessed linguistic simplicity differences across such +summaries and public perceptions in follow-up experiments. Specifically, Study +1a analyzed simplicity features of PNAS abstracts (scientific summaries) and +significance statements (lay summaries), observing that lay summaries were +indeed linguistically simpler, but effect size differences were small. Study 1b +used a large language model, GPT-4, to create significance statements based on +paper abstracts and this more than doubled the average effect size without +fine-tuning. Study 2 experimentally demonstrated that simply-written GPT +summaries facilitated more favorable perceptions of scientists (they were +perceived as more credible and trustworthy, but less intelligent) than more +complexly-written human PNAS summaries. Crucially, Study 3 experimentally +demonstrated that participants comprehended scientific writing better after +reading simple GPT summaries compared to complex PNAS summaries. In their own +words, participants also summarized scientific papers in a more detailed and +concrete manner after reading GPT summaries compared to PNAS summaries of the +same article. AI has the potential to engage scientific communities and the +public via a simple language heuristic, advocating for its integration into +scientific dissemination for a more informed society. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ RecurrentGemma: Moving Past Transformers for Efficient Open Language + Models + + +
+ We introduce RecurrentGemma, a family of open language models which uses +Google's novel Griffin architecture. Griffin combines linear recurrences with +local attention to achieve excellent performance on language. It has a +fixed-sized state, which reduces memory use and enables efficient inference on +long sequences. We provide two sizes of models, containing 2B and 9B +parameters, and provide pre-trained and instruction tuned variants for both. +Our models achieve comparable performance to similarly-sized Gemma baselines +despite being trained on fewer tokens. + +
+
+
+
+
+ + ♻ ☆ A Statistical Framework of Watermarks for Large Language Models: Pivot, + Detection Efficiency and Optimal Rules + + +
+ Since ChatGPT was introduced in November 2022, embedding (nearly) +unnoticeable statistical signals into text generated by large language models +(LLMs), also known as watermarking, has been used as a principled approach to +provable detection of LLM-generated text from its human-written counterpart. In +this paper, we introduce a general and flexible framework for reasoning about +the statistical efficiency of watermarks and designing powerful detection +rules. Inspired by the hypothesis testing formulation of watermark detection, +our framework starts by selecting a pivotal statistic of the text and a secret +key -- provided by the LLM to the verifier -- to enable controlling the false +positive rate (the error of mistakenly detecting human-written text as +LLM-generated). Next, this framework allows one to evaluate the power of +watermark detection rules by obtaining a closed-form expression of the +asymptotic false negative rate (the error of incorrectly classifying +LLM-generated text as human-written). Our framework further reduces the problem +of determining the optimal detection rule to solving a minimax optimization +program. We apply this framework to two representative watermarks -- one of +which has been internally implemented at OpenAI -- and obtain several findings +that can be instrumental in guiding the practice of implementing watermarks. In +particular, we derive optimal detection rules for these watermarks under our +framework. These theoretically derived detection rules are demonstrated to be +competitive and sometimes enjoy a higher power than existing detection +approaches through numerical experiments. + +
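+ To make the pivotal-statistic idea concrete, the sketch below follows the commonly described exponential-minimum (Gumbel) watermark: the verifier recomputes the keyed per-token uniforms and compares the sum of -log(1 - u_t) against the Gamma(n, 1) quantile it would follow on human-written text, which pins the false positive rate at alpha. The hashing scheme and constants are illustrative assumptions, not the detection rules derived in the paper.

```python
# Hedged sketch of pivotal-statistic watermark detection for an exponential-minimum
# style watermark. The keyed hash and constants are illustrative assumptions.
import hashlib
import math
from scipy.stats import gamma

SECRET_KEY = b"verifier-secret"

def keyed_uniform(prev_token: int, token: int) -> float:
    h = hashlib.sha256(SECRET_KEY + prev_token.to_bytes(4, "big") + token.to_bytes(4, "big"))
    return int.from_bytes(h.digest()[:8], "big") / 2**64

def detect(token_ids: list, alpha: float = 0.01) -> bool:
    us = [keyed_uniform(p, t) for p, t in zip(token_ids, token_ids[1:])]
    stat = sum(-math.log(1.0 - u) for u in us)   # pivotal statistic
    threshold = gamma.ppf(1 - alpha, a=len(us))  # null distribution Gamma(n, 1) for human text
    return stat > threshold                      # True -> flag as LLM-generated

print(detect(list(range(50, 250))))
```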
+
+
+
+
+ + ♻ ☆ Downstream bias mitigation is all you need + + +
+ The advent of transformer-based architectures and large language models (LLMs) has significantly advanced the performance of natural language processing (NLP) models. Since these LLMs are trained on huge corpora of data from the web and other sources, there has been a major concern about harmful prejudices that may potentially be transferred from the data. In many applications, these pre-trained LLMs are fine-tuned on task-specific datasets, which can further contribute to biases. This paper studies the extent of biases absorbed by LLMs during pre-training as well as their task-specific behaviour after fine-tuning. We found that controlled interventions on pre-trained LLMs, prior to fine-tuning, have minimal effect on lowering biases in classifiers. However, the biases present in domain-specific datasets play a much bigger role, and hence mitigating them at this stage has a bigger impact. While pre-training does matter, once the model has been pre-trained, even slight changes to co-occurrence rates in the fine-tuning dataset have a significant effect on the bias of the model.
+
+ comment: arXiv admin note: This work has been withdrawn by arXiv + administrators due to inappropriate text reuse from external sources +
+
+
+
+
+ + ♻ ☆ Look Before You Leap: Towards Decision-Aware and Generalizable + Tool-Usage for Large Language Models + + +
+ Tool-augmented large language models (LLMs) are attracting widespread +attention when accessing up-to-date knowledge and alleviating hallucination +issues. Nowadays, advanced closed-source LLMs (e.g., ChatGPT) have demonstrated +surprising tool-usage capabilities through prompting and in-context learning +techniques. To empower the capabilities of open-source LLMs (e.g., LLaMA) in +manipulating tools, current efforts focus on either template-driven or +token-triggered tool-usage. However, the former hampers LLMs' flexibility to +address diverse user's queries due to constrained tool interactions, while the +latter limits the generalizability when engaging with new tools, since +tool-usage learning is based on task- and tool-specific datasets. To alleviate +these concerns, in this paper, we propose a decision-aware and generalizable +tool-usage framework (DEER). Specifically, we first construct the tool-usage +samples with multiple decision branches via an automatic generation pipeline, +thereby inspiring the decision-making awareness of LLMs under diverse +scenarios. Meanwhile, we propose a novel tool sampling strategy to enhance the +generalizability of LLMs over unseen tools. Extensive experiments demonstrate +that our proposed DEER is effective and significantly outperforms baselines +across various datasets. + +
+
+ comment: 20 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ eRST: A Signaled Graph Theory of Discourse Relations and Organization + + +
+ In this article we present Enhanced Rhetorical Structure Theory (eRST), a new +theoretical framework for computational discourse analysis, based on an +expansion of Rhetorical Structure Theory (RST). The framework encompasses +discourse relation graphs with tree-breaking, non-projective and concurrent +relations, as well as implicit and explicit signals which give explainable +rationales to our analyses. We survey shortcomings of RST and other existing +frameworks, such as Segmented Discourse Representation Theory (SDRT), the Penn +Discourse Treebank (PDTB) and Discourse Dependencies, and address these using +constructs in the proposed theory. We provide annotation, search and +visualization tools for data, and present and evaluate a freely available +corpus of English annotated according to our framework, encompassing 12 spoken +and written genres with over 200K tokens. Finally, we discuss automatic +parsing, evaluation metrics and applications for data in our framework. + +
+
+
+
+
+ + ♻ ☆ Unveiling the Statistical Foundations of Chain-of-Thought Prompting + Methods + + +
+ Chain-of-Thought (CoT) prompting and its variants have gained popularity as +effective methods for solving multi-step reasoning problems using pretrained +large language models (LLMs). In this work, we analyze CoT prompting from a +statistical estimation perspective, providing a comprehensive characterization +of its sample complexity. To this end, we introduce a multi-step latent +variable model that encapsulates the reasoning process, where the latent +variable encodes the task information. Under this framework, we demonstrate +that when the pretraining dataset is sufficiently large, the estimator formed +by CoT prompting is equivalent to a Bayesian estimator. This estimator +effectively solves the multi-step reasoning problem by aggregating a posterior +distribution inferred from the demonstration examples in the prompt. Moreover, +we prove that the statistical error of the CoT estimator can be decomposed into +two main components: (i) a prompting error, which arises from inferring the +true task using CoT prompts, and (ii) the statistical error of the pretrained +LLM. We establish that, under appropriate assumptions, the prompting error +decays exponentially to zero as the number of demonstrations increases. +Additionally, we explicitly characterize the approximation and generalization +errors of the pretrained LLM. Notably, we construct a transformer model that +approximates the target distribution of the multi-step reasoning problem with +an error that decreases exponentially in the number of transformer blocks. Our +analysis extends to other variants of CoT, including Self-Consistent CoT, +Tree-of-Thought, and Selection-Inference, offering a broad perspective on the +efficacy of these methods. We also provide numerical experiments to validate +the theoretical findings. + +
+
+ comment: 150 pages, 18 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Stick to your Role! Stability of Personal Values Expressed in Large + Language Models + + +
+ The standard way to study Large Language Models (LLMs) with benchmarks or +psychology questionnaires is to provide many different queries from similar +minimal contexts (e.g. multiple choice questions). However, due to LLMs' highly +context-dependent nature, conclusions from such minimal-context evaluations may +be little informative about the model's behavior in deployment (where it will +be exposed to many new contexts). We argue that context-dependence +(specifically, value stability) should be studied as a specific property of +LLMs and used as another dimension of LLM comparison (alongside others such as +cognitive abilities, knowledge, or model size). We present a case-study on the +stability of value expression over different contexts (simulated conversations +on different topics) as measured using a standard psychology questionnaire +(PVQ) and on behavioral downstream tasks. Reusing methods from psychology, we +study Rank-order stability on the population (interpersonal) level, and +Ipsative stability on the individual (intrapersonal) level. We consider two +settings (with and without instructing LLMs to simulate particular personas), +two simulated populations, and three downstream tasks. We observe consistent +trends in the stability of models and model families - Mixtral, Mistral, +GPT-3.5 and Qwen families are more stable than LLaMa-2 and Phi. The consistency +of these trends implies that some models exhibit higher value stability than +others, and that stability can be estimated with the set of introduced +methodological tools. When instructed to simulate particular personas, LLMs +exhibit low Rank-order stability, which further diminishes with conversation +length. This highlights the need for future research on LLMs that coherently +simulate different personas. This paper provides a foundational step in that +direction, and, to our knowledge, it is the first study of value stability in +LLMs. + +
+
+ comment: The project website and code are available at + https://sites.google.com/view/llmvaluestability Published in PLOS ONE ( + https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0309114 ), + and a shorter version at CogSci 24 ( + https://escholarship.org/uc/item/7w4823c6 ) +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models on Spatial Tasks: A Multi-Task + Benchmarking Study + + +
+ The advent of large language models such as ChatGPT, Gemini, and others has +underscored the importance of evaluating their diverse capabilities, ranging +from natural language understanding to code generation. However, their +performance on spatial tasks has not been comprehensively assessed. This study +addresses this gap by introducing a novel multi-task spatial evaluation +dataset, designed to systematically explore and compare the performance of +several advanced models on spatial tasks. The dataset encompasses twelve +distinct task types, including spatial understanding and path planning, each +with verified, accurate answers. We evaluated multiple models, including +OpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase +testing approach. Initially, we conducted zero-shot testing, followed by +categorizing the dataset by difficulty and performing prompt tuning tests. +Results indicate that gpt-4o achieved the highest overall accuracy in the first +phase, with an average of 71.3%. Although moonshot-v1-8k slightly +underperformed overall, it surpassed gpt-4o in place name recognition tasks. +The study also highlights the impact of prompt strategies on model performance +in specific tasks. For example, the Chain-of-Thought (COT) strategy increased +gpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot +strategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to +76.3%. + +
+
+
+
+
+ + ♻ ☆ Language-specific Calibration for Pruning Multilingual Language Models + + +
+ Recent advances in large language model (LLM) pruning have shown +state-of-the-art compression results in post-training and retraining-free +settings while maintaining high predictive performance. However, such research +mainly considers calibrating pruning using English text, despite the +multilingual nature of modern LLMs and their frequent uses in non-English +languages. In this paper, we set out to explore effective strategies for +calibrating the pruning of multilingual language models. We present the first +comprehensive empirical study, comparing different calibration languages for +pruning multilingual models across diverse tasks, models, and state-of-the-art +pruning techniques. Our results present practical suggestions, for example, +calibrating in the target language can efficiently yield lower perplexity, but +does not necessarily benefit downstream tasks. Our further analysis experiments +unveil that calibration in the target language mainly contributes to preserving +language-specific features related to fluency and coherence, but might not +contribute to capturing language-agnostic features such as language +understanding and reasoning. Last, we provide practical recommendations for +future practitioners. + +
+
+
+
+
+ + ♻ ☆ Evading AI-Generated Content Detectors using Homoglyphs + + +
+ The advent of large language models (LLMs) has enabled the generation of text +that increasingly exhibits human-like characteristics. As the detection of such +content is of significant importance, numerous studies have been conducted with +the aim of developing reliable AI-generated text detectors. These detectors +have demonstrated promising results on test data, but recent research has +revealed that they can be circumvented by employing different techniques. In +this paper, we present homoglyph-based attacks ($a \rightarrow {\alpha}$) as a +means of circumventing existing detectors. A comprehensive evaluation was +conducted to assess the effectiveness of these attacks on seven detectors, +including ArguGPT, Binoculars, DetectGPT, Fast-DetectGPT, Ghostbuster, OpenAI's +detector, and watermarking techniques, on five different datasets. Our findings +demonstrate that homoglyph-based attacks can effectively circumvent +state-of-the-art detectors, leading them to classify all texts as either +AI-generated or human-written (decreasing the average Matthews Correlation +Coefficient from 0.64 to -0.01). We then examine the effectiveness of these +attacks by analyzing how homoglyphs impact different families of detectors. +Finally, we discuss the implications of these findings and potential defenses +against such attacks. + +
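+ For intuition, a minimal homoglyph substitution looks like the sketch below: a handful of Latin characters are mapped to visually similar Greek or Cyrillic code points, which leaves the text readable to humans but changes the character (and hence token) stream a detector observes. The mapping is a tiny illustrative subset, not the attack configuration evaluated in the paper.

```python
# Minimal illustration of homoglyph substitution (the "a -> alpha" example from the abstract).
HOMOGLYPHS = {
    "a": "\u03b1",  # Greek small alpha
    "e": "\u0435",  # Cyrillic small ie
    "o": "\u043e",  # Cyrillic small o
    "p": "\u0440",  # Cyrillic small er
}

def homoglyph_substitute(text: str) -> str:
    return "".join(HOMOGLYPHS.get(c, c) for c in text)

original = "The rapid development of language models poses new detection challenges."
print(homoglyph_substitute(original))
```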
+
+
+
+
+ + ♻ ☆ Deciphering the Impact of Pretraining Data on Large Language Models + through Machine Unlearning ACL 2024 + + +
+ Through pretraining on a corpus with various sources, Large Language Models +(LLMs) have gained impressive performance. However, the impact of each +component of the pretraining corpus remains opaque. As a result, the +organization of the pretraining corpus is still empirical and may deviate from +the optimal. To address this issue, we systematically analyze the impact of 48 +datasets from 5 major categories of pretraining data of LLMs and measure their +impacts on LLMs using benchmarks about nine major categories of model +capabilities. Our analyses provide empirical results about the contribution of +multiple corpora on the performances of LLMs, along with their joint impact +patterns, including complementary, orthogonal, and correlational relationships. +We also identify a set of ``high-impact data'' such as Books that is +significantly related to a set of model capabilities. These findings provide +insights into the organization of data to support more efficient pretraining of +LLMs. + +
+
+ comment: Accepted by ACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ PASH at TREC 2021 Deep Learning Track: Generative Enhanced Model for + Multi-stage Ranking + + +
+ This paper describes the PASH participation in the TREC 2021 Deep Learning Track. In the recall stage, we adopt a scheme combining sparse and dense retrieval methods. In the multi-stage ranking phase, point-wise and pair-wise ranking strategies are used one after another, based on models continually pre-trained on general knowledge and document-level data. Compared to the TREC 2020 Deep Learning Track, we have additionally introduced the generative model T5 to further enhance the performance.
+
+ comment: TREC 2021 +
+
+
+
+
+ + ♻ ☆ Large Language Model Sentinel: LLM Agent for Adversarial Purification + + +
+ Over the past two years, the use of large language models (LLMs) has advanced +rapidly. While these LLMs offer considerable convenience, they also raise +security concerns, as LLMs are vulnerable to adversarial attacks by some +well-designed textual perturbations. In this paper, we introduce a novel +defense technique named Large LAnguage MOdel Sentinel (LLAMOS), which is +designed to enhance the adversarial robustness of LLMs by purifying the +adversarial textual examples before feeding them into the target LLM. Our +method comprises two main components: a) Agent instruction, which can simulate +a new agent for adversarial defense, altering minimal characters to maintain +the original meaning of the sentence while defending against attacks; b) +Defense guidance, which provides strategies for modifying clean or adversarial +examples to ensure effective defense and accurate outputs from the target LLMs. +Remarkably, the defense agent demonstrates robust defensive capabilities even +without learning from adversarial examples. Additionally, we conduct an +intriguing adversarial experiment where we develop two agents, one for defense +and one for attack, and engage them in mutual confrontation. During the +adversarial interactions, neither agent completely beat the other. Extensive +experiments on both open-source and closed-source LLMs demonstrate that our +method effectively defends against adversarial attacks, thereby enhancing +adversarial robustness. + +
+
+
+
+
+ + ♻ ☆ AI-native Memory: A Pathway from LLMs Towards AGI + + +
+ Large language models (LLMs) have shown the world sparks of artificial general intelligence (AGI). One opinion, especially from some startups working on LLMs, argues that an LLM with nearly unlimited context length can realize AGI. However, they might be too optimistic about the long-context capability of (existing) LLMs -- (1) recent literature has shown that their effective context length is significantly smaller than their claimed context length; and (2) our reasoning-in-a-haystack experiments further demonstrate that simultaneously finding the relevant information in a long context and conducting (simple) reasoning is nearly impossible. In this paper, we envision a pathway from LLMs to AGI through the integration of memory. We believe that AGI should be a system where LLMs serve as core processors. In addition to raw data, the memory in this system would store a large number of important conclusions derived from reasoning processes. Compared with retrieval-augmented generation (RAG), which merely processes raw data, this approach not only connects semantically related information more closely, but also simplifies complex inferences at the time of querying. As an intermediate stage, the memory will likely be in the form of natural language descriptions, which can be directly consumed by users too. Ultimately, every agent/person should have its own large personal model, a deep neural network model (thus AI-native) that parameterizes and compresses all types of memory, even those that cannot be described in natural language. Finally, we discuss the significant potential of AI-native memory as the transformative infrastructure for (proactive) engagement, personalization, distribution, and social interaction in the AGI era, as well as the privacy and security challenges it incurs, along with preliminary solutions.
+
+
+
+
+ + ♻ ☆ SkyScript-100M: 1,000,000,000 Pairs of Scripts and Shooting Scripts for + Short Drama + + +
+ Generating high-quality shooting scripts containing information such as scene +and shot language is essential for short drama script generation. We collect +6,660 popular short drama episodes from the Internet, each with an average of +100 short episodes, and the total number of short episodes is about 80,000, +with a total duration of about 2,000 hours and totaling 10 terabytes (TB). We +perform keyframe extraction and annotation on each episode to obtain about +10,000,000 shooting scripts. We perform 100 script restorations on the +extracted shooting scripts based on our self-developed large short drama +generation model SkyReels. This leads to a dataset containing 1,000,000,000 +pairs of scripts and shooting scripts for short dramas, called SkyScript-100M. +We compare SkyScript-100M with the existing dataset in detail and demonstrate +some deeper insights that can be achieved based on SkyScript-100M. Based on +SkyScript-100M, researchers can achieve several deeper and more far-reaching +script optimization goals, which may drive a paradigm shift in the entire field +of text-to-video and significantly advance the field of short drama video +generation. The data and code are available at +https://github.com/vaew/SkyScript-100M. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with + Flow-based Scalar Latent Transformer Diffusion Models + + +
+ Scaling Text-to-speech (TTS) to large-scale datasets has been demonstrated as +an effective method for improving the diversity and naturalness of synthesized +speech. At a high level, previous large-scale TTS models can be categorized +into either Auto-regressive (AR) based (\textit{e.g.}, VALL-E) or +Non-auto-regressive (NAR) based models (\textit{e.g.}, NaturalSpeech 2/3). +Although these works demonstrate good performance, they still have potential +weaknesses. For instance, AR-based models are plagued by unstable generation +quality and slow generation speed; meanwhile, some NAR-based models need +phoneme-level duration alignment information, thereby increasing the complexity +of data pre-processing, model design, and loss design. In this work, we build +upon our previous publication by implementing a simple and efficient +non-autoregressive (NAR) TTS framework, termed SimpleSpeech 2. SimpleSpeech 2 +effectively combines the strengths of both autoregressive (AR) and +non-autoregressive (NAR) methods, offering the following key advantages: (1) +simplified data preparation; (2) straightforward model and loss design; and (3) +stable, high-quality generation performance with fast inference speed. Compared +to our previous publication, we present ({\romannumeral1}) a detailed analysis +of the influence of the speech tokenizer and noisy labels on TTS performance; +({\romannumeral2}) four distinct types of sentence duration predictors; +({\romannumeral3}) a novel flow-based scalar latent transformer diffusion +model. With these improvements, we show a significant improvement in generation +performance and generation speed compared to our previous work and other +state-of-the-art (SOTA) large-scale TTS models. Furthermore, we show that +SimpleSpeech 2 can be seamlessly extended to multilingual TTS by training it on +multilingual speech datasets. Demos are available at: +{https://dongchaoyang.top/SimpleSpeech2\_demo/}. +
+
+ comment: Submit to TASLP +
+
+
+
+
+ + ♻ ☆ xGen-MM (BLIP-3): A Family of Open Large Multimodal Models + + +
+ This report introduces xGen-MM (also known as BLIP-3), a framework for +developing Large Multimodal Models (LMMs). The framework comprises meticulously +curated datasets, a training recipe, model architectures, and a resulting suite +of LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen +initiative on foundation AI models. Our models undergo rigorous evaluation +across a range of tasks, including both single and multi-image benchmarks. Our +pre-trained base model exhibits strong in-context learning capabilities and the +instruction-tuned model demonstrates competitive performance among open-source +LMMs with similar model sizes. In addition, we introduce a safety-tuned model +with DPO, aiming to mitigate harmful behaviors such as hallucinations and +improve safety. We open-source our models, curated large-scale datasets, and +our fine-tuning codebase to facilitate further advancements in LMM research. +Associated resources will be available on our project page above. + +
+
+
+
+
+ + ♻ ☆ A Survey of Large Language Models for European Languages + + +
+ Large Language Models (LLMs) have gained significant attention due to their +high performance on a wide range of natural language tasks since the release of +ChatGPT. The LLMs learn to understand and generate language by training +billions of model parameters on vast volumes of text data. Despite being a +relatively new field, LLM research is rapidly advancing in various directions. +In this paper, we present an overview of LLM families, including LLaMA, PaLM, +GPT, and MoE, and the methods developed to create and enhance LLMs for official +European Union (EU) languages. We provide a comprehensive summary of common +monolingual and multilingual datasets used for pretraining large language +models. + +
+
+
+
+
+ + ♻ ☆ WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation + Integrating Web Search and Knowledge Graphs KDD + + +
+ Large Language Models (LLMs) have greatly contributed to the development of +adaptive intelligent agents and are positioned as an important way to achieve +Artificial General Intelligence (AGI). However, LLMs are prone to produce +factually incorrect information and often produce "phantom" content that +undermines their reliability, which poses a serious challenge for their +deployment in real-world scenarios. Enhancing LLMs by combining external +databases and information retrieval mechanisms is an effective path. To address +the above challenges, we propose a new approach called WeKnow-RAG, which +integrates Web search and Knowledge Graphs into a "Retrieval-Augmented +Generation (RAG)" system. First, the accuracy and reliability of LLM responses +are improved by combining the structured representation of Knowledge Graphs +with the flexibility of dense vector retrieval. WeKnow-RAG then utilizes +domain-specific knowledge graphs to satisfy a variety of queries and domains, +thereby improving performance on factual information and complex reasoning +tasks by employing multi-stage web page retrieval techniques using both sparse +and dense retrieval methods. Our approach effectively balances the efficiency +and accuracy of information retrieval, thus improving the overall retrieval +process. Finally, we also integrate a self-assessment mechanism for the LLM to +evaluate the trustworthiness of the answers it generates. Our approach proves +its outstanding effectiveness in a wide range of offline experiments and online +submissions. + +
+
+ comment: 8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta + KDD Cup 2024 CRAG Challenge +
+
+
+
+
+ + ♻ ☆ Large Language Models Understand Layout ECAI-2024 + + +
+ Large language models (LLMs) demonstrate extraordinary abilities in a wide +range of natural language processing (NLP) tasks. In this paper, we show that, +beyond text understanding capability, LLMs are capable of processing text +layouts that are denoted by spatial markers. They are able to answer questions +that require explicit spatial perceiving and reasoning, while a drastic +performance drop is observed when the spatial markers from the original data +are excluded. We perform a series of experiments with the GPT-3.5, Baichuan2, +Llama2 and ChatGLM3 models on various types of layout-sensitive datasets for +further analysis. The experimental results reveal that the layout understanding +ability of LLMs is mainly introduced by the coding data for pretraining, which +is further enhanced at the instruction-tuning stage. In addition, layout +understanding can be enhanced by integrating low-cost, auto-generated data +approached by a novel text game. Finally, we show that layout understanding +ability is beneficial for building efficient visual question-answering (VQA) +systems. + +
+
+ comment: This paper has been accepted by ECAI-2024 +
+
+
+
+
+ + ♻ ☆ VHAKG: A Multi-modal Knowledge Graph Based on Synchronized Multi-view + Videos of Daily Activities CIKM2024 + + +
+ Multi-modal knowledge graphs (MMKGs), which ground various non-symbolic data +(e.g., images and videos) into symbols, have attracted attention as resources +enabling knowledge processing and machine learning across modalities. However, +the construction of MMKGs for videos consisting of multiple events, such as +daily activities, is still in the early stages. In this paper, we construct an +MMKG based on synchronized multi-view simulated videos of daily activities. +Besides representing the content of daily life videos as event-centric +knowledge, our MMKG also includes frame-by-frame fine-grained changes, such as +bounding boxes within video frames. In addition, we provide support tools for +querying our MMKG. As an application example, we demonstrate that our MMKG +facilitates benchmarking vision-language models by providing the necessary +vision-language datasets for a tailored task. + +
+
+ comment: 5 pages, 4 figures, accepted by CIKM2024 Resource Track +
+
+
+
+
+ + ♻ ☆ SurGen: Text-Guided Diffusion Model for Surgical Video Generation + + +
+ Diffusion-based video generation models have made significant strides, +producing outputs with improved visual fidelity, temporal coherence, and user +control. These advancements hold great promise for improving surgical education +by enabling more realistic, diverse, and interactive simulation environments. +In this study, we introduce SurGen, a text-guided diffusion model tailored for +surgical video synthesis, producing the highest resolution and longest duration +videos among existing surgical video generation models. We validate the visual +and temporal quality of the outputs using standard image and video generation +metrics. Additionally, we assess their alignment to the corresponding text +prompts through a deep learning classifier trained on surgical data. Our +results demonstrate the potential of diffusion models to serve as valuable +educational tools for surgical trainees. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 148 + +
+
+
+ + ☆ Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of + Encoders + + +
+ The ability to accurately interpret complex visual information is a crucial +topic of multimodal large language models (MLLMs). Recent work indicates that +enhanced visual perception significantly reduces hallucinations and improves +performance on resolution-sensitive tasks, such as optical character +recognition and document analysis. A number of recent MLLMs achieve this goal +using a mixture of vision encoders. Despite their success, there is a lack of +systematic comparisons and detailed ablation studies addressing critical +aspects, such as expert selection and the integration of multiple vision +experts. This study provides an extensive exploration of the design space for +MLLMs using a mixture of vision encoders and resolutions. Our findings reveal +several underlying principles common to various existing strategies, leading to +a streamlined yet effective design approach. We discover that simply +concatenating visual tokens from a set of complementary vision encoders is as +effective as more complex mixing architectures or strategies. We additionally +introduce Pre-Alignment to bridge the gap between vision-focused encoders and +language tokens, enhancing model coherence. The resulting family of MLLMs, +Eagle, surpasses other leading open-source models on major MLLM benchmarks. +Models and code: https://github.com/NVlabs/Eagle + +
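+ The token-concatenation finding described above can be sketched roughly as follows; this is an illustrative PyTorch fragment, not the released Eagle code, and the encoder and LLM dimensions are made-up placeholders.

```python
import torch
import torch.nn as nn

class ChannelConcatMixer(nn.Module):
    """Illustrative sketch: project tokens from each vision encoder to a shared
    width and concatenate them along the channel dimension."""

    def __init__(self, encoder_dims, llm_dim):
        super().__init__()
        self.projs = nn.ModuleList([nn.Linear(d, llm_dim) for d in encoder_dims])

    def forward(self, token_lists):
        # token_lists[i]: (batch, num_tokens, encoder_dims[i]); this sketch assumes
        # every encoder emits the same number of tokens (e.g. after pooling/resizing).
        projected = [proj(t) for proj, t in zip(self.projs, token_lists)]
        return torch.cat(projected, dim=-1)  # (batch, num_tokens, llm_dim * num_encoders)

# Hypothetical usage with two encoders emitting 1024- and 768-dim tokens:
# mixer = ChannelConcatMixer([1024, 768], llm_dim=4096)
# fused = mixer([clip_tokens, convnext_tokens])
```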
+
+ comment: Github: https://github.com/NVlabs/Eagle, HuggingFace: + https://huggingface.co/NVEagle +
+
+
+
+
+ + ☆ Spatio-Temporal Context Prompting for Zero-Shot Action Detection + + +
+ Spatio-temporal action detection encompasses the tasks of localizing and +classifying individual actions within a video. Recent works aim to enhance this +process by incorporating interaction modeling, which captures the relationship +between people and their surrounding context. However, these approaches have +primarily focused on fully-supervised learning, and the current limitation lies +in the lack of generalization capability to recognize unseen action categories. +In this paper, we aim to adapt the pretrained image-language models to detect +unseen actions. To this end, we propose a method which can effectively leverage +the rich knowledge of visual-language models to perform Person-Context +Interaction. Meanwhile, our Context Prompting module will utilize contextual +information to prompt labels, thereby enhancing the generation of more +representative text features. Moreover, to address the challenge of recognizing +distinct actions by multiple people at the same timestamp, we design the +Interest Token Spotting mechanism which employs pretrained visual knowledge to +find each person's interest context tokens, and then these tokens will be used +for prompting to generate text features tailored to each individual. To +evaluate the ability to detect unseen actions, we propose a comprehensive +benchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our +method achieves superior results compared to previous approaches and can be +further extended to multi-action videos, bringing it closer to real-world +applications. The code and data can be found in +https://webber2933.github.io/ST-CLIP-project-page. + +
+
+
+
+
+ + ☆ TEDRA: Text-based Editing of Dynamic and Photoreal Actors + + +
+ Over the past years, significant progress has been made in creating +photorealistic and drivable 3D avatars solely from videos of real humans. +However, a core remaining challenge is the fine-grained and user-friendly +editing of clothing styles by means of textual descriptions. To this end, we +present TEDRA, the first method allowing text-based edits of an avatar, which +maintains the avatar's high fidelity, space-time coherency, as well as +dynamics, and enables skeletal pose and view control. We begin by training a +model to create a controllable and high-fidelity digital replica of the real +actor. Next, we personalize a pretrained generative diffusion model by +fine-tuning it on various frames of the real character captured from different +camera angles, ensuring the digital representation faithfully captures the +dynamics and movements of the real person. This two-stage process lays the +foundation for our approach to dynamic human avatar editing. Utilizing this +personalized diffusion model, we modify the dynamic avatar based on a provided +text prompt using our Personalized Normal Aligned Score Distillation Sampling +(PNA-SDS) within a model-based guidance framework. Additionally, we propose a +time step annealing strategy to ensure high-quality edits. Our results +demonstrate a clear improvement over prior work in functionality and visual +quality. + +
+
+ comment: For project page, see this https://vcai.mpi-inf.mpg.de/projects/Tedra +
+
+
+
+
+ + ☆ Perceive-IR: Learning to Perceive Degradation Better for All-in-One + Image Restoration + + +
+ The limitations of task-specific and general image restoration methods for +specific degradations have prompted the development of all-in-one image +restoration techniques. However, the diversity of patterns among multiple +degradations, along with the significant uncertainties in mapping between +degraded images of different severities and their corresponding undistorted +versions, poses significant challenges to all-in-one restoration tasks. To +address these challenges, we propose Perceive-IR, an all-in-one image restorer +designed to achieve fine-grained quality control that enables restored images +to more closely resemble their undistorted counterparts, regardless of the type +or severity of degradation. Specifically, Perceive-IR contains two stages: (1) +a prompt learning stage and (2) a restoration stage. In the prompt learning stage, +we leverage prompt learning to acquire a fine-grained quality perceiver capable +of distinguishing three-tier quality levels by constraining the prompt-image +similarity in the CLIP perception space. Subsequently, this quality perceiver +and a difficulty-adaptive perceptual loss are integrated as a quality-aware +learning strategy to realize fine-grained quality control in the restoration stage. +For the restoration stage, a semantic guidance module (SGM) and compact feature +extraction (CFE) are proposed to further promote the restoration process by +utilizing the robust semantic information from pre-trained large-scale +vision models and distinguishing degradation-specific features. Extensive +experiments demonstrate that our Perceive-IR outperforms state-of-the-art +methods in all-in-one image restoration tasks and exhibits superior +generalization ability when dealing with unseen tasks. +
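+ The prompt-image similarity idea in the CLIP perception space can be illustrated with an off-the-shelf CLIP checkpoint as below; the checkpoint name, quality prompts, and image path are placeholders, and the paper learns its prompts rather than using fixed text.

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Illustrative only: the paper learns quality prompts; here we use fixed text.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

quality_prompts = ["a high quality photo",
                   "a slightly degraded photo",
                   "a heavily degraded photo"]
image = Image.open("restored.png")  # hypothetical restored image path

inputs = processor(text=quality_prompts, images=image,
                   return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(**inputs).logits_per_image  # (1, 3) image-text similarities
probs = logits.softmax(dim=-1)
print(dict(zip(quality_prompts, probs[0].tolist())))
```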
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ ClimDetect: A Benchmark Dataset for Climate Change Detection and + Attribution + + +
+ Detecting and attributing temperature increases due to climate change is +crucial for understanding global warming and guiding adaptation strategies. The +complexity of distinguishing human-induced climate signals from natural +variability has challenged traditional detection and attribution (D&A) +approaches, which seek to identify specific "fingerprints" in climate response +variables. Deep learning offers potential for discerning these complex patterns +in expansive spatial datasets. However, a lack of standard protocols has hindered +consistent comparisons across studies. We introduce ClimDetect, a standardized +dataset of over 816k daily climate snapshots, designed to enhance model +accuracy in identifying climate change signals. ClimDetect integrates various +input and target variables used in past research, ensuring comparability and +consistency. We also explore the application of vision transformers (ViT) to +climate data, a novel and modernizing approach in this context. Our open-access +data and code serve as a benchmark for advancing climate science through +improved model evaluations. ClimDetect is publicly accessible via the Hugging Face +dataset repository at: https://huggingface.co/datasets/ClimDetect/ClimDetect. +
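+ Since the dataset is hosted on the Hugging Face Hub, loading it would presumably look like the snippet below; only the repository id comes from the abstract, and the available splits and column names are not specified here.

```python
# Hedged sketch: load the ClimDetect dataset from the Hugging Face Hub.
# Only the repository id is taken from the abstract; splits/columns may differ.
from datasets import load_dataset

ds = load_dataset("ClimDetect/ClimDetect")  # downloads and caches locally
print(ds)                                   # inspect available splits and features
```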
+
+
+
+
+ + ☆ CoGen: Learning from Feedback with Coupled Comprehension and Generation + + +
+ Systems with both language comprehension and generation capabilities can +benefit from the tight connection between the two. This work studies coupling +comprehension and generation with focus on continually learning from +interaction with users. We propose techniques to tightly integrate the two +capabilities for both learning and inference. We situate our studies in +two-player reference games, and deploy various models for thousands of +interactions with human users, while learning from interaction feedback +signals. We show dramatic improvements in performance over time, with +comprehension-generation coupling leading to performance improvements up to 26% +in absolute terms and up to 17% higher accuracies compared to a non-coupled +system. Our analysis also shows coupling has substantial qualitative impact on +the system's language, making it significantly more human-like. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Distribution Backtracking Builds A Faster Convergence Trajectory for + One-step Diffusion Distillation + + +
+ Accelerating the sampling speed of diffusion models remains a significant +challenge. Recent score distillation methods distill a heavy teacher model into +a one-step student generator, which is optimized by calculating the difference +between the two score functions on the samples generated by the student model. +However, there is a score mismatch issue in the early stage of the distillation +process, because existing methods mainly focus on using the endpoint of +pre-trained diffusion models as teacher models, overlooking the importance of +the convergence trajectory between the student generator and the teacher model. +To address this issue, we extend the score distillation process by introducing +the entire convergence trajectory of teacher models and propose Distribution +Backtracking Distillation (DisBack) for distilling student generators. DisBack +is composed of two stages: Degradation Recording and Distribution Backtracking. +Degradation Recording is designed to obtain the convergence trajectory of +teacher models, which records the degradation path from the trained teacher +model to the untrained initial student generator. The degradation path +implicitly represents the intermediate distributions of teacher models. Then +Distribution Backtracking trains a student generator to backtrack the +intermediate distributions for approximating the convergence trajectory of +teacher models. Extensive experiments show that DisBack achieves faster and +better convergence than existing distillation methods and accomplishes +comparable generation performance. Notably, DisBack is easy to implement and +can be generalized to existing distillation methods to boost performance. Our +code is publicly available on https://github.com/SYZhang0805/DisBack. +
+
+
+
+
+ + ☆ More Text, Less Point: Towards 3D Data-Efficient Point-Language + Understanding + + +
+ Enabling Large Language Models (LLMs) to comprehend the 3D physical world +remains a significant challenge. Due to the lack of large-scale 3D-text pair +datasets, the success of LLMs has yet to be replicated in 3D understanding. In +this paper, we rethink this issue and propose a new task: 3D Data-Efficient +Point-Language Understanding. The goal is to enable LLMs to achieve robust 3D +object understanding with minimal 3D point cloud and text data pairs. To +address this task, we introduce GreenPLM, which leverages more text data to +compensate for the lack of 3D data. First, inspired by using CLIP to align +images and text, we utilize a pre-trained point cloud-text encoder to map the +3D point cloud space to the text space. This mapping leaves us to seamlessly +connect the text space with LLMs. Once the point-text-LLM connection is +established, we further enhance text-LLM alignment by expanding the +intermediate text space, thereby reducing the reliance on 3D point cloud data. +Specifically, we generate 6M free-text descriptions of 3D objects, and design a +three-stage training strategy to help LLMs better explore the intrinsic +connections between different modalities. To achieve efficient modality +alignment, we design a zero-parameter cross-attention module for token pooling. +Extensive experimental results show that GreenPLM requires only 12% of the 3D +training data used by existing state-of-the-art models to achieve superior 3D +understanding. Remarkably, GreenPLM also achieves competitive performance using +text-only data. The code and weights are available at: +https://github.com/TangYuan96/GreenPLM. + +
+
+
+
+
+ + ☆ Efficient Slice Anomaly Detection Network for 3D Brain MRI Volume + + +
+ Current anomaly detection methods excel with benchmark industrial data but +struggle with natural images and medical data due to varying definitions of +'normal' and 'abnormal.' This makes accurate identification of deviations in +these fields particularly challenging. Especially for 3D brain MRI data, all +the state-of-the-art models are reconstruction-based with 3D convolutional +neural networks, which are memory-intensive and time-consuming and produce noisy +outputs that require further post-processing. We propose a framework called +Simple Slice-based Network (SimpleSliceNet), which utilizes a model pre-trained +on ImageNet and fine-tuned on a separate MRI dataset as a 2D slice feature +extractor to reduce computational cost. We aggregate the extracted features to +perform anomaly detection tasks on 3D brain MRI volumes. Our model integrates a +conditional normalizing flow to calculate the log-likelihood of features and +employs the Semi-Push-Pull Mechanism to enhance anomaly detection accuracy. The +results indicate improved performance, showcasing our model's remarkable +adaptability and effectiveness when addressing the challenges that exist in brain +MRI data. In addition, for large-scale 3D brain volumes, our model +SimpleSliceNet outperforms the state-of-the-art 2D and 3D models in terms of +accuracy, memory usage and time consumption. Code is available at: +https://anonymous.4open.science/r/SimpleSliceNet-8EA3. +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Generating Binary Species Range Maps + + +
+ Accurately predicting the geographic ranges of species is crucial for +assisting conservation efforts. Traditionally, range maps were manually created +by experts. However, species distribution models (SDMs) and, more recently, +deep learning-based variants offer a potential automated alternative. Deep +learning-based SDMs generate a continuous probability representing the +predicted presence of a species at a given location, which must be binarized by +setting per-species thresholds to obtain binary range maps. However, selecting +appropriate per-species thresholds to binarize these predictions is non-trivial +as different species can require distinct thresholds. In this work, we evaluate +different approaches for automatically identifying the best thresholds for +binarizing range maps using presence-only data. This includes approaches that +require the generation of additional pseudo-absence data, along with ones that +only require presence data. We also propose an extension of an existing +presence-only technique that is more robust to outliers. We perform a detailed +evaluation of different thresholding techniques on the tasks of binary range +estimation and large-scale fine-grained visual classification, and we +demonstrate improved performance over existing pseudo-absence free approaches +using our method. + +
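+ A common presence-only baseline for the thresholding problem above is to pick, per species, the score quantile that retains a fixed fraction of presence predictions; the sketch below illustrates that baseline only and is not the outlier-robust extension proposed in the paper.

```python
import numpy as np

def presence_quantile_threshold(presence_scores: np.ndarray, keep: float = 0.95) -> float:
    """Illustrative presence-only baseline: choose the threshold so that `keep`
    (e.g. 95%) of scores at known presence locations stay above it.
    The paper's robust extension differs; this only conveys the idea."""
    return float(np.quantile(presence_scores, 1.0 - keep))

def binarize_range(prob_map: np.ndarray, threshold: float) -> np.ndarray:
    """Turn a continuous presence-probability map into a binary range map."""
    return (prob_map >= threshold).astype(np.uint8)

# Hypothetical usage:
# thr = presence_quantile_threshold(model_scores_at_presences, keep=0.95)
# range_map = binarize_range(predicted_prob_grid, thr)
```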
+
+
+
+
+ + ☆ Fall Detection for Smart Living using YOLOv5 + + +
+ This work introduces a fall detection system using the YOLOv5mu model, which +achieved a mean average precision (mAP) of 0.995, demonstrating exceptional +accuracy in identifying fall events within smart home environments. Enhanced by +advanced data augmentation techniques, the model demonstrates significant +robustness and adaptability across various conditions. The integration of +YOLOv5mu offers precise, real-time fall detection, which is crucial for +improving safety and emergency response for residents. Future research will +focus on refining the system by incorporating contextual data and exploring +multi-sensor approaches to enhance its performance and practical applicability +in diverse environments. + +
+
+
+
+
+ + ☆ InstanSeg: an embedding-based instance segmentation algorithm optimized + for accurate, efficient and portable cell segmentation + + +
+ Cell and nucleus segmentation are fundamental tasks for quantitative bioimage +analysis. Despite progress in recent years, biologists and other domain experts +still require novel algorithms to handle increasingly large and complex +real-world datasets. These algorithms must not only achieve state-of-the-art +accuracy, but also be optimized for efficiency, portability and +user-friendliness. Here, we introduce InstanSeg: a novel embedding-based +instance segmentation pipeline designed to identify cells and nuclei in +microscopy images. Using six public cell segmentation datasets, we demonstrate +that InstanSeg can significantly improve accuracy when compared to the most +widely used alternative methods, while reducing the processing time by at least +60%. Furthermore, InstanSeg is designed to be fully serializable as TorchScript +and supports GPU acceleration on a range of hardware. We provide an open-source +implementation of InstanSeg in Python, in addition to a user-friendly, +interactive QuPath extension for inference written in Java. Our code and +pre-trained models are available at https://github.com/instanseg/instanseg . + +
+
+ comment: 12 pages,6 figures +
+
+
+
+
+ + ☆ Auxiliary Input in Training: Incorporating Catheter Features into Deep + Learning Models for ECG-Free Dynamic Coronary Roadmapping MICCAI 2024 + + +
+ Dynamic coronary roadmapping is a technology that overlays the vessel maps +(the "roadmap") extracted from an offline image sequence of X-ray angiography +onto a live stream of X-ray fluoroscopy in real-time. It aims to offer +navigational guidance for interventional surgeries without the need for +repeated contrast agent injections, thereby reducing the risks associated with +radiation exposure and kidney failure. The precision of the roadmaps is +contingent upon the accurate alignment of angiographic and fluoroscopic images +based on their cardiac phases, as well as precise catheter tip tracking. The +former ensures the selection of a roadmap that closely matches the vessel shape +in the current frame, while the latter uses catheter tips as reference points +to adjust for translational motion between the roadmap and the present vessel +tree. Training deep learning models for both tasks is challenging and +underexplored. However, incorporating catheter features into the models could +offer substantial benefits, given humans heavily rely on catheters to complete +the tasks. To this end, we introduce a simple but effective method, auxiliary +input in training (AIT), and demonstrate that it enhances model performance +across both tasks, outperforming baseline methods in knowledge incorporation +and transfer learning. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ☆ Sigma Flows for Image and Data Labeling and Learning Structured + Prediction + + +
+ This paper introduces the sigma flow model for the prediction of structured +labelings of data observed on Riemannian manifolds, including Euclidean image +domains as special case. The approach combines the Laplace-Beltrami framework +for image denoising and enhancement, introduced by Sochen, Kimmel and Malladi +about 25 years ago, and the assignment flow approach introduced and studied by +the authors. + The sigma flow arises as Riemannian gradient flow of generalized harmonic +energies and thus is governed by a nonlinear geometric PDE which determines a +harmonic map from a closed Riemannian domain manifold to a statistical +manifold, equipped with the Fisher-Rao metric from information geometry. A +specific ingredient of the sigma flow is the mutual dependency of the +Riemannian metric of the domain manifold on the evolving state. This makes the +approach amenable to machine learning in a specific way, by realizing this +dependency through a mapping with compact time-variant parametrization that can +be learned from data. Proof of concept experiments demonstrate the expressivity +of the sigma flow model and prediction performance. + Structural similarities to transformer network architectures and networks +generated by the geometric integration of sigma flows are pointed out, which +highlights the connection to deep learning and, conversely, may stimulate the +use of geometric design principles for structured prediction in other areas of +scientific machine learning. + +
+
+ comment: 51 pages +
+
+
+
+
+ + ☆ Local Descriptors Weighted Adaptive Threshold Filtering For Few-Shot + Learning + + +
+ Few-shot image classification is a challenging task in the field of machine +learning, involving the identification of new categories using a limited number +of labeled samples. In recent years, methods based on local descriptors have +made significant progress in this area. However, the key to improving +classification accuracy lies in effectively filtering background noise and +accurately selecting critical local descriptors highly relevant to image +category information. + To address this challenge, we propose an innovative weighted adaptive +threshold filtering (WATF) strategy for local descriptors. This strategy can +dynamically adjust based on the current task and image context, thereby +selecting local descriptors most relevant to the image category. This enables +the model to better focus on category-related information while effectively +mitigating interference from irrelevant background regions. + To evaluate the effectiveness of our method, we adopted the N-way K-shot +experimental framework. Experimental results show that our method not only +improves the clustering effect of selected local descriptors but also +significantly enhances the discriminative ability between image categories. +Notably, our method maintains a simple and lightweight design philosophy +without introducing additional learnable parameters. This feature ensures +consistency in filtering capability during both training and testing phases, +further enhancing the reliability and practicality of the method. + +
+
+
+
+
+ + ☆ DiffAge3D: Diffusion-based 3D-aware Face Aging + + +
+ Face aging is the process of converting an individual's appearance to a +younger or older version of themselves. Existing face aging techniques have +been limited to 2D settings, which often limits their applicability as there is +a growing demand for 3D face modeling. Moreover, existing aging methods +struggle to perform faithful aging, maintain identity, and retain the fine +details of the input images. Given these limitations and the need for a +3D-aware aging method, we propose DiffAge3D, the first 3D-aware aging framework +that not only performs faithful aging and identity preservation but also +operates in a 3D setting. Our aging framework allows the aging and +camera pose to be modeled separately, taking only a single image with a target age. Our +framework includes a robust 3D-aware aging dataset generation pipeline that +utilizes a pre-trained 3D GAN and the rich text embedding capabilities of the +CLIP model. Notably, we do not employ any inversion bottleneck in dataset +generation. Instead, we randomly generate training samples from the latent +space of the 3D GAN, allowing us to manipulate its rich latent space to +generate ages even with large gaps. With the generated dataset, we train a +viewpoint-aware diffusion-based aging model to control the camera pose and +facial age. Through quantitative and qualitative evaluations, we demonstrate +that DiffAge3D outperforms existing methods, particularly in +multiview-consistent aging and fine-detail preservation. +
+
+
+
+
+ + ☆ Leveraging Open Knowledge for Advancing Task Expertise in Large Language + Models + + +
+ The cultivation of expertise for large language models (LLMs) to solve tasks +of specific areas often requires special-purpose tuning with calibrated +behaviors on the expected stable outputs. To avoid huge cost brought by manual +preparation of instruction datasets and training resources up to hundreds of +hours, the exploitation of open knowledge including a wealth of low rank +adaptation (LoRA) models and instruction datasets serves as a good starting +point. However, existing methods on model and data selection focus on the +performance of general-purpose capabilities while neglecting the knowledge gap +exposed in domain-specific deployment. In the present study, we propose to +bridge such gap by introducing few human-annotated samples (i.e., K-shot) for +advancing task expertise of LLMs with open knowledge. Specifically, we develop +an efficient and scalable pipeline to cost-efficiently produce task experts +where K-shot data intervene in selecting the most promising expert candidates +and the task-relevant instructions. A mixture-of-expert (MoE) system is built +to make the best use of individual-yet-complementary knowledge between multiple +experts. We unveil the two keys to the success of a MoE system, 1) the abidance +by K-shot, and 2) the insistence on diversity. For the former, we ensure that +models that truly possess problem-solving abilities on K-shot are selected +rather than those blind guessers. Besides, during data selection, instructions +that share task-relevant contexts with K-shot are prioritized. For the latter, +we highlight the diversity of constituting experts and that of the fine-tuning +instructions throughout the model and data selection process. Extensive +experimental results confirm the superiority of our approach over existing +methods on utilization of open knowledge across various tasks. Codes and models +will be released later. + +
+
+ comment: 28 pages, 12 tables, 10 figures +
+
+
+
+
+ + ☆ CoRe: Context-Regularized Text Embedding Learning for Text-to-Image + Personalization + + +
+ Recent advances in text-to-image personalization have enabled high-quality +and controllable image synthesis for user-provided concepts. However, existing +methods still struggle to balance identity preservation with text alignment. +Our approach is based on the fact that generating prompt-aligned images +requires a precise semantic understanding of the prompt, which involves +accurately processing the interactions between the new concept and its +surrounding context tokens within the CLIP text encoder. To address this, we +aim to embed the new concept properly into the input embedding space of the +text encoder, allowing for seamless integration with existing tokens. We +introduce Context Regularization (CoRe), which enhances the learning of the new +concept's text embedding by regularizing its context tokens in the prompt. This +is based on the insight that appropriate output vectors of the text encoder for +the context tokens can only be achieved if the new concept's text embedding is +correctly learned. CoRe can be applied to arbitrary prompts without requiring +the generation of corresponding images, thus improving the generalization of +the learned text embedding. Additionally, CoRe can serve as a test-time +optimization technique to further enhance the generations for specific prompts. +Comprehensive experiments demonstrate that our method outperforms several +baseline methods in both identity preservation and text alignment. Code will be +made publicly available. + +
+
+
+
+
+ + ☆ Gen-Swarms: Adapting Deep Generative Models to Swarms of Drones + + +
+ Gen-Swarms is an innovative method that leverages and combines the +capabilities of deep generative models with reactive navigation algorithms to +automate the creation of drone shows. Advancements in deep generative models, +particularly diffusion models, have demonstrated remarkable effectiveness in +generating high-quality 2D images. Building on this success, various works have +extended diffusion models to 3D point cloud generation. In contrast, +alternative generative models such as flow matching have been proposed, +offering a simple and intuitive transition from noise to meaningful outputs. +However, the application of flow matching models to 3D point cloud generation +remains largely unexplored. Gen-Swarms adapts these models to automatically +generate drone shows. Existing 3D point cloud generative models create point +trajectories which are impractical for drone swarms. In contrast, our method +not only generates accurate 3D shapes but also guides the swarm motion, +producing smooth trajectories and accounting for potential collisions through a +reactive navigation algorithm incorporated into the sampling process. For +example, when given a text category like Airplane, Gen-Swarms can rapidly and +continuously generate numerous variations of 3D airplane shapes. Our +experiments demonstrate that this approach is particularly well-suited for +drone shows, providing feasible trajectories, creating representative final +shapes, and significantly enhancing the overall performance of drone show +generation. + +
+
+
+
+
+ + ☆ Disentangled Diffusion Autoencoder for Harmonization of Multi-site + Neuroimaging Data + + +
+ Combining neuroimaging datasets from multiple sites and scanners can help +increase statistical power and thus provide greater insight into subtle +neuroanatomical effects. However, site-specific effects pose a challenge by +potentially obscuring the biological signal and introducing unwanted variance. +Existing harmonization techniques, which use statistical models to remove such +effects, have been shown to incompletely remove site effects while also failing +to preserve biological variability. More recently, generative models using GANs +or autoencoder-based approaches, have been proposed for site adjustment. +However, such methods are known for instability during training or blurry image +generation. In recent years, diffusion models have become increasingly popular +for their ability to generate high-quality synthetic images. In this work, we +introduce the disentangled diffusion autoencoder (DDAE), a novel diffusion +model designed for controlling specific aspects of an image. We apply the DDAE +to the task of harmonizing MR images by generating high-quality site-adjusted +images that preserve biological variability. We use data from 7 different sites +and demonstrate the DDAE's superiority in generating high-resolution, +harmonized 2D MR images over previous approaches. As far as we are aware, this +work marks the first diffusion-based model for site adjustment of neuroimaging +data. + +
+
+
+
+
+ + ☆ SpineMamba: Enhancing 3D Spinal Segmentation in Clinical Imaging through + Residual Visual Mamba Layers and Shape Priors + + +
+ Accurate segmentation of 3D clinical medical images is critical in the +diagnosis and treatment of spinal diseases. However, the inherent complexity of +spinal anatomy and the uncertainty inherent in current imaging technologies pose +significant challenges for semantic segmentation of spinal images. Although +convolutional neural networks (CNNs) and Transformer-based models have made +some progress in spinal segmentation, their limitations in handling long-range +dependencies hinder further improvements in segmentation accuracy. To address +these challenges, we introduce a residual visual Mamba layer to effectively +capture and model the deep semantic features and long-range spatial +dependencies of 3D spinal data. To further enhance the structural semantic +understanding of the vertebrae, we also propose a novel spinal shape prior +module that captures specific anatomical information of the spine from medical +images, significantly enhancing the model's ability to extract structural +semantic information of the vertebrae. Comparative and ablation experiments on +two datasets demonstrate that SpineMamba outperforms existing state-of-the-art +models. On the CT dataset, the average Dice similarity coefficient for +segmentation reaches as high as 94.40, while on the MR dataset, it reaches +86.95. Notably, compared to the renowned nnU-Net, SpineMamba achieves superior +segmentation performance, exceeding it by up to 2 percentage points. This +underscores its accuracy, robustness, and excellent generalization +capabilities. +
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ☆ LLaVA-MoD: Making LLaVA Tiny via MoE Knowledge Distillation + + +
+ We introduce LLaVA-MoD, a novel framework designed to enable the efficient +training of small-scale Multimodal Language Models (s-MLLM) by distilling +knowledge from large-scale MLLM (l-MLLM). Our approach tackles two fundamental +challenges in MLLM distillation. First, we optimize the network structure of +s-MLLM by integrating a sparse Mixture of Experts (MoE) architecture into the +language model, striking a balance between computational efficiency and model +expressiveness. Second, we propose a progressive knowledge transfer strategy to +ensure comprehensive knowledge migration. This strategy begins with mimic +distillation, where we minimize the Kullback-Leibler (KL) divergence between +output distributions to enable the student model to emulate the teacher +network's understanding. Following this, we introduce preference distillation +via Direct Preference Optimization (DPO), where the key lies in treating l-MLLM +as the reference model. During this phase, the s-MLLM's ability to discriminate +between superior and inferior examples is significantly enhanced beyond l-MLLM, +leading to a better student that surpasses its teacher, particularly in +hallucination benchmarks. Extensive experiments demonstrate that LLaVA-MoD +outperforms existing models across various multimodal benchmarks while +maintaining a minimal number of activated parameters and low computational +costs. Remarkably, LLaVA-MoD, with only 2B activated parameters, surpasses +Qwen-VL-Chat-7B by an average of 8.8% across benchmarks, using merely 0.3% of +the training data and 23% trainable parameters. These results underscore +LLaVA-MoD's ability to effectively distill comprehensive knowledge from its +teacher model, paving the way for the development of more efficient MLLMs. The +code will be available on: https://github.com/shufangxun/LLaVA-MoD. + +
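+ The mimic-distillation step, minimizing the KL divergence between teacher and student output distributions, is a standard objective; the sketch below is a generic PyTorch version (not the LLaVA-MoD training code), with the temperature as an assumed hyperparameter.

```python
import torch
import torch.nn.functional as F

def mimic_distillation_loss(student_logits: torch.Tensor,
                            teacher_logits: torch.Tensor,
                            temperature: float = 1.0) -> torch.Tensor:
    """Generic KL(teacher || student) over the vocabulary dimension.
    Expected shapes: (batch, seq_len, vocab). Illustrative only."""
    t = temperature
    student_logp = F.log_softmax(student_logits / t, dim=-1)
    teacher_p = F.softmax(teacher_logits.detach() / t, dim=-1)
    # 'batchmean' divides by the leading batch dimension; scaling by t^2 is the
    # conventional correction when distilling with a softened temperature.
    return F.kl_div(student_logp, teacher_p, reduction="batchmean") * (t * t)
```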
+
+
+
+
+ + ☆ Unleashing the Temporal-Spatial Reasoning Capacity of GPT for + Training-Free Audio and Language Referenced Video Object Segmentation + + +
+ In this paper, we propose an Audio-Language-Referenced SAM 2 (AL-Ref-SAM 2) +pipeline to explore the training-free paradigm for audio and +language-referenced video object segmentation, namely AVS and RVOS tasks. The +intuitive solution leverages GroundingDINO to identify the target object from a +single frame and SAM 2 to segment the identified object throughout the video, +which is less robust to spatiotemporal variations due to a lack of video +context exploration. Thus, in our AL-Ref-SAM 2 pipeline, we propose a novel +GPT-assisted Pivot Selection (GPT-PS) module to instruct GPT-4 to perform +two-step temporal-spatial reasoning for sequentially selecting pivot frames and +pivot boxes, thereby providing SAM 2 with a high-quality initial object prompt. +Within GPT-PS, two task-specific Chain-of-Thought prompts are designed to +unleash GPT's temporal-spatial reasoning capacity by guiding GPT to make +selections based on a comprehensive understanding of video and reference +information. Furthermore, we propose a Language-Binded Reference Unification +(LBRU) module to convert audio signals into language-formatted references, +thereby unifying the formats of AVS and RVOS tasks in the same pipeline. +Extensive experiments on both tasks show that our training-free AL-Ref-SAM 2 +pipeline achieves performances comparable to or even better than +fully-supervised fine-tuning methods. The code is available at: +https://github.com/appletea233/AL-Ref-SAM2. + +
+
+
+
+
+ + ☆ GenDDS: Generating Diverse Driving Video Scenarios with Prompt-to-Video + Generative Model + + +
+ Autonomous driving training requires a diverse range of datasets encompassing +various traffic conditions, weather scenarios, and road types. Traditional data +augmentation methods often struggle to generate datasets that represent rare +occurrences. To address this challenge, we propose GenDDS, a novel approach for +generating driving scenarios by leveraging the capabilities of +Stable Diffusion XL (SDXL), an advanced latent diffusion model. Our methodology +involves the use of descriptive prompts to guide the synthesis process, aimed +at producing realistic and diverse driving scenarios. With the power of the +latest computer vision techniques, such as ControlNet and Hotshot-XL, we have +built a complete pipeline for video generation together with SDXL. We employ +the KITTI dataset, which includes real-world driving videos, to train the +model. Through a series of experiments, we demonstrate that our model can +generate high-quality driving videos that closely replicate the complexity and +variability of real-world driving scenarios. This research contributes to the +development of sophisticated training data for autonomous driving systems and +opens new avenues for creating virtual environments for simulation and +validation purposes. +
+
+
+
+
+ + ☆ microYOLO: Towards Single-Shot Object Detection on Microcontrollers ECML + + +
+ This work-in-progress paper presents results on the feasibility of +single-shot object detection on microcontrollers using YOLO. Single-shot object +detectors like YOLO are widely used; however, due to their complexity, they mainly run on +larger GPU-based platforms. We present microYOLO, which can be used on Cortex-M +based microcontrollers, such as the OpenMV H7 R2, achieving about 3.5 FPS when +classifying 128x128 RGB images while using less than 800 KB Flash and less than +350 KB RAM. Furthermore, we share experimental results for three different +object detection tasks, analyzing the accuracy of microYOLO on them. +
+
+ comment: Published at the ECML PKDD Conference 2023, at the 4th Workshop on + IoT, Edge, and Mobile for Embedded Machine Learning +
+
+
+
+
+ + ☆ What is YOLOv8: An In-Depth Exploration of the Internal Features of the + Next-Generation Object Detector + + +
+ This study presents a detailed analysis of the YOLOv8 object detection model, +focusing on its architecture, training techniques, and performance improvements +over previous iterations like YOLOv5. Key innovations, including the CSPNet +backbone for enhanced feature extraction, the FPN+PAN neck for superior +multi-scale object detection, and the transition to an anchor-free approach, +are thoroughly examined. The paper reviews YOLOv8's performance across +benchmarks like Microsoft COCO and Roboflow 100, highlighting its high accuracy +and real-time capabilities across diverse hardware platforms. Additionally, the +study explores YOLOv8's developer-friendly enhancements, such as its unified +Python package and CLI, which streamline model training and deployment. +Overall, this research positions YOLOv8 as a state-of-the-art solution in the +evolving object detection field. + +
+
+
+
+
+ + ☆ Shot Segmentation Based on Von Neumann Entropy for Key Frame Extraction + + +
+ Video key frame extraction is important in various fields, such as video +summarization, retrieval, and compression. We therefore propose a video key frame +extraction algorithm based on shot segmentation using Von Neumann entropy. The +segmentation of shots is achieved through the computation of the Von Neumann +entropy of the similarity matrix among frames within the video sequence. The +initial frame of each shot is selected as a key frame, which preserves the +temporal sequence information of the video. The experimental results show that the +extracted key frames can fully and accurately represent the original video +content while minimizing the number of repeated frames. +
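+ The central quantity, the Von Neumann entropy of a frame-similarity matrix, can be computed as in the sketch below, assuming the matrix is symmetric and rescaled to unit trace like a density matrix; the paper's exact normalization and shot-boundary decision rule may differ.

```python
import numpy as np

def von_neumann_entropy(similarity: np.ndarray, eps: float = 1e-12) -> float:
    """Von Neumann entropy S = -tr(rho * log(rho)) of a symmetric similarity
    matrix, treated as a density matrix after trace normalization.
    Illustrative sketch only."""
    sym = 0.5 * (similarity + similarity.T)   # enforce symmetry
    rho = sym / np.trace(sym)                 # unit-trace normalization
    eigvals = np.linalg.eigvalsh(rho)         # real eigenvalues of a symmetric matrix
    eigvals = np.clip(eigvals, eps, None)     # guard against log(0)
    return float(-np.sum(eigvals * np.log(eigvals)))
```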
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Network transferability of adversarial patches in real-time object + detection + + +
+ Adversarial patches in computer vision can be used to fool deep neural +networks and manipulate their decision-making process. One of the most +prominent examples of adversarial patches is evasion attacks on object +detectors. By covering parts of objects of interest, these patches suppress the +detections and thus make the target object 'invisible' to the object detector. +Since these patches are usually optimized on a specific network with a specific +training dataset, transferability across multiple networks and datasets is not +guaranteed. This paper addresses these issues and investigates transferability +across numerous object detector architectures. Our extensive evaluation across +various models on two distinct datasets indicates that patches optimized with +larger models provide better network transferability than patches that are +optimized with smaller models. +
+
+ comment: 7 pages, 6 figures, 1 table +
+
+
+
+
+ + ☆ SITransformer: Shared Information-Guided Transformer for Extreme + Multimodal Summarization + + +
+ Extreme Multimodal Summarization with Multimodal Output (XMSMO) becomes an +attractive summarization approach by integrating various types of information +to create extremely concise yet informative summaries for individual +modalities. Existing methods overlook the issue that multimodal data often +contains more topic irrelevant information, which can mislead the model into +producing inaccurate summaries especially for extremely short ones. In this +paper, we propose SITransformer, a \textbf{S}hared \textbf{I}nformation-guided +\textbf{T}ransformer for extreme multimodal summarization. It has a shared +information guided pipeline which involves a cross-modal shared information +extractor and a cross-modal interaction module. The extractor formulates +semantically shared salient information from different modalities by devising a +novel filtering process consisting of a differentiable top-k selector and a +shared-information guided gating unit. As a result, the common, salient, and +relevant contents across modalities are identified. Next, a transformer with +cross-modal attentions is developed for intra- and inter-modality learning with +the shared information guidance to produce the extreme summary. Comprehensive +experiments demonstrate that SITransformer significantly enhances the +summarization quality for both video and text summaries for XMSMO. Our code +will be publicly available at https://github.com/SichengLeoLiu/MMAsia24-XMSMO. + +
+
+ comment: 8 pages, 5 figures, submitted to ACM Multimedia Asia 2024 +
+
+
+
+
+ + ☆ Benchmarking foundation models as feature extractors for + weakly-supervised computational pathology + + +
+ Advancements in artificial intelligence have driven the development of +numerous pathology foundation models capable of extracting clinically relevant +information. However, there is currently limited literature independently +evaluating these foundation models on truly external cohorts and +clinically-relevant tasks to uncover adjustments for future improvements. In +this study, we benchmarked ten histopathology foundation models on 13 patient +cohorts with 6,791 patients and 9,493 slides from lung, colorectal, gastric, +and breast cancers. The models were evaluated on weakly-supervised tasks +related to biomarkers, morphological properties, and prognostic outcomes. We +show that a vision-language foundation model, CONCH, yielded the highest +performance in 42% of tasks when compared to vision-only foundation models. The +experiments reveal that foundation models trained on distinct cohorts learn +complementary features to predict the same label, and can be fused to +outperform the current state of the art. Creating an ensemble of complementary +foundation models outperformed CONCH in 66% of tasks. Moreover, our findings +suggest that data diversity outweighs data volume for foundation models. Our +work highlights actionable adjustments to improve pathology foundation models. + +
+
+
+
+
+ + ☆ Mining Field Data for Tree Species Recognition at Scale + + +
+ Individual tree species labels are particularly hard to acquire due to the +expert knowledge needed and the limitations of photointerpretation. Here, we +present a methodology to automatically mine species labels from public forest +inventory data, using available pretrained tree detection models. We identify +tree instances in aerial imagery and match them with field data with close to +zero human involvement. We conduct a series of experiments on the resulting +dataset, and show a beneficial effect when adding noisy or even unlabeled data +points, highlighting a strong potential for large-scale individual species +mapping. + +
+
+
+
+
+ + ☆ DQFormer: Towards Unified LiDAR Panoptic Segmentation with Decoupled + Queries + + +
+ LiDAR panoptic segmentation, which jointly performs instance and semantic +segmentation for things and stuff classes, plays a fundamental role in LiDAR +perception tasks. While most existing methods explicitly separate these two +segmentation tasks and utilize different branches (i.e., semantic and instance +branches), some recent methods have embraced the query-based paradigm to unify +LiDAR panoptic segmentation. However, the distinct spatial distribution and +inherent characteristics of objects(things) and their surroundings(stuff) in 3D +scenes lead to challenges, including the mutual competition of things/stuff and +the ambiguity of classification/segmentation. In this paper, we propose +decoupling things/stuff queries according to their intrinsic properties for +individual decoding and disentangling classification/segmentation to mitigate +ambiguity. To this end, we propose a novel framework dubbed DQFormer to +implement semantic and instance segmentation in a unified workflow. +Specifically, we design a decoupled query generator to propose informative +queries with semantics by localizing things/stuff positions and fusing +multi-level BEV embeddings. Moreover, a query-oriented mask decoder is +introduced to decode corresponding segmentation masks by performing masked +cross-attention between queries and mask embeddings. Finally, the decoded masks +are combined with the semantics of the queries to produce panoptic results. +Extensive experiments on nuScenes and SemanticKITTI datasets demonstrate the +superiority of our DQFormer framework. + +
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ Multi-view Pose Fusion for Occlusion-Aware 3D Human Pose Estimation ECCV + + +
+ Robust 3D human pose estimation is crucial to ensure safe and effective human-robot collaboration. Accurate human perception, however, is particularly challenging in these scenarios due to strong occlusions and limited camera viewpoints. Current 3D human pose estimation approaches are rather vulnerable in such conditions. In this work we present a novel approach for robust 3D human pose estimation in the context of human-robot collaboration. Instead of relying on noisy 2D feature triangulation, we perform multi-view fusion on 3D skeletons provided by absolute monocular methods. Accurate 3D pose estimation is then obtained via reprojection error optimization, introducing limb length symmetry constraints. We evaluate our approach on the public dataset Human3.6M and on a novel version, Human3.6M-Occluded, derived by adding synthetic occlusions to the camera views in order to test pose estimation algorithms under severe occlusions. We further validate our method on real human-robot collaboration workcells, in which we clearly surpass current 3D human pose estimation methods. Our approach outperforms state-of-the-art multi-view human pose estimation techniques and demonstrates superior capabilities in handling challenging scenarios with strong occlusions, representing a reliable and effective solution for real human-robot collaboration setups. + 
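The sketch below illustrates, under simplifying assumptions, the kind of reprojection-error optimization with a limb-length symmetry term described above. The cameras, joint count, and symmetry pairs are synthetic placeholders, not the paper's actual setup.

# Minimal sketch of multi-view refinement via reprojection-error optimisation
# with a limb-length symmetry residual (all values below are synthetic).
import numpy as np
from scipy.optimize import least_squares

rng = np.random.default_rng(1)
n_joints = 6
gt = rng.normal(scale=0.3, size=(n_joints, 3)) + np.array([0.0, 0.0, 3.0])

def make_cam(tx):
    K = np.array([[500.0, 0, 320], [0, 500.0, 240], [0, 0, 1]])
    Rt = np.hstack([np.eye(3), np.array([[tx], [0.0], [0.0]])])
    return K @ Rt

cams = [make_cam(-0.5), make_cam(0.5)]

def project(P, X):
    Xh = np.hstack([X, np.ones((len(X), 1))])
    x = (P @ Xh.T).T
    return x[:, :2] / x[:, 2:3]

obs = [project(P, gt) + rng.normal(scale=1.0, size=(n_joints, 2)) for P in cams]

# Hypothetical symmetric limb pairs: (parent, child) on left vs. right side.
left, right = (0, 1), (2, 3)

def residuals(x):
    X = x.reshape(n_joints, 3)
    reproj = np.concatenate([(project(P, X) - o).ravel() for P, o in zip(cams, obs)])
    sym = np.linalg.norm(X[left[0]] - X[left[1]]) - np.linalg.norm(X[right[0]] - X[right[1]])
    return np.concatenate([reproj, [10.0 * sym]])

init = gt + rng.normal(scale=0.05, size=gt.shape)  # e.g. averaged monocular estimates
refined = least_squares(residuals, init.ravel()).x.reshape(n_joints, 3)
print("mean joint error:", np.linalg.norm(refined - gt, axis=1).mean())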
+
+ comment: ECCV workshops 2024 +
+
+
+
+
+ + ☆ Object Detection for Vehicle Dashcams using Transformers + + +
+ The use of intelligent automation is growing significantly in the automotive industry, as it assists drivers and fleet management companies, thus increasing their productivity. Dash cams are now being used for this purpose, which enables the instant identification and understanding of multiple objects and occurrences in the surroundings. In this paper, we propose a novel approach for object detection in dashcams using transformers. Our system is based on the state-of-the-art DEtection TRansformer (DETR), which has demonstrated strong performance in a variety of conditions, including different weather and illumination scenarios. The use of transformers allows for the consideration of contextual information in decision-making, improving the accuracy of object detection. To validate our approach, we have trained our DETR model on a dataset that represents real-world conditions. Our results show that the use of intelligent automation through transformers can significantly enhance the capabilities of dashcam systems. The model achieves an mAP of 0.95 on detection. + 
+
+ comment: 7 Pages, and 6 Figures +
+
+
+
+
+ + ☆ Visual Prompt Engineering for Medical Vision Language Models in + Radiology ECCV 2024 + + +
+ Medical image classification in radiology faces significant challenges, particularly in generalizing to unseen pathologies. In contrast, CLIP offers a promising solution by leveraging multimodal learning to improve zero-shot classification performance. However, in the medical domain, lesions can be small and might not be well represented in the embedding space. Therefore, in this paper, we explore the potential of visual prompt engineering to enhance the capabilities of Vision Language Models (VLMs) in radiology. Leveraging BiomedCLIP, trained on extensive biomedical image-text pairs, we investigate the impact of embedding visual markers directly within radiological images to guide the model's attention to critical regions. Our evaluation on the JSRT dataset, focusing on lung nodule malignancy classification, demonstrates that incorporating visual prompts, such as arrows, circles, and contours, significantly improves classification metrics including AUROC, AUPRC, F1 score, and accuracy. Moreover, the study provides attention maps, showcasing enhanced model interpretability and focus on clinically relevant areas. These findings underscore the efficacy of visual prompt engineering as a straightforward yet powerful approach to advance VLM performance in medical image analysis. + 
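A minimal sketch of the visual-prompting step described above, assuming a known region of interest; the marker style, image, and downstream model call are illustrative only.

# Minimal sketch of a visual prompt (assumption: the lesion bounding box is
# known; a red circle marker is drawn before passing the image to the VLM).
from PIL import Image, ImageDraw

def add_circle_prompt(image, box, width=4, color=(255, 0, 0)):
    """Draw an ellipse around a (x0, y0, x1, y1) region of interest."""
    out = image.copy()
    ImageDraw.Draw(out).ellipse(box, outline=color, width=width)
    return out

img = Image.new("RGB", (256, 256), "gray")        # placeholder radiograph
prompted = add_circle_prompt(img, (100, 90, 160, 150))
prompted.save("prompted.png")                     # then feed to e.g. BiomedCLIP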
+
+ comment: Accepted at ECCV 2024 Workshop on Emergent Visual Abilities and + Limits of Foundation Models +
+
+
+
+
+ + ☆ A Survey on Facial Expression Recognition of Static and Dynamic Emotions + + +
+ Facial expression recognition (FER) aims to analyze emotional states from static images and dynamic sequences, which is pivotal in enhancing anthropomorphic communication among humans, robots, and digital avatars by leveraging AI technologies. As the FER field evolves from controlled laboratory environments to more complex in-the-wild scenarios, advanced methods have been rapidly developed and new challenges and approaches are encountered, which are not well addressed in existing reviews of FER. This paper offers a comprehensive survey of both image-based static FER (SFER) and video-based dynamic FER (DFER) methods, analyzing from model-oriented development to challenge-focused categorization. We begin with a critical comparison of recent reviews, an introduction to common datasets and evaluation criteria, and an in-depth workflow on FER to establish a robust research foundation. We then systematically review representative approaches addressing eight main challenges in SFER (such as expression disturbance, uncertainties, compound emotions, and cross-domain inconsistency) as well as seven main challenges in DFER (such as key frame sampling, expression intensity variations, and cross-modal alignment). Additionally, we analyze recent advancements, benchmark performances, major applications, and ethical considerations. Finally, we propose five promising future directions and development trends to guide ongoing research. The project page for this paper can be found at https://github.com/wangyanckxx/SurveyFER. + 
+
+
+
+
+ + ☆ A Survey on Evaluation of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) mimic human perception and reasoning +system by integrating powerful Large Language Models (LLMs) with various +modality encoders (e.g., vision, audio), positioning LLMs as the "brain" and +various modality encoders as sensory organs. This framework endows MLLMs with +human-like capabilities, and suggests a potential pathway towards achieving +artificial general intelligence (AGI). With the emergence of all-round MLLMs +like GPT-4V and Gemini, a multitude of evaluation methods have been developed +to assess their capabilities across different dimensions. This paper presents a +systematic and comprehensive review of MLLM evaluation methods, covering the +following key aspects: (1) the background of MLLMs and their evaluation; (2) +"what to evaluate" that reviews and categorizes existing MLLM evaluation tasks +based on the capabilities assessed, including general multimodal recognition, +perception, reasoning and trustworthiness, and domain-specific applications +such as socioeconomic, natural sciences and engineering, medical usage, AI +agent, remote sensing, video and audio processing, 3D point cloud analysis, and +others; (3) "where to evaluate" that summarizes MLLM evaluation benchmarks into +general and specific benchmarks; (4) "how to evaluate" that reviews and +illustrates MLLM evaluation steps and metrics; Our overarching goal is to +provide valuable insights for researchers in the field of MLLM evaluation, +thereby facilitating the development of more capable and reliable MLLMs. We +emphasize that evaluation should be regarded as a critical discipline, +essential for advancing the field of MLLMs. + +
+
+
+
+
+ + ☆ Addressing the challenges of loop detection in agricultural environments + + +
+ While visual SLAM systems are well studied and achieve impressive results in +indoor and urban settings, natural, outdoor and open-field environments are +much less explored and still present relevant research challenges. Visual +navigation and local mapping have shown a relatively good performance in +open-field environments. However, globally consistent mapping and long-term +localization still depend on the robustness of loop detection and closure, for +which the literature is scarce. In this work we propose a novel method to pave +the way towards robust loop detection in open fields, particularly in +agricultural settings, based on local feature search and stereo geometric +refinement, with a final stage of relative pose estimation. Our method +consistently achieves good loop detections, with a median error of 15cm. We aim +to characterize open fields as a novel environment for loop detection, +understanding the limitations and problems that arise when dealing with them. + +
+
+
+
+
+ + ☆ Str-L Pose: Integrating Point and Structured Line for Relative Pose + Estimation in Dual-Graph + + +
+ Relative pose estimation is crucial for various computer vision applications, including robotics and autonomous driving. Current methods primarily depend on selecting and matching feature points that are prone to incorrect matches, leading to poor performance. Consequently, relying solely on point-matching relationships for pose estimation is a huge challenge. To overcome these limitations, we propose a Geometric Correspondence Graph neural network that integrates point features with extra structured line segments. This integration of matched points and line segments further exploits the geometric constraints and enhances model performance across different environments. We employ the Dual-Graph module and Feature Weighted Fusion Module to aggregate geometric and visual features effectively, facilitating complex scene understanding. We demonstrate our approach through extensive experiments on the DeMoN and KITTI Odometry datasets. The results show that our method is competitive with state-of-the-art techniques. + 
+
+
+
+
+ + ☆ Segmentation-guided Layer-wise Image Vectorization with Gradient Fills + + +
+ The widespread use of vector graphics creates a significant demand for +vectorization methods. While recent learning-based techniques have shown their +capability to create vector images of clear topology, filling these primitives +with gradients remains a challenge. In this paper, we propose a +segmentation-guided vectorization framework to convert raster images into +concise vector graphics with radial gradient fills. With the guidance of an +embedded gradient-aware segmentation subroutine, our approach progressively +appends gradient-filled B\'ezier paths to the output, where primitive +parameters are initiated with our newly designed initialization technique and +are optimized to minimize our novel loss function. We build our method on a +differentiable renderer with traditional segmentation algorithms to develop it +as a model-free tool for raster-to-vector conversion. It is tested on various +inputs to demonstrate its feasibility, independent of datasets, to synthesize +vector graphics with improved visual quality and layer-wise topology compared +to prior work. + +
+
+
+
+
+ + ☆ MambaPlace:Text-to-Point-Cloud Cross-Modal Place Recognition with + Attention Mamba Mechanisms + + +
+ Vision Language Place Recognition (VLVPR) enhances robot localization +performance by incorporating natural language descriptions from images. By +utilizing language information, VLVPR directs robot place matching, overcoming +the constraint of solely depending on vision. The essence of multimodal fusion +lies in mining the complementary information between different modalities. +However, general fusion methods rely on traditional neural architectures and +are not well equipped to capture the dynamics of cross modal interactions, +especially in the presence of complex intra modal and inter modal correlations. +To this end, this paper proposes a novel coarse to fine and end to end +connected cross modal place recognition framework, called MambaPlace. In the +coarse localization stage, the text description and 3D point cloud are encoded +by the pretrained T5 and instance encoder, respectively. They are then +processed using Text Attention Mamba (TAM) and Point Clouds Mamba (PCM) for +data enhancement and alignment. In the subsequent fine localization stage, the +features of the text description and 3D point cloud are cross modally fused and +further enhanced through cascaded Cross Attention Mamba (CCAM). Finally, we +predict the positional offset from the fused text point cloud features, +achieving the most accurate localization. Extensive experiments show that +MambaPlace achieves improved localization accuracy on the KITTI360Pose dataset +compared to the state of the art methods. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Defending Text-to-image Diffusion Models: Surprising Efficacy of Textual + Perturbations Against Backdoor Attacks ECCV 2024 + + +
+ Text-to-image diffusion models have been widely adopted in real-world +applications due to their ability to generate realistic images from textual +descriptions. However, recent studies have shown that these methods are +vulnerable to backdoor attacks. Despite the significant threat posed by +backdoor attacks on text-to-image diffusion models, countermeasures remain +under-explored. In this paper, we address this research gap by demonstrating +that state-of-the-art backdoor attacks against text-to-image diffusion models +can be effectively mitigated by a surprisingly simple defense strategy - +textual perturbation. Experiments show that textual perturbations are effective +in defending against state-of-the-art backdoor attacks with minimal sacrifice +to generation quality. We analyze the efficacy of textual perturbation from two +angles: text embedding space and cross-attention maps. They further explain how +backdoor attacks have compromised text-to-image diffusion models, providing +insights for studying future attack and defense strategies. Our code is +available at https://github.com/oscarchew/t2i-backdoor-defense. + +
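To make the idea of textual perturbation concrete, here is a minimal, hypothetical character-level perturbation of a prompt; the paper's actual perturbation schemes may differ.

# Minimal sketch of a character-level textual perturbation (assumption: the
# goal is to break exact trigger strings in a prompt while keeping it readable;
# this illustrative scheme swaps two inner characters of random words).
import random

def perturb_prompt(prompt, swap_prob=0.1, seed=0):
    rng = random.Random(seed)
    words = prompt.split()
    out = []
    for w in words:
        if len(w) > 3 and rng.random() < swap_prob:
            i = rng.randrange(1, len(w) - 2)
            w = w[:i] + w[i + 1] + w[i] + w[i + 2:]  # swap two inner characters
        out.append(w)
    return " ".join(out)

print(perturb_prompt("a photograph of a mountain landscape at sunset"))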
+
+ comment: ECCV 2024 Workshop The Dark Side of Generative AIs and Beyond +
+
+
+
+
+ + ☆ Pixels to Prose: Understanding the art of Image Captioning + + +
+ In the era of evolving artificial intelligence, machines are increasingly +emulating human-like capabilities, including visual perception and linguistic +expression. Image captioning stands at the intersection of these domains, +enabling machines to interpret visual content and generate descriptive text. +This paper provides a thorough review of image captioning techniques, catering +to individuals entering the field of machine learning who seek a comprehensive +understanding of available options, from foundational methods to +state-of-the-art approaches. Beginning with an exploration of primitive +architectures, the review traces the evolution of image captioning models to +the latest cutting-edge solutions. By dissecting the components of these +architectures, readers gain insights into the underlying mechanisms and can +select suitable approaches tailored to specific problem requirements without +duplicating efforts. The paper also delves into the application of image +captioning in the medical domain, illuminating its significance in various +real-world scenarios. + Furthermore, the review offers guidance on evaluating the performance of +image captioning systems, highlighting key metrics for assessment. By +synthesizing theoretical concepts with practical application, this paper equips +readers with the knowledge needed to navigate the complex landscape of image +captioning and harness its potential for diverse applications in machine +learning and beyond. + +
+
+
+
+
+ + ☆ Towards Realistic Example-based Modeling via 3D Gaussian Stitching + + +
+ Using parts of existing models to rebuild new models, commonly termed as example-based modeling, is a classical methodology in the realm of computer graphics. Previous works mostly focus on shape composition, making them very hard to use for realistic composition of 3D objects captured from real-world scenes. This motivates combining multiple NeRFs into a single 3D scene to achieve seamless appearance blending. However, the current SeamlessNeRF method struggles to achieve interactive editing and harmonious stitching for real-world scenes due to its gradient-based strategy and grid-based representation. To this end, we present an example-based modeling method that combines multiple Gaussian fields in a point-based representation using sample-guided synthesis. Specifically, as for composition, we create a GUI to segment and transform multiple fields in real time, easily obtaining a semantically meaningful composition of models represented by 3D Gaussian Splatting (3DGS). For texture blending, due to the discrete and irregular nature of 3DGS, directly applying gradient propagation as in SeamlessNeRF is not supported. Thus, a novel sampling-based cloning method is proposed to harmonize the blending while preserving the original rich texture and content. Our workflow consists of three steps: 1) real-time segmentation and transformation of a Gaussian model using a well-tailored GUI, 2) KNN analysis to identify boundary points in the intersecting area between the source and target models, and 3) two-phase optimization of the target model using sampling-based cloning and gradient constraints. Extensive experimental results validate that our approach significantly outperforms previous works in terms of realistic synthesis, demonstrating its practicality. More demos are available at https://ingra14m.github.io/gs_stitching_website. + 
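Step 2 of the workflow (KNN-based boundary-point identification) could look roughly like the following sketch, assuming the source and target Gaussian centres are available as plain point arrays; the radius and k values are illustrative.

# Minimal sketch of KNN boundary-point identification between two point sets.
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(2)
source = rng.normal(loc=0.0, size=(500, 3))   # hypothetical source-model centres
target = rng.normal(loc=1.0, size=(500, 3))   # hypothetical target-model centres

def boundary_points(target_pts, source_pts, radius=0.5, k=8):
    """Mark target points whose k nearest source points lie within `radius` on average."""
    tree = cKDTree(source_pts)
    dists, _ = tree.query(target_pts, k=k)
    return np.where(dists.mean(axis=1) < radius)[0]

idx = boundary_points(target, source)
print(f"{len(idx)} boundary points in the intersecting area")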
+
+
+
+
+ + ☆ G-Style: Stylized Gaussian Splatting + + +
+ We introduce G-Style, a novel algorithm designed to transfer the style of an +image onto a 3D scene represented using Gaussian Splatting. Gaussian Splatting +is a powerful 3D representation for novel view synthesis, as -- compared to +other approaches based on Neural Radiance Fields -- it provides fast scene +renderings and user control over the scene. Recent pre-prints have demonstrated +that the style of Gaussian Splatting scenes can be modified using an image +exemplar. However, since the scene geometry remains fixed during the +stylization process, current solutions fall short of producing satisfactory +results. Our algorithm aims to address these limitations by following a +three-step process: In a pre-processing step, we remove undesirable Gaussians +with large projection areas or highly elongated shapes. Subsequently, we +combine several losses carefully designed to preserve different scales of the +style in the image, while maintaining as much as possible the integrity of the +original scene content. During the stylization process and following the +original design of Gaussian Splatting, we split Gaussians where additional +detail is necessary within our scene by tracking the gradient of the stylized +color. Our experiments demonstrate that G-Style generates high-quality +stylizations within just a few minutes, outperforming existing methods both +qualitatively and quantitatively. + +
+
+
+
+
+ + ☆ Synthetic Forehead-creases Biometric Generation for Reliable User + Verification + + +
+ Recent studies have emphasized the potential of forehead-crease patterns as +an alternative for face, iris, and periocular recognition, presenting +contactless and convenient solutions, particularly in situations where faces +are covered by surgical masks. However, collecting forehead data presents +challenges, including cost and time constraints, as developing and optimizing +forehead verification methods requires a substantial number of high-quality +images. To tackle these challenges, the generation of synthetic biometric data +has gained traction due to its ability to protect privacy while enabling +effective training of deep learning-based biometric verification methods. In +this paper, we present a new framework to synthesize forehead-crease image data +while maintaining important features, such as uniqueness and realism. The +proposed framework consists of two main modules: a Subject-Specific Generation +Module (SSGM), based on an image-to-image Brownian Bridge Diffusion Model +(BBDM), which learns a one-to-many mapping between image pairs to generate +identity-aware synthetic forehead creases corresponding to real subjects, and a +Subject-Agnostic Generation Module (SAGM), which samples new synthetic +identities with assistance from the SSGM. We evaluate the diversity and realism +of the generated forehead-crease images primarily using the Fr\'echet Inception +Distance (FID) and the Structural Similarity Index Measure (SSIM). In addition, +we assess the utility of synthetically generated forehead-crease images using a +forehead-crease verification system (FHCVS). The results indicate an +improvement in the verification accuracy of the FHCVS by utilizing synthetic +data. + +
+
+ comment: Accepted at Generative AI for Futuristic Biometrics - IJCB'24 Special + Session +
+
+
+
+
+ + ☆ A quantitative model of takeover request time budget for conditionally + automated driving + + +
+ In conditional automation, the automated driving system assumes full control +and only issues a takeover request to a human driver to resume driving in +critical situations. Previous studies have concluded that the time budget +required by drivers to resume driving after a takeover request varies with +situations and different takeover variables. However, no comprehensive +generalized approaches for estimating in advance the time budget required by +drivers to takeover have been provided. In this contribution, fixed (7 s) and +variable time budgets (6 s, 5 s, and 4 s) with and without visual imagery +assistance were investigated for suitability in three takeover scenarios using +performance measures such as average lateral displacement. The results indicate +that 7 s is suitable for two of the studied scenarios based on their +characteristics. Using the obtained results and known relations between +takeover variables, a mathematical formula for estimating takeover request time +budget is proposed. The proposed formula integrates individual stimulus +response time, driving experience, scenario specific requirements and allows +increased safety for takeover maneuvers. Furthermore, the visual imagery +resulted in increased takeover time which invariably increases the time budget. +Thus the time demand of the visualized information if applicable (such as +visual imagery) should be included in the time budget. + +
+
+ comment: Manuscript: 12 pages, 12 figures, 7 tables +
+
+
+
+
+ + ☆ DEAR: Depth-Enhanced Action Recognition ECCV + + +
+ Detecting actions in videos, particularly within cluttered scenes, poses +significant challenges due to the limitations of 2D frame analysis from a +camera perspective. Unlike human vision, which benefits from 3D understanding, +recognizing actions in such environments can be difficult. This research +introduces a novel approach integrating 3D features and depth maps alongside +RGB features to enhance action recognition accuracy. Our method involves +processing estimated depth maps through a separate branch from the RGB feature +encoder and fusing the features to understand the scene and actions +comprehensively. Using the Side4Video framework and VideoMamba, which employ +CLIP and VisionMamba for spatial feature extraction, our approach outperformed +our implementation of the Side4Video network on the Something-Something V2 +dataset. Our code is available at: https://github.com/SadeghRahmaniB/DEAR + +
+
+ comment: 5 pages, 1 figure, 1 table, accepted at Human-inspired Computer + Vision, ECCV +
+
+
+
+
+ + ☆ Deep Learning Based Speckle Filtering for Polarimetric SAR Images. + Application to Sentinel-1 + + +
+ Speckle suppression in synthetic aperture radar (SAR) images is a key processing step which continues to be a research topic. A wide variety of methods, using either spatially-based approaches or transform-based strategies, have been developed and have shown to provide outstanding results. However, recent advances in deep learning techniques and their application to SAR image despeckling have been demonstrated to offer state-of-the-art results. Unfortunately, they have been mostly applied to single-polarimetric images. The extension of a deep learning-based approach for speckle removal to polarimetric SAR (PolSAR) images is complicated because of the complex nature of the measured covariance matrices for every image pixel, the properties of which must be preserved during filtering. In this work, we propose a complete framework to remove speckle in polarimetric SAR images using a convolutional neural network. The methodology includes a reversible transformation of the original complex covariance matrix to obtain a set of real-valued intensity bands which are fed to the neural network. In addition, the proposed method includes a change detection strategy to prevent the neural network from learning erroneous features in areas strongly affected by temporal changes, so that the network only learns the underlying speckle component present in the data. The method is implemented and tested with dual-polarimetric images acquired by Sentinel-1. Experiments show that the proposed approach offers exceptional results in both speckle reduction and resolution preservation. More importantly, it is also shown that the neural network is not generating artifacts or introducing bias in the filtered images, making them suitable for further polarimetric processing and exploitation. + 
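The reversible covariance-to-real-bands transformation mentioned above can be sketched as follows for the dual-pol case; the exact normalisation used in the paper is omitted.

# Minimal sketch of a reversible dual-pol covariance <-> real-band transform
# (assumption: per-pixel 2x2 Hermitian covariance matrices C are given).
import numpy as np

def cov_to_bands(C):
    """C: (..., 2, 2) complex Hermitian -> (..., 4) real-valued bands."""
    return np.stack([C[..., 0, 0].real, C[..., 1, 1].real,
                     C[..., 0, 1].real, C[..., 0, 1].imag], axis=-1)

def bands_to_cov(B):
    c12 = B[..., 2] + 1j * B[..., 3]
    C = np.empty(B.shape[:-1] + (2, 2), dtype=complex)
    C[..., 0, 0], C[..., 1, 1] = B[..., 0], B[..., 1]
    C[..., 0, 1], C[..., 1, 0] = c12, np.conj(c12)
    return C

# Round-trip check on a random Hermitian field.
s = np.random.randn(8, 8, 2) + 1j * np.random.randn(8, 8, 2)
C = s[..., :, None] * np.conj(s[..., None, :])
assert np.allclose(bands_to_cov(cov_to_bands(C)), C)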
+
+ comment: 23 pages, 32 figures +
+
+
+
+
+ + ☆ Towards reliable respiratory disease diagnosis based on cough sounds and + vision transformers + + +
+ Recent advancements in deep learning techniques have sparked performance +boosts in various real-world applications including disease diagnosis based on +multi-modal medical data. Cough sound data-based respiratory disease (e.g., +COVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also +attracted much attention. However, existing works usually utilise traditional +machine learning or deep models of moderate scales. On the other hand, the +developed approaches are trained and evaluated on small-scale data due to the +difficulty of curating and annotating clinical data on scale. To address these +issues in prior works, we create a unified framework to evaluate various deep +models from lightweight Convolutional Neural Networks (e.g., ResNet18) to +modern vision transformers and compare their performance in respiratory disease +classification. Based on the observations from such an extensive empirical +study, we propose a novel approach to cough-based disease classification based +on both self-supervised and supervised learning on a large-scale cough data +set. Experimental results demonstrate our proposed approach outperforms prior +arts consistently on two benchmark datasets for COVID-19 diagnosis and a +proprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%. + +
+
+
+
+
+ + ☆ Merging and Splitting Diffusion Paths for Semantically Coherent + Panoramas ECCV 2024 + + +
+ Diffusion models have become the State-of-the-Art for text-to-image +generation, and increasing research effort has been dedicated to adapting the +inference process of pretrained diffusion models to achieve zero-shot +capabilities. An example is the generation of panorama images, which has been +tackled in recent works by combining independent diffusion paths over +overlapping latent features, which is referred to as joint diffusion, obtaining +perceptually aligned panoramas. However, these methods often yield semantically +incoherent outputs and trade-off diversity for uniformity. To overcome this +limitation, we propose the Merge-Attend-Diffuse operator, which can be plugged +into different types of pretrained diffusion models used in a joint diffusion +setting to improve the perceptual and semantical coherence of the generated +panorama images. Specifically, we merge the diffusion paths, reprogramming +self- and cross-attention to operate on the aggregated latent space. Extensive +quantitative and qualitative experimental analysis, together with a user study, +demonstrate that our method maintains compatibility with the input prompt and +visual quality of the generated images while increasing their semantic +coherence. We release the code at https://github.com/aimagelab/MAD. + +
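As background, joint diffusion for panoramas relies on averaging overlapping per-window latents back onto a shared canvas; the sketch below shows that basic operation with illustrative window sizes, not the paper's Merge-Attend-Diffuse operator itself.

# Minimal sketch of averaging overlapping latent windows onto a panoramic latent.
import numpy as np

def merge_windows(windows, starts, width, canvas_width, channels):
    """Average per-window latents back onto a shared panoramic latent."""
    canvas = np.zeros((channels, canvas_width))
    counts = np.zeros((1, canvas_width))
    for w, s in zip(windows, starts):
        canvas[:, s:s + width] += w
        counts[:, s:s + width] += 1
    return canvas / np.maximum(counts, 1)

rng = np.random.default_rng(3)
starts = [0, 32, 64]                      # overlapping window origins
windows = [rng.normal(size=(4, 64)) for _ in starts]
merged = merge_windows(windows, starts, width=64, canvas_width=128, channels=4)
print(merged.shape)  # (4, 128)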
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ☆ TeFF: Tracking-enhanced Forgetting-free Few-shot 3D LiDAR Semantic + Segmentation + + +
+ In autonomous driving, 3D LiDAR plays a crucial role in understanding the vehicle's surroundings. However, newly emerging, unannotated objects present a few-shot learning problem for semantic segmentation. This paper addresses the limitations of current few-shot semantic segmentation by exploiting the temporal continuity of LiDAR data. Employing a tracking model to generate pseudo-ground-truths from a sequence of LiDAR frames, our method significantly augments the dataset, enhancing the model's ability to learn on novel classes. However, this approach introduces a data imbalance biased toward novel data, which presents a new challenge of catastrophic forgetting. To mitigate this, we incorporate LoRA, a technique that reduces the number of trainable parameters, thereby preserving the model's performance on base classes while improving its adaptability to novel classes. This work represents a significant step forward in few-shot 3D LiDAR semantic segmentation for autonomous driving. Our code is available at https://github.com/junbao-zhou/Track-no-forgetting. + 
+
+
+
+
+ + ☆ Realigned Softmax Warping for Deep Metric Learning + + +
+ Deep Metric Learning (DML) loss functions traditionally aim to control the forces of separability and compactness within an embedding space so that the same class data points are pulled together and different class ones are pushed apart. Within the context of DML, a softmax operation will typically normalize distances into a probability for optimization, thus coupling all the push/pull forces together. This paper proposes a potential new class of loss functions that operate within a Euclidean domain and aim to take full advantage of the coupled forces governing embedding space formation under a softmax. These forces of compactness and separability can be boosted or mitigated within controlled locations at will by using a warping function. In this work, we provide a simple example of a warping function and use it to achieve competitive, state-of-the-art results on various metric learning benchmarks. + 
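A minimal sketch of a softmax over warped Euclidean distances is given below; the warping function used here is only an illustrative monotone example, not the one proposed in the paper.

# Minimal sketch of a warped-distance softmax loss for metric learning
# (assumption: class proxies/centers are given; the warp g is illustrative).
import numpy as np

def warp(d, alpha=2.0):
    """Example monotone warping of Euclidean distances."""
    return np.log1p(alpha * d)

def warped_softmax_loss(embedding, centers, label):
    d = np.linalg.norm(centers - embedding, axis=1)
    logits = -warp(d)
    logits -= logits.max()
    log_probs = logits - np.log(np.exp(logits).sum())
    return -log_probs[label]

rng = np.random.default_rng(10)
centers = rng.normal(size=(5, 16))      # hypothetical class proxies
x = centers[2] + 0.1 * rng.normal(size=16)
print(warped_softmax_loss(x, centers, label=2))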
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Online pre-training with long-form videos + + +
+ In this study, we investigate the impact of online pre-training with continuous video clips. We examine three methods for pre-training (masked image modeling, contrastive learning, and knowledge distillation) and assess the performance on downstream action recognition tasks. As a result, online pre-training with contrastive learning showed the highest performance in downstream tasks. Our findings suggest that learning from long-form videos can be helpful for action recognition with short videos. + 
+
+ comment: GCCE2024 +
+
+
+
+
+ + ☆ Leveraging Persistent Homology for Differential Diagnosis of Mild + Cognitive Impairment + + +
+ Mild cognitive impairment (MCI) is characterized by subtle changes in +cognitive functions, often associated with disruptions in brain connectivity. +The present study introduces a novel fine-grained analysis to examine +topological alterations in neurodegeneration pertaining to six different brain +networks of MCI subjects (Early/Late MCI). To achieve this, fMRI time series +from two distinct populations are investigated: (i) the publicly accessible +ADNI dataset and (ii) our in-house dataset. The study utilizes sliding window +embedding to convert each fMRI time series into a sequence of 3-dimensional +vectors, facilitating the assessment of changes in regional brain topology. +Distinct persistence diagrams are computed for Betti descriptors of +dimension-0, 1, and 2. Wasserstein distance metric is used to quantify +differences in topological characteristics. We have examined both (i) +ROI-specific inter-subject interactions and (ii) subject-specific inter-ROI +interactions. Further, a new deep learning model is proposed for +classification, achieving a maximum classification accuracy of 95% for the ADNI +dataset and 85% for the in-house dataset. This methodology is further adapted +for the differential diagnosis of MCI sub-types, resulting in a peak accuracy +of 76.5%, 91.1% and 80% in classifying HC Vs. EMCI, HC Vs. LMCI and EMCI Vs. +LMCI, respectively. We showed that the proposed approach surpasses current +state-of-the-art techniques designed for classifying MCI and its sub-types +using fMRI. + +
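The sliding-window (delay) embedding step described above can be sketched as follows; the window dimension and delay are illustrative, and the persistence diagrams would then be computed on the resulting point cloud with a TDA library such as ripser.

# Minimal sketch of the sliding-window (delay) embedding that turns a time
# series into a cloud of 3-dimensional vectors.
import numpy as np

def sliding_window_embedding(ts, dim=3, delay=1):
    """Return an array of shape (n_windows, dim) of delayed samples."""
    n = len(ts) - (dim - 1) * delay
    return np.stack([ts[i:i + n] for i in range(0, dim * delay, delay)], axis=1)

ts = np.sin(np.linspace(0, 8 * np.pi, 200))       # placeholder ROI time series
cloud = sliding_window_embedding(ts, dim=3, delay=4)
print(cloud.shape)  # (192, 3)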
+
+ comment: 16 pages, 6 figures, 3 tables, accepted at International Conference + on Pattern Recognition 2024 +
+
+
+
+
+ + ☆ μgat: Improving Single-Page Document Parsing by Providing Multi-Page + Context ECCV + + +
+ Regesta are catalogs of summaries of other documents and, in some cases, are the only source of information about the content of such full-length documents. For this reason, they are of great interest to scholars in many social and humanities fields. In this work, we focus on Regesta Pontificum Romanum, a large collection of papal registers. Regesta are visually rich documents, where the layout is as important as the text content to convey the contained information through the structure, and are inherently multi-page documents. Among Digital Humanities techniques that can help scholars efficiently exploit regesta and other documental sources in the form of scanned documents, Document Parsing has emerged as a task to process document images and convert them into machine-readable structured representations, usually markup language. However, current models focus on scientific and business documents, and most of them consider only single-paged documents. To overcome this limitation, in this work, we propose μgat, an extension of the recently proposed Document parsing Nougat architecture, which can handle elements spanning beyond the single-page limit. Specifically, we adapt Nougat to process a larger, multi-page context, consisting of the previous and the following page, while parsing the current page. Experimental results, both qualitative and quantitative, demonstrate the effectiveness of our proposed approach also in the case of the challenging Regesta Pontificum Romanorum. + 
+
+ comment: Accepted at ECCV Workshop "AI4DH: Artificial Intelligence for Digital + Humanities" +
+
+
+
+
+ + ☆ RIDE: Boosting 3D Object Detection for LiDAR Point Clouds via + Rotation-Invariant Analysis + + +
+ The rotation robustness property has drawn much attention to point cloud analysis, whereas it still poses a critical challenge in 3D object detection. When subjected to arbitrary rotation, most existing detectors fail to produce expected outputs due to the poor rotation robustness. In this paper, we present RIDE, a pioneering exploration of Rotation-Invariance for the 3D LiDAR-point-based object DEtector, with the key idea of designing rotation-invariant features from LiDAR scenes and then effectively incorporating them into existing 3D detectors. Specifically, we design a bi-feature extractor that extracts (i) object-aware features, which are sensitive to rotation but preserve geometry well, and (ii) rotation-invariant features, which lose some geometric information but are robust to rotation. These two kinds of features complement each other to decode 3D proposals that are robust to arbitrary rotations. Particularly, our RIDE is compatible and easy to plug into the existing one-stage and two-stage 3D detectors, and boosts both detection performance and rotation robustness. Extensive experiments on the standard benchmarks showcase that the mean average precision (mAP) and rotation robustness can be significantly boosted by integrating with our RIDE, with +5.6% mAP and 53% rotation robustness improvement on KITTI, and +5.1% and 28% improvements, respectively, on nuScenes. The code will be available soon. + 
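To illustrate what rotation-invariant point features can look like, the sketch below uses centroid and pairwise distances as simple invariants; the paper's actual feature design differs.

# Minimal sketch of simple rotation-invariant descriptors for a point neighbourhood.
import numpy as np

def rotation_invariant_features(points):
    centroid = points.mean(axis=0)
    d_centroid = np.linalg.norm(points - centroid, axis=1)
    d_pairwise = np.linalg.norm(points[:, None] - points[None, :], axis=-1)
    return np.concatenate([np.sort(d_centroid), np.sort(d_pairwise.ravel())])

rng = np.random.default_rng(4)
pts = rng.normal(size=(16, 3))
theta = 0.7
R = np.array([[np.cos(theta), -np.sin(theta), 0],
              [np.sin(theta),  np.cos(theta), 0],
              [0, 0, 1]])
f1 = rotation_invariant_features(pts)
f2 = rotation_invariant_features(pts @ R.T)
print("max feature change under rotation:", np.abs(f1 - f2).max())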
+
+
+
+
+ + ☆ Can SAR improve RSVQA performance? + + +
+ Remote sensing visual question answering (RSVQA) has been the focus of several studies in recent years, leading to an increasing number of new methods. RSVQA automatically extracts information from satellite images, so far only optical ones, and uses a question to automatically search for the answer in the image and provide it in textual form. In our research, we study whether Synthetic Aperture Radar (SAR) images can be beneficial to this field. We divide our study into three phases which include classification methods and VQA. In the first one, we explore the classification results of SAR alone and investigate the best method to extract information from SAR data. Then, we study the combination of SAR and optical data. In the last phase, we investigate how SAR images and a combination of different modalities behave in RSVQA compared to a method only using optical images. We conclude that adding the SAR modality leads to improved performances, although further research is needed on using SAR data to automatically answer questions, as well as on more balanced datasets. + 
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ MMDRFuse: Distilled Mini-Model with Dynamic Refresh for Multi-Modality + Image Fusion + + +
+ In recent years, Multi-Modality Image Fusion (MMIF) has been applied to many +fields, which has attracted many scholars to endeavour to improve the fusion +performance. However, the prevailing focus has predominantly been on the +architecture design, rather than the training strategies. As a low-level vision +task, image fusion is supposed to quickly deliver output images for observation +and supporting downstream tasks. Thus, superfluous computational and storage +overheads should be avoided. In this work, a lightweight Distilled Mini-Model +with a Dynamic Refresh strategy (MMDRFuse) is proposed to achieve this +objective. To pursue model parsimony, an extremely small convolutional network +with a total of 113 trainable parameters (0.44 KB) is obtained by three +carefully designed supervisions. First, digestible distillation is constructed +by emphasising external spatial feature consistency, delivering soft +supervision with balanced details and saliency for the target network. Second, +we develop a comprehensive loss to balance the pixel, gradient, and perception +clues from the source images. Third, an innovative dynamic refresh training +strategy is used to collaborate history parameters and current supervision +during training, together with an adaptive adjust function to optimise the +fusion network. Extensive experiments on several public datasets demonstrate +that our method exhibits promising advantages in terms of model efficiency and +complexity, with superior performance in multiple image fusion tasks and +downstream pedestrian detection application. The code of this work is publicly +available at https://github.com/yanglinDeng/MMDRFuse. + +
+
+ comment: 10 pages, 8 figures, accepted by ACM International Conference on Multimedia 2024 (Oral) +
+
+
+
+
+ + ☆ Transfer Learning from Simulated to Real Scenes for Monocular 3D Object + Detection ECCV'24 + + +
+ Accurately detecting 3D objects from monocular images in dynamic roadside +scenarios remains a challenging problem due to varying camera perspectives and +unpredictable scene conditions. This paper introduces a two-stage training +strategy to address these challenges. Our approach initially trains a model on +the large-scale synthetic dataset, RoadSense3D, which offers a diverse range of +scenarios for robust feature learning. Subsequently, we fine-tune the model on +a combination of real-world datasets to enhance its adaptability to practical +conditions. Experimental results of the Cube R-CNN model on challenging public +benchmarks show a remarkable improvement in detection performance, with a mean +average precision rising from 0.26 to 12.76 on the TUM Traffic A9 Highway +dataset and from 2.09 to 6.60 on the DAIR-V2X-I dataset when performing +transfer learning. Code, data, and qualitative video results are available on +the project website: https://roadsense3d.github.io. + +
+
+ comment: 18 pages. Accepted for ECVA European Conference on Computer Vision + 2024 (ECCV'24) +
+
+
+
+
+ + ☆ CSAD: Unsupervised Component Segmentation for Logical Anomaly Detection + + +
+ To improve logical anomaly detection, some previous works have integrated +segmentation techniques with conventional anomaly detection methods. Although +these methods are effective, they frequently lead to unsatisfactory +segmentation results and require manual annotations. To address these +drawbacks, we develop an unsupervised component segmentation technique that +leverages foundation models to autonomously generate training labels for a +lightweight segmentation network without human labeling. Integrating this new +segmentation technique with our proposed Patch Histogram module and the +Local-Global Student-Teacher (LGST) module, we achieve a detection AUROC of +95.3% in the MVTec LOCO AD dataset, which surpasses previous SOTA methods. +Furthermore, our proposed method provides lower latency and higher throughput +than most existing approaches. + +
+
+
+
+
+ + ☆ Can Visual Language Models Replace OCR-Based Visual Question Answering + Pipelines in Production? A Case Study in Retail + + +
+ Most production-level deployments for Visual Question Answering (VQA) tasks are still built as processing pipelines of independent steps including image pre-processing, object- and text detection, Optical Character Recognition (OCR) and (mostly supervised) object classification. However, the recent advances in vision Foundation Models [25] and Vision Language Models (VLMs) [23] raise the question of whether these custom-trained, multi-step approaches can be replaced with pre-trained, single-step VLMs. This paper analyzes the performance and limits of various VLMs in the context of VQA and OCR [5, 9, 12] tasks in a production-level scenario. Using data from the Retail-786k [10] dataset, we investigate the capabilities of pre-trained VLMs to answer detailed questions about advertised products in images. Our study includes two commercial models, GPT-4V [16] and GPT-4o [17], as well as four open-source models: InternVL [5], LLaVA 1.5 [12], LLaVA-NeXT [13], and CogAgent [9]. Our initial results show that, in general, there is no large performance gap between open-source and commercial models. However, we observe a strong task-dependent variance in VLM performance: while most models are able to answer questions regarding the product brand and price with high accuracy, they completely fail to correctly identify the specific product name or discount. This indicates that VLMs struggle to solve fine-grained classification tasks as well as to model the more abstract concept of discounts. + 
+
+
+
+
+ + ☆ Geometry-guided Feature Learning and Fusion for Indoor Scene + Reconstruction ICCV2023 + + +
+ In addition to color and textural information, geometry provides important +cues for 3D scene reconstruction. However, current reconstruction methods only +include geometry at the feature level thus not fully exploiting the geometric +information. + In contrast, this paper proposes a novel geometry integration mechanism for +3D scene reconstruction. Our approach incorporates 3D geometry at three levels, +i.e. feature learning, feature fusion, and network supervision. First, +geometry-guided feature learning encodes geometric priors to contain +view-dependent information. Second, a geometry-guided adaptive feature fusion +is introduced which utilizes the geometric priors as a guidance to adaptively +generate weights for multiple views. Third, at the supervision level, taking +the consistency between 2D and 3D normals into account, a consistent 3D normal +loss is designed to add local constraints. + Large-scale experiments are conducted on the ScanNet dataset, showing that +volumetric methods with our geometry integration mechanism outperform +state-of-the-art methods quantitatively as well as qualitatively. Volumetric +methods with ours also show good generalization on the 7-Scenes and TUM RGB-D +datasets. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ ES-PTAM: Event-based Stereo Parallel Tracking and Mapping + + +
+ Visual Odometry (VO) and SLAM are fundamental components for spatial perception in mobile robots. Despite enormous progress in the field, current VO/SLAM systems are limited by their sensors' capability. Event cameras are novel visual sensors that offer advantages to overcome the limitations of standard cameras, enabling robots to expand their operating range to challenging scenarios, such as high-speed motion and high dynamic range illumination. We propose a novel event-based stereo VO system by combining two ideas: a correspondence-free mapping module that estimates depth by maximizing ray density fusion and a tracking module that estimates camera poses by maximizing edge-map alignment. We evaluate the system comprehensively on five real-world datasets, spanning a variety of camera types (manufacturers and spatial resolutions) and scenarios (driving, flying drone, hand-held, egocentric, etc). The quantitative and qualitative results demonstrate that our method outperforms the state of the art in the majority of the test sequences by a margin, e.g., trajectory error reduction of 45% on the RPG dataset, 61% on the DSEC dataset, and 21% on the TUM-VIE dataset. To benefit the community and foster research on event-based perception systems, we release the source code and results: https://github.com/tub-rip/ES-PTAM + 
+
+ comment: 17 pages, 7 figures, 4 tables, https://github.com/tub-rip/ES-PTAM +
+
+
+
+
+ + ☆ On the Benefits of Visual Stabilization for Frame- and Event-based + Perception + + +
+ Vision-based perception systems are typically exposed to large orientation changes in different robot applications. In such conditions, their performance might be compromised due to the inherent complexity of processing data captured under challenging motion. Integration of mechanical stabilizers to compensate for the camera rotation is not always possible due to the robot payload constraints. This paper presents a processing-based stabilization approach to compensate for the camera's rotational motion both on events and on frames (i.e., images). Assuming that the camera's attitude is available, we evaluate the benefits of stabilization in two perception applications: feature tracking and estimating the translation component of the camera's ego-motion. The validation is performed using synthetic data and sequences from well-known event-based vision datasets. The experiments unveil that stabilization can improve feature tracking and camera ego-motion estimation accuracy by 27.37% and 34.82%, respectively. Concurrently, stabilization can reduce the processing time of computing the camera's linear velocity by at least 25%. Code is available at https://github.com/tub-rip/visual_stabilization + 
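Assuming the camera attitude is known, rotational stabilisation of event coordinates reduces to applying the homography K R^T K^-1; the sketch below uses illustrative intrinsics and a single rotation, not the paper's full pipeline.

# Minimal sketch of rotational stabilisation applied to event pixel coordinates.
import numpy as np

K = np.array([[320.0, 0, 160], [0, 320.0, 120], [0, 0, 1]])
theta = np.deg2rad(5.0)
R = np.array([[np.cos(theta), -np.sin(theta), 0],
              [np.sin(theta),  np.cos(theta), 0],
              [0, 0, 1]])                       # attitude at the event time

def stabilize_events(xy, K, R):
    """Map pixel coordinates to the reference (de-rotated) frame via K R^T K^-1."""
    H = K @ R.T @ np.linalg.inv(K)
    pts = np.hstack([xy, np.ones((len(xy), 1))])
    warped = (H @ pts.T).T
    return warped[:, :2] / warped[:, 2:3]

events_xy = np.random.default_rng(5).uniform([0, 0], [320, 240], size=(1000, 2))
stabilized = stabilize_events(events_xy, K, R)
print(stabilized[:3])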
+
+ comment: 8 pages, 4 figures, 4 tables, + https://github.com/tub-rip/visual_stabilization +
+
+
+
+
+ + ☆ Hierarchical Visual Categories Modeling: A Joint Representation Learning + and Density Estimation Framework for Out-of-Distribution Detection ICCV2023 + + +
+ Detecting out-of-distribution inputs for visual recognition models has become +critical in safe deep learning. This paper proposes a novel hierarchical visual +category modeling scheme to separate out-of-distribution data from +in-distribution data through joint representation learning and statistical +modeling. We learn a mixture of Gaussian models for each in-distribution +category. There are many Gaussian mixture models to model different visual +categories. With these Gaussian models, we design an in-distribution score +function by aggregating multiple Mahalanobis-based metrics. We don't use any +auxiliary outlier data as training samples, which may hurt the generalization +ability of out-of-distribution detection algorithms. We split the ImageNet-1k +dataset into ten folds randomly. We use one fold as the in-distribution dataset +and the others as out-of-distribution datasets to evaluate the proposed method. +We also conduct experiments on seven popular benchmarks, including CIFAR, +iNaturalist, SUN, Places, Textures, ImageNet-O, and OpenImage-O. Extensive +experiments indicate that the proposed method outperforms state-of-the-art +algorithms clearly. Meanwhile, we find that our visual representation has a +competitive performance when compared with features learned by classical +methods. These results demonstrate that the proposed method hasn't weakened the +discriminative ability of visual recognition models and keeps high efficiency +in detecting out-of-distribution samples. + +
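A minimal single-Gaussian-per-class variant of the Mahalanobis-based in-distribution score is sketched below; the paper uses Gaussian mixtures per category and aggregates several metrics, which is omitted here, and the features are synthetic placeholders.

# Minimal sketch of a Mahalanobis-based in-distribution score with one Gaussian per class.
import numpy as np

def fit_class_gaussians(features, labels, eps=1e-3):
    stats = {}
    for c in np.unique(labels):
        f = features[labels == c]
        mu = f.mean(axis=0)
        cov = np.cov(f, rowvar=False) + eps * np.eye(f.shape[1])
        stats[c] = (mu, np.linalg.inv(cov))
    return stats

def id_score(x, stats):
    """Higher score = more in-distribution (negative min Mahalanobis distance)."""
    d = [np.sqrt((x - mu) @ prec @ (x - mu)) for mu, prec in stats.values()]
    return -min(d)

rng = np.random.default_rng(6)
feats = np.vstack([rng.normal(0, 1, (100, 8)), rng.normal(3, 1, (100, 8))])
labels = np.array([0] * 100 + [1] * 100)
stats = fit_class_gaussians(feats, labels)
print("ID sample:", id_score(rng.normal(0, 1, 8), stats))
print("OOD sample:", id_score(rng.normal(10, 1, 8), stats))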
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Temporal Attention for Cross-View Sequential Image Localization IROS 2024 + + +
+ This paper introduces a novel approach to enhancing cross-view localization, +focusing on the fine-grained, sequential localization of street-view images +within a single known satellite image patch, a significant departure from +traditional one-to-one image retrieval methods. By expanding to sequential +image fine-grained localization, our model, equipped with a novel Temporal +Attention Module (TAM), leverages contextual information to significantly +improve sequential image localization accuracy. Our method shows substantial +reductions in both mean and median localization errors on the Cross-View Image +Sequence (CVIS) dataset, outperforming current state-of-the-art single-image +localization techniques. Additionally, by adapting the KITTI-CVL dataset into +sequential image sets, we not only offer a more realistic dataset for future +research but also demonstrate our model's robust generalization capabilities +across varying times and areas, evidenced by a 75.3% reduction in mean distance +error in cross-view sequential image localization. + +
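For readers unfamiliar with the mechanism, the sketch below shows plain scaled dot-product attention across a sequence of per-frame features, the building block a temporal attention module extends; the projection weights are random placeholders rather than the TAM's learned design.

# Minimal sketch of attention across a temporal sequence of features.
import numpy as np

def temporal_attention(seq, Wq, Wk, Wv):
    """seq: (T, D) per-frame features -> (T, D) temporally contextualised features."""
    Q, Kmat, V = seq @ Wq, seq @ Wk, seq @ Wv
    scores = Q @ Kmat.T / np.sqrt(Q.shape[-1])
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ V

rng = np.random.default_rng(9)
T, D = 7, 32
seq = rng.normal(size=(T, D))
Wq, Wk, Wv = (rng.normal(scale=D ** -0.5, size=(D, D)) for _ in range(3))
print(temporal_attention(seq, Wq, Wk, Wv).shape)  # (7, 32)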
+
+ comment: Accepted to IROS 2024 +
+
+
+
+
+ + ☆ TagOOD: A Novel Approach to Out-of-Distribution Detection via + Vision-Language Representations and Class Center Learning + + +
+ Multimodal fusion, leveraging data like vision and language, is rapidly +gaining traction. This enriched data representation improves performance across +various tasks. Existing methods for out-of-distribution (OOD) detection, a +critical area where AI models encounter unseen data in real-world scenarios, +rely heavily on whole-image features. These image-level features can include +irrelevant information that hinders the detection of OOD samples, ultimately +limiting overall performance. In this paper, we propose \textbf{TagOOD}, a +novel approach for OOD detection that leverages vision-language representations +to achieve label-free object feature decoupling from whole images. This +decomposition enables a more focused analysis of object semantics, enhancing +OOD detection performance. Subsequently, TagOOD trains a lightweight network on +the extracted object features to learn representative class centers. These +centers capture the central tendencies of IND object classes, minimizing the +influence of irrelevant image features during OOD detection. Finally, our +approach efficiently detects OOD samples by calculating distance-based metrics +as OOD scores between learned centers and test samples. We conduct extensive +experiments to evaluate TagOOD on several benchmark datasets and demonstrate +its superior performance compared to existing OOD detection methods. This work +presents a novel perspective for further exploration of multimodal information +utilization in OOD detection, with potential applications across various tasks. + +
+
+ comment: Accepted by ACMMM2024 +
+
+
+
+
+ + ☆ Generalization Capabilities of Neural Cellular Automata for Medical + Image Segmentation: A Robust and Lightweight Approach + + +
+ In the field of medical imaging, the U-Net architecture, along with its +variants, has established itself as a cornerstone for image segmentation tasks, +particularly due to its strong performance when trained on limited datasets. +Despite its impressive performance on identically distributed (in-domain) data, +U-Nets exhibit a significant decline in performance when tested on data that +deviates from the training distribution, out-of-distribution (out-of-domain) +data. Current methodologies predominantly address this issue by employing +generalization techniques that hinge on various forms of regularization, which +have demonstrated moderate success in specific scenarios. This paper, however, +ventures into uncharted territory by investigating the implications of +utilizing models that are smaller by three orders of magnitude (i.e., x1000) +compared to a conventional U-Net. A reduction of this size in U-net parameters +typically adversely affects both in-domain and out-of-domain performance, +possibly due to a significantly reduced receptive field. To circumvent this +issue, we explore the concept of Neural Cellular Automata (NCA), which, despite +its simpler model structure, can attain larger receptive fields through +recursive processes. Experimental results on two distinct datasets reveal that +NCA outperforms traditional methods in terms of generalization, while still +maintaining a commendable IID performance. + +
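A minimal Neural Cellular Automata update step is sketched below to show how recursion grows the effective receptive field despite a tiny per-cell rule; the weights are random rather than learned, and the perception kernel is only illustrative.

# Minimal sketch of one NCA update: fixed perception stencil + tiny per-cell rule.
import numpy as np

def perceive(state, kernel):
    """Depthwise 3x3 convolution with zero padding (state: C x H x W)."""
    C, H, W = state.shape
    padded = np.pad(state, ((0, 0), (1, 1), (1, 1)))
    out = np.zeros_like(state)
    for dy in range(3):
        for dx in range(3):
            out += kernel[dy, dx] * padded[:, dy:dy + H, dx:dx + W]
    return out

rng = np.random.default_rng(7)
C = 8
sobel_like = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=float)
W1 = rng.normal(scale=0.1, size=(C, 2 * C))        # per-cell update weights

state = rng.normal(size=(C, 32, 32))
for _ in range(10):                                 # recursion enlarges the receptive field
    percept = np.concatenate([state, perceive(state, sobel_like)], axis=0)
    update = np.tanh(np.einsum("oc,chw->ohw", W1, percept))
    state = state + 0.1 * update
print(state.shape)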
+
+
+
+
+ + ☆ Divide, Conquer and Combine: A Training-Free Framework for + High-Resolution Image Perception in Multimodal Large Language Models + + +
+ Multimodal large language models (MLLMs) have experienced significant +advancements recently, but still struggle to recognize and interpret intricate +details in high-resolution (HR) images effectively. While state-of-the-art +(SOTA) MLLMs claim to process images at 4K resolution, existing MLLM benchmarks +only support up to 2K, leaving the capabilities of SOTA models on true HR +images largely untested. Furthermore, existing methods for enhancing HR image +perception in MLLMs rely on computationally expensive visual instruction +tuning. To address these limitations, we introduce HR-Bench, the first +deliberately designed benchmark to rigorously evaluate MLLM performance on +4K&8K images. Through extensive experiments, we demonstrate that while +downsampling HR images leads to vision information loss, leveraging +complementary modalities, e.g., text, can effectively compensate for this loss. +Building upon this insight, we propose Divide, Conquer and Combine (DC$^2$), a +novel training-free framework for enhancing MLLM perception of HR images. +DC$^2$ follows a three-staged approach: 1) Divide: recursively partitioning the +HR image into patches and merging similar patches to minimize computational +overhead, 2) Conquer: leveraging the MLLM to generate accurate textual +descriptions for each image patch, and 3) Combine: utilizing the generated text +descriptions to enhance the MLLM's understanding of the overall HR image. +Extensive experiments show that: 1) the SOTA MLLM achieves 63% accuracy, which +is markedly lower than the 87% accuracy achieved by humans on HR-Bench; 2) our +DC$^2$ brings consistent and significant improvements (a relative increase of ++6% on HR-Bench and +8% on general multimodal benchmarks). The benchmark and +code will be released to facilitate the multimodal R&D community. + +
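The Divide stage can be sketched roughly as below: recursive quadrant splitting followed by merging of patches with similar low-resolution thumbnails; the thresholds and similarity test are illustrative placeholders, not the paper's exact procedure.

# Minimal sketch of recursive patch partitioning and similarity-based merging.
import numpy as np

def divide(img, min_size=256):
    h, w = img.shape[:2]
    if max(h, w) <= min_size:
        return [img]
    hh, hw = h // 2, w // 2
    quads = [img[:hh, :hw], img[:hh, hw:], img[hh:, :hw], img[hh:, hw:]]
    return [p for q in quads for p in divide(q, min_size)]

def thumbnail(patch, size=8):
    ys = np.linspace(0, patch.shape[0] - 1, size).astype(int)
    xs = np.linspace(0, patch.shape[1] - 1, size).astype(int)
    return patch[np.ix_(ys, xs)].astype(float)

def merge_similar(patches, tol=10.0):
    kept, thumbs = [], []
    for p in patches:
        t = thumbnail(p)
        if all(np.abs(t - u).mean() > tol for u in thumbs):
            kept.append(p)
            thumbs.append(t)
    return kept

img = np.random.default_rng(8).integers(0, 255, size=(1024, 1024))
patches = divide(img)
print(len(patches), "patches ->", len(merge_similar(patches)), "after merging")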
+
+
+
+
+ + ☆ Latent Relationship Mining of Glaucoma Biomarkers: a TRI-LSTM based Deep + Learning + + +
+ In recent years, a significant amount of research has been conducted on applying deep learning methods for glaucoma classification and detection. However, the explainability of those established machine learning models remains a big concern. In this research, in contrast, we draw on concepts from cognitive science and study how ophthalmologists judge glaucoma detection. Simulating experts' efforts, we propose a hierarchical decision-making system, centered around a holistic set of carefully designed biomarker-oriented machine learning models. While biomarkers represent the key indicators of how ophthalmologists identify glaucoma, they usually exhibit latent inter-relations. We thus construct a time series model, named TRI-LSTM, capable of calculating and uncovering potential and latent relationships among various biomarkers of glaucoma. Our model is among the first efforts to explore the intrinsic connections among glaucoma biomarkers. We monitor temporal relationships in patients' disease states over time to capture and retain the progression of disease-relevant clinical information from prior visits, thereby enriching the biomarkers' potential relationships. Extensive experiments on a real-world dataset have demonstrated the effectiveness of the proposed model. + 
+
+ comment: 9 pages, 4 images +
+
+
+
+
+ + ☆ ConsistencyTrack: A Robust Multi-Object Tracker with a Generation + Strategy of Consistency Model + + +
+ Multi-object tracking (MOT) is a critical technology in computer vision,
+designed to detect multiple targets in video sequences and assign each target a
+unique ID per frame. Existing MOT methods excel at accurately tracking multiple
+objects in real-time across various scenarios. However, these methods still
+face challenges such as poor noise resistance and frequent ID switches. In this
+research, we propose ConsistencyTrack, a novel joint detection and tracking
+(JDT) framework that formulates detection and association as a denoising
+diffusion process on perturbed bounding boxes. This progressive denoising
+strategy significantly improves the model's noise resistance. During the
+training phase, paired object boxes within two adjacent frames are diffused
+from ground-truth boxes to a random distribution, and then the model learns to
+detect and track by reversing this process. During inference, the model refines
+randomly generated boxes into detection and tracking results through minimal
+denoising steps. ConsistencyTrack also introduces an innovative target
+association strategy to address target occlusion. Experiments on the MOT17 and
+DanceTrack datasets demonstrate that ConsistencyTrack outperforms the other
+compared methods, in particular surpassing DiffusionTrack in inference speed
+and other performance metrics. Our code is available at
+https://github.com/Tankowa/ConsistencyTrack.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2308.09905 by other authors +
+
+
+
+
+ + ☆ Kangaroo: A Powerful Video-Language Model Supporting Long-context Video + Input + + +
+ Rapid advancements have been made in extending Large Language Models (LLMs)
+to Large Multi-modal Models (LMMs). However, extending the input modality of
+LLMs to video data remains a challenging endeavor, especially for long videos.
+Due to insufficient access to large-scale high-quality video data and the
+excessive compression of visual features, current methods exhibit limitations
+in effectively processing long videos. In this paper, we introduce Kangaroo, a
+powerful Video LMM aimed at addressing these challenges. Confronted with the
+issue of inadequate training data, we develop a data curation system to build a
+large-scale dataset with high-quality annotations for vision-language
+pre-training and instruction tuning. In addition, we design a curriculum
+training pipeline with gradually increasing resolution and number of input
+frames to accommodate long videos. Evaluation results demonstrate that, with 8B
+parameters, Kangaroo achieves state-of-the-art performance on a variety of
+video understanding benchmarks while exhibiting competitive results on others.
+Particularly, on benchmarks specialized for long videos, Kangaroo surpasses
+some larger models with over 10B parameters as well as proprietary models.
+
+
+
+
+
+ + ☆ Ray-Distance Volume Rendering for Neural Scene Reconstruction ECCV2024 + + +
+ Existing methods in neural scene reconstruction utilize the Signed Distance +Function (SDF) to model the density function. However, in indoor scenes, the +density computed from the SDF for a sampled point may not consistently reflect +its real importance in volume rendering, often due to the influence of +neighboring objects. To tackle this issue, our work proposes a novel approach +for indoor scene reconstruction, which instead parameterizes the density +function with the Signed Ray Distance Function (SRDF). Firstly, the SRDF is +predicted by the network and transformed to a ray-conditioned density function +for volume rendering. We argue that the ray-specific SRDF only considers the +surface along the camera ray, from which the derived density function is more +consistent to the real occupancy than that from the SDF. Secondly, although +SRDF and SDF represent different aspects of scene geometries, their values +should share the same sign indicating the underlying spatial occupancy. +Therefore, this work introduces a SRDF-SDF consistency loss to constrain the +signs of the SRDF and SDF outputs. Thirdly, this work proposes a +self-supervised visibility task, introducing the physical visibility geometry +to the reconstruction task. The visibility task combines prior from predicted +SRDF and SDF as pseudo labels, and contributes to generating more accurate 3D +geometry. Our method implemented with different representations has been +validated on indoor datasets, achieving improved performance in both +reconstruction and view synthesis. + +
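One plausible way to encode the SRDF-SDF sign constraint mentioned above is a hinge on the product of the two predictions; this is an illustrative guess at the form of such a consistency term, not the paper's exact loss:

```python
import torch

def sign_consistency_loss(srdf, sdf):
    """Penalize sample points where the predicted SRDF and SDF disagree in
    sign; their product should be non-negative if both encode the same spatial
    occupancy. One plausible formulation, not necessarily the paper's loss."""
    return torch.relu(-srdf * sdf).mean()

srdf = torch.tensor([0.3, -0.1, 0.2])
sdf = torch.tensor([0.5, 0.2, -0.4])
print(sign_consistency_loss(srdf, sdf))   # only the mismatched pairs contribute
```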
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ A Simple Baseline with Single-encoder for Referring Image Segmentation + + +
+ Referring image segmentation (RIS) requires dense vision-language +interactions between visual pixels and textual words to segment objects based +on a given description. However, commonly adapted dual-encoders in RIS, e.g., +Swin transformer and BERT (uni-modal encoders) or CLIP (a multi-modal +dual-encoder), lack dense multi-modal interactions during pre-training, leading +to a gap with a pixel-level RIS task. To bridge this gap, existing RIS methods +often rely on multi-modal fusion modules that interact two encoders, but this +approach leads to high computational costs. In this paper, we present a novel +RIS method with a single-encoder, i.e., BEiT-3, maximizing the potential of +shared self-attention across all framework components. This enables seamless +interactions of two modalities from input to final prediction, producing +granularly aligned multi-modal features. Furthermore, we propose lightweight +yet effective decoder modules, a Shared FPN and a Shared Mask Decoder, which +contribute to the high efficiency of our model. Our simple baseline with a +single encoder achieves outstanding performances on the RIS benchmark datasets +while maintaining computational efficiency, compared to the most recent SoTA +methods based on dual-encoders. + +
+
+ comment: ArXiv pre-print +
+
+
+
+
+ + ☆ Depth-Weighted Detection of Behaviours of Risk in People with Dementia + using Cameras + + +
+ The behavioural and psychological symptoms of dementia, such as agitation and
+aggression, present a significant health and safety risk in residential care
+settings. Many care facilities have video cameras in place for digital
+monitoring of public spaces, which can be leveraged to develop an automated
+behaviours-of-risk detection system that can alert the staff to enable timely
+intervention and prevent the situation from escalating. However, one of the
+challenges in our previous study was the presence of false alarms due to
+obstruction of view by activities happening close to the camera. To address
+this issue, we proposed a novel depth-weighted loss function to train a
+customized convolutional autoencoder to enforce equivalent importance for
+events happening both near and far from the cameras, thus helping to reduce
+false alarms and making the method more suitable for real-world deployment. The
+proposed method was trained using data from nine participants with dementia
+across three cameras situated in a specialized dementia unit and achieved an
+area under the receiver operating characteristic curve of $0.852$, $0.81$, and
+$0.768$ for the three cameras. Ablation analysis was conducted for the
+individual components of the proposed method, and the performance of the
+proposed method was investigated for participant-specific and sex-specific
+behaviours-of-risk detection. The proposed method performed reasonably well in
+detecting behaviours of risk in people with dementia, motivating further
+research toward the development of a behaviours-of-risk detection system
+suitable for deployment in video surveillance systems in care facilities.
+
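A minimal sketch of what such a depth-weighted reconstruction loss could look like is given below; the linear weighting function is an assumption made for illustration, not the weighting used in the study:

```python
import torch

def depth_weighted_mse(x, x_hat, depth, max_depth=10.0):
    """Reconstruction error weighted by per-pixel depth so that events far from
    the camera contribute comparably to events close to it. The linear
    weighting (farther pixels receive larger weights) is an illustrative
    choice, not necessarily the function used in the paper."""
    w = (depth / max_depth).clamp(0.0, 1.0)    # ~0 near the camera, ~1 far away
    return (w * (x - x_hat) ** 2).mean()

x = torch.rand(1, 1, 64, 64)        # input frame
x_hat = torch.rand(1, 1, 64, 64)    # autoencoder reconstruction
depth = torch.rand(1, 1, 64, 64) * 10.0
print(depth_weighted_mse(x, x_hat, depth))
```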
+
+
+
+
+ + ☆ Continual-learning-based framework for structural damage recognition + + +
+ Multi-damage is common in reinforced concrete structures and leads to a
+requirement for a large number of neural networks, parameters, and data storage
+if a convolutional neural network (CNN) is used for damage recognition. In
+addition, a conventional CNN experiences catastrophic forgetting and training
+inefficiency as the number of tasks increases during continual learning,
+leading to a large accuracy decrease on previously learned tasks. To address
+these problems, this study proposes a continual-learning-based damage
+recognition model (CLDRM) which integrates the learning-without-forgetting
+continual learning method into the ResNet-34 architecture for the recognition
+of damages in RC structures as well as relevant structural components. Three
+experiments for four recognition tasks were designed to validate the
+feasibility and effectiveness of the CLDRM framework. In this way, it reduces
+both the prediction time and data storage by about 75% across the four tasks of
+continual learning. By gradual feature fusion, CLDRM outperformed other methods
+and achieved high accuracy in damage recognition and classification. As the
+number of recognition tasks increased, CLDRM also experienced a smaller
+accuracy decrease on previously learned tasks. Results indicate that the CLDRM
+framework successfully performs damage recognition and classification with
+reasonable accuracy and effectiveness.
+
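The learning-without-forgetting component referred to above is commonly implemented as a distillation term on the old-task outputs; a generic sketch (temperature and weighting are illustrative defaults, not the study's settings) is:

```python
import torch
import torch.nn.functional as F

def lwf_loss(new_logits, new_labels, old_logits, old_logits_frozen, T=2.0, lam=1.0):
    """Generic Learning-without-Forgetting objective: cross-entropy on the new
    task plus a distillation term that keeps the old-task outputs close to
    those of the frozen pre-update model. T and lam are illustrative defaults."""
    ce = F.cross_entropy(new_logits, new_labels)
    kd = F.kl_div(
        F.log_softmax(old_logits / T, dim=1),
        F.softmax(old_logits_frozen / T, dim=1),
        reduction="batchmean",
    ) * (T * T)
    return ce + lam * kd

new_logits = torch.randn(8, 5)            # current model, new-task head
new_labels = torch.randint(0, 5, (8,))
old_logits = torch.randn(8, 10)           # current model, old-task head
old_logits_frozen = torch.randn(8, 10)    # frozen copy recorded before training
print(lwf_loss(new_logits, new_labels, old_logits, old_logits_frozen))
```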
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ RoboSense: Large-scale Dataset and Benchmark for Multi-sensor Low-speed + Autonomous Driving + + +
+ Robust object detection and tracking under an arbitrary field of view is
+challenging yet essential for the development of Autonomous Vehicle technology.
+With the growing demand for unmanned function vehicles, near-field scene
+understanding becomes an important research topic in the area of low-speed
+autonomous driving. Due to the complexity of driving conditions and the
+diversity of near obstacles, such as blind spots and high occlusion, the
+perception capability for the near-field environment is still inferior to its
+farther counterpart. To further enhance the intelligent ability of unmanned
+vehicles, in this paper, we construct a multimodal data collection platform
+based on 3 main types of sensors (Camera, LiDAR and Fisheye), which supports
+flexible sensor configurations to enable a dynamic field of view for the ego
+vehicle, either a global view or a local view. Meanwhile, a large-scale
+multi-sensor dataset, named RoboSense, is built to facilitate near-field scene
+understanding. RoboSense contains more than 133K synchronized data samples with
+1.4M 3D bounding boxes and IDs annotated in the full $360^{\circ}$ view,
+forming 216K trajectories across 7.6K temporal sequences. It has $270\times$
+and $18\times$ as many annotations of near-field obstacles within 5$m$ as
+previous single-vehicle datasets such as KITTI and nuScenes. Moreover, we
+define a novel matching criterion for near-field 3D perception and prediction
+metrics. Based on RoboSense, we formulate 6 popular tasks to facilitate the
+future development of related research, where detailed data analysis as well as
+benchmarks are also provided accordingly.
+
+
+
+
+
+ + ☆ NAS-BNN: Neural Architecture Search for Binary Neural Networks + + +
+ Binary Neural Networks (BNNs) have gained extensive attention for their +superior inferencing efficiency and compression ratio compared to traditional +full-precision networks. However, due to the unique characteristics of BNNs, +designing a powerful binary architecture is challenging and often requires +significant manpower. A promising solution is to utilize Neural Architecture +Search (NAS) to assist in designing BNNs, but current NAS methods for BNNs are +relatively straightforward and leave a performance gap between the searched +models and manually designed ones. To address this gap, we propose a novel +neural architecture search scheme for binary neural networks, named NAS-BNN. We +first carefully design a search space based on the unique characteristics of +BNNs. Then, we present three training strategies, which significantly enhance +the training of supernet and boost the performance of all subnets. Our +discovered binary model family outperforms previous BNNs for a wide range of +operations (OPs) from 20M to 200M. For instance, we achieve 68.20% top-1 +accuracy on ImageNet with only 57M OPs. In addition, we validate the +transferability of these searched BNNs on the object detection task, and our +binary detectors with the searched BNNs achieve a novel state-of-the-art +result, e.g., 31.6% mAP with 370M OPs, on MS COCO dataset. The source code and +models will be released at https://github.com/VDIGPKU/NAS-BNN. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Dynamic Reconstruction from Neuromorphic Data + + +
+ Unlike traditional cameras, which synchronously register pixel intensity,
+neuromorphic sensors only register `changes' at pixels where a change is
+occurring asynchronously. This enables neuromorphic sensors to sample at the
+micro-second level and efficiently capture the dynamics. Since only sequences
+of asynchronous event changes are recorded rather than brightness intensities
+over time, many traditional image processing techniques cannot be directly
+applied. Furthermore, existing approaches, including the ones recently
+introduced by the authors, use traditional images combined with neuromorphic
+event data to carry out reconstructions. The aim of this work is to introduce
+an optimization-based approach to reconstruct images and dynamics only from the
+neuromorphic event data without any additional knowledge of the events. Each
+pixel is modeled temporally. The experimental results on real data highlight
+the efficacy of the presented approach, paving the way for efficient and
+accurate processing of neuromorphic sensor data in real-world applications.
+
+
+
+
+
+ + ☆ Hand1000: Generating Realistic Hands from Text with Only 1,000 Images + + +
+ Text-to-image generation models have achieved remarkable advancements in +recent years, aiming to produce realistic images from textual descriptions. +However, these models often struggle with generating anatomically accurate +representations of human hands. The resulting images frequently exhibit issues +such as incorrect numbers of fingers, unnatural twisting or interlacing of +fingers, or blurred and indistinct hands. These issues stem from the inherent +complexity of hand structures and the difficulty in aligning textual +descriptions with precise visual depictions of hands. To address these +challenges, we propose a novel approach named Hand1000 that enables the +generation of realistic hand images with target gesture using only 1,000 +training samples. The training of Hand1000 is divided into three stages with +the first stage aiming to enhance the model's understanding of hand anatomy by +using a pre-trained hand gesture recognition model to extract gesture +representation. The second stage further optimizes text embedding by +incorporating the extracted hand gesture representation, to improve alignment +between the textual descriptions and the generated hand images. The third stage +utilizes the optimized embedding to fine-tune the Stable Diffusion model to +generate realistic hand images. In addition, we construct the first publicly +available dataset specifically designed for text-to-hand image generation. +Based on the existing hand gesture recognition dataset, we adopt advanced image +captioning models and LLaMA3 to generate high-quality textual descriptions +enriched with detailed gesture information. Extensive experiments demonstrate +that Hand1000 significantly outperforms existing models in producing +anatomically correct hand images while faithfully representing other details in +the text, such as faces, clothing, and colors. + +
+
+ comment: Project page https://haozhuo-zhang.github.io/Hand1000-project-page/ +
+
+
+
+
+ + ☆ Avoiding Generative Model Writer's Block With Embedding Nudging + + +
+ Generative image models have, since their introduction, become a global
+phenomenon. From new art forms becoming possible to new vectors of abuse, many
+new capabilities have become available. One of the challenging issues with
+generative models is controlling the generation process, especially to prevent
+specific generation classes or instances. There are several reasons why one may
+want to control the output of generative models, ranging from privacy and
+safety concerns to application limitations or user preferences.
+ To address memorization and privacy challenges, there has been considerable
+research dedicated to filtering prompts or filtering the outputs of these
+models. What all these solutions have in common is that, at the end of the day,
+they stop the model from producing anything, hence limiting the usability of
+the model. In this paper, we propose a method for addressing this usability
+issue by making it possible to steer away from unwanted concepts (when detected
+in the model's output) while still generating outputs. In particular, we focus
+on latent diffusion image generative models and how one can prevent them from
+generating particular images while still generating similar images with limited
+overhead.
+ We focus on mitigating issues like image memorization, demonstrating our
+technique's effectiveness through qualitative and quantitative evaluations. Our
+method successfully prevents the generation of memorized training images while
+maintaining comparable image quality and relevance to the unmodified model.
+
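The steering idea can be illustrated by removing part of an embedding's projection onto a detected concept direction; this toy sketch only conveys the intuition and does not reproduce the paper's update rule:

```python
import torch

def nudge_away(embedding, concept_embedding, strength=0.5):
    """Steer an embedding away from a detected unwanted concept by removing
    part of its projection onto the concept direction. Purely illustrative of
    the 'nudging' idea; not the paper's exact rule."""
    direction = concept_embedding / concept_embedding.norm()
    projection = (embedding @ direction) * direction
    return embedding - strength * projection

emb = torch.randn(768)
concept = torch.randn(768)
nudged = nudge_away(emb, concept)
print(torch.dot(nudged, concept / concept.norm()))   # reduced overlap with the concept
```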
+
+
+
+
+ + ☆ VLM4Bio: A Benchmark Dataset to Evaluate Pretrained Vision-Language + Models for Trait Discovery from Biological Images + + +
+ Images are increasingly becoming the currency for documenting biodiversity on +the planet, providing novel opportunities for accelerating scientific +discoveries in the field of organismal biology, especially with the advent of +large vision-language models (VLMs). We ask if pre-trained VLMs can aid +scientists in answering a range of biologically relevant questions without any +additional fine-tuning. In this paper, we evaluate the effectiveness of 12 +state-of-the-art (SOTA) VLMs in the field of organismal biology using a novel +dataset, VLM4Bio, consisting of 469K question-answer pairs involving 30K images +from three groups of organisms: fishes, birds, and butterflies, covering five +biologically relevant tasks. We also explore the effects of applying prompting +techniques and tests for reasoning hallucination on the performance of VLMs, +shedding new light on the capabilities of current SOTA VLMs in answering +biologically relevant questions using images. The code and datasets for running +all the analyses reported in this paper can be found at +https://github.com/sammarfy/VLM4Bio. + +
+
+ comment: 36 pages, 37 figures, 7 tables +
+
+
+
+
+ + ☆ Does Data-Efficient Generalization Exacerbate Bias in Foundation Models? ECCV 2024 + + +
+ Foundation models have emerged as robust models with label efficiency in
+diverse domains. In medical imaging, these models contribute to the advancement
+of medical diagnoses due to the difficulty in obtaining labeled data. However,
+it is unclear whether using a large amount of unlabeled data, biased by the
+presence of sensitive attributes during pre-training, influences the fairness
+of the model. This research examines the bias in the Foundation model
+(RetFound) when it is fine-tuned on the Brazilian Multilabel Ophthalmological
+Dataset (BRSET), which has a different population than the pre-training
+dataset. The model evaluation, in comparison with supervised learning, shows
+that the Foundation Model has the potential to reduce the gap between the
+maximum and minimum AUC evaluations across gender and age groups. However,
+under data-efficient generalization, the model's bias increases as the amount
+of data decreases. These findings suggest that when deploying a Foundation
+Model in real-life scenarios with limited data, the possibility of fairness
+issues should be considered.
+
+
+ comment: Preprint of paper to be presented at Fairness and Ethics Towards + Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during + ECCV 2024 +
+
+
+
+
+ + ☆ Single-Photon 3D Imaging with Equi-Depth Photon Histograms + + +
+ Single-photon cameras present a promising avenue for high-resolution 3D +imaging. They have ultra-high sensitivity -- down to individual photons -- and +can record photon arrival times with extremely high (sub-nanosecond) +resolution. Single-photon 3D cameras estimate the round-trip time of a laser +pulse by forming equi-width (EW) histograms of detected photon timestamps. +Acquiring and transferring such EW histograms requires high bandwidth and +in-pixel memory, making SPCs less attractive in resource-constrained settings +such as mobile devices and AR/VR headsets. In this work we propose a 3D sensing +technique based on equi-depth (ED) histograms. ED histograms compress timestamp +data more efficiently than EW histograms, reducing the bandwidth requirement. +Moreover, to reduce the in-pixel memory requirement, we propose a lightweight +algorithm to estimate ED histograms in an online fashion without explicitly +storing the photon timestamps. This algorithm is amenable to future in-pixel +implementations. We propose algorithms that process ED histograms to perform 3D +computer-vision tasks of estimating scene distance maps and performing visual +odometry under challenging conditions such as high ambient light. Our work +paves the way towards lower bandwidth and reduced in-pixel memory requirements +for SPCs, making them attractive for resource-constrained 3D vision +applications. Project page: +$\href{https://www.computational.camera/pedh}{https://www.computational.camera/pedh}$ + +
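The online, memory-light estimation of equi-depth bin boundaries can be illustrated with a standard stochastic quantile-tracking update; this generic sketch is not the in-pixel algorithm proposed in the paper:

```python
import numpy as np

def online_equi_depth_edges(timestamps, n_bins=8, lr=0.01):
    """Track the quantile boundaries of an equi-depth histogram from a stream
    of photon timestamps without storing them, using a stochastic
    approximation update. Generic sketch, not the paper's in-pixel algorithm."""
    qs = np.linspace(0, 1, n_bins + 1)[1:-1]                # interior quantile levels
    edges = np.full(qs.shape, np.median(timestamps[:16]))   # rough initialization
    for t in timestamps:
        # move each edge up if the sample exceeds it, down otherwise
        edges += lr * ((t > edges).astype(float) - (1.0 - qs))
    return np.sort(edges)

rng = np.random.default_rng(0)
stream = rng.exponential(scale=5.0, size=20000)             # synthetic arrival times
print(online_equi_depth_edges(stream))
```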
+
+
+
+
+ + ☆ Using Backbone Foundation Model for Evaluating Fairness in Chest + Radiography Without Demographic Data MICCAI 2024 + + +
+ Ensuring consistent performance across diverse populations and incorporating +fairness into machine learning models are crucial for advancing medical image +diagnostics and promoting equitable healthcare. However, many databases do not +provide protected attributes or contain unbalanced representations of +demographic groups, complicating the evaluation of model performance across +different demographics and the application of bias mitigation techniques that +rely on these attributes. This study aims to investigate the effectiveness of +using the backbone of Foundation Models as an embedding extractor for creating +groups that represent protected attributes, such as gender and age. We propose +utilizing these groups in different stages of bias mitigation, including +pre-processing, in-processing, and evaluation. Using databases in and +out-of-distribution scenarios, it is possible to identify that the method can +create groups that represent gender in both databases and reduce in 4.44% the +difference between the gender attribute in-distribution and 6.16% in +out-of-distribution. However, the model lacks robustness in handling age +attributes, underscoring the need for more fundamentally fair and robust +Foundation models. These findings suggest a role in promoting fairness +assessment in scenarios where we lack knowledge of attributes, contributing to +the development of more equitable medical diagnostics. + +
+
+ comment: Preprint of paper to be presented at Fairness of AI in Medical + Imaging (FAIMI) during MICCAI 2024 +
+
+
+
+
+ + ☆ ChartEye: A Deep Learning Framework for Chart Information Extraction + + +
+ The widespread use of charts and infographics as a means of data
+visualization in various domains has inspired recent research in automated
+chart understanding. However, information extraction from chart images is a
+complex multi-task process due to style variations and, as a consequence, it
+is challenging to design an end-to-end system. In this study, we propose a deep
+learning-based framework that provides a solution for key steps in the chart
+information extraction pipeline. The proposed framework utilizes hierarchical
+vision transformers for the tasks of chart-type and text-role classification,
+and YOLOv7 for text detection. The detected text is then enhanced using Super
+Resolution Generative Adversarial Networks to improve the recognition output of
+the OCR. Experimental results on a benchmark dataset show that our proposed
+framework achieves excellent performance at every stage with F1-scores of 0.97
+for chart-type classification, 0.91 for text-role classification, and a mean
+Average Precision of 0.95 for text detection.
+
+
+ comment: 8 Pages, and 11 Figures +
+
+
+
+
+ + ☆ Alternating Direction Method of Multipliers for Negative Binomial Model + with The Weighted Difference of Anisotropic and Isotropic Total Variation ICME + + +
+ In many applications such as medical imaging, the measurement data represent +counts of photons hitting a detector. Such counts in low-photon settings are +often modeled using a Poisson distribution. However, this model assumes that +the mean and variance of the signal's noise distribution are equal. For +overdispersed data where the variance is greater than the mean, the negative +binomial distribution is a more appropriate statistical model. In this paper, +we propose an optimization approach for recovering images corrupted by +overdispersed Poisson noise. In particular, we incorporate a weighted +anisotropic-isotropic total variation regularizer, which avoids staircasing +artifacts that are introduced by a regular total variation penalty. We use an +alternating direction method of multipliers, where each subproblem has a +closed-form solution. Numerical experiments demonstrate the effectiveness of +our proposed approach, especially in very photon-limited settings. + +
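For reference, the regularizer named in the title is usually written as a weighted difference of the anisotropic and isotropic TV terms; one common form of the resulting objective (an illustrative reconstruction with $A$ the forward operator, $y$ the observed counts, and $r$ the dispersion, not necessarily the paper's exact formulation) is:

```latex
% Illustrative objective: negative-binomial data fidelity plus the weighted
% difference of anisotropic and isotropic total variation.
\min_{u \ge 0} \;
  \sum_i \Big[ (y_i + r)\,\log\!\big((Au)_i + r\big) - y_i \,\log (Au)_i \Big]
  \; + \; \lambda \Big( \|\nabla_x u\|_1 + \|\nabla_y u\|_1
        \; - \; \alpha \,\big\| \sqrt{ |\nabla_x u|^2 + |\nabla_y u|^2 } \,\big\|_1 \Big)
```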
+
+ comment: 6 pages, Accepted by the IEEE International Conference on Multimedia + and Expo (ICME) +
+
+
+
+
+ + ☆ Negative Binomial Matrix Completion SP + + +
+ Matrix completion focuses on recovering missing or incomplete information in +matrices. This problem arises in various applications, including image +processing and network analysis. Previous research proposed Poisson matrix +completion for count data with noise that follows a Poisson distribution, which +assumes that the mean and variance are equal. Since overdispersed count data, +whose variance is greater than the mean, is more likely to occur in realistic +settings, we assume that the noise follows the negative binomial (NB) +distribution, which can be more general than the Poisson distribution. In this +paper, we introduce NB matrix completion by proposing a nuclear-norm +regularized model that can be solved by proximal gradient descent. In our +experiments, we demonstrate that the NB model outperforms Poisson matrix +completion in various noise and missing data settings on real data. + +
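As a rough illustration of the model class described above, a nuclear-norm regularized completion under a negative-binomial likelihood can be solved with proximal gradient descent; the sketch below is a toy implementation (step size, dispersion, floor, and initialization are assumptions), not the authors' code:

```python
import numpy as np

def svt(X, tau):
    """Singular value thresholding: the proximal operator of the nuclear norm."""
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    return (U * np.maximum(s - tau, 0.0)) @ Vt

def nb_matrix_completion(Y, mask, lam=1.0, r=5.0, step=0.05, iters=300):
    """Proximal gradient descent for nuclear-norm regularized matrix completion
    under a negative-binomial observation model with dispersion r. Illustrative
    settings only."""
    M = np.maximum(Y * mask, 1.0)                      # positive initial mean estimate
    for _ in range(iters):
        grad = mask * ((Y + r) / (M + r) - Y / M)      # d/dM of the NB negative log-likelihood
        M = svt(M - step * grad, step * lam)           # gradient step + nuclear-norm prox
        M = np.maximum(M, 0.1)                         # floor keeps the likelihood well-defined
    return M

rng = np.random.default_rng(0)
Y = rng.poisson(4.0, size=(30, 30)).astype(float)
mask = (rng.random(Y.shape) < 0.6).astype(float)
print(nb_matrix_completion(Y, mask).shape)
```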
+
+ comment: 6 pages, Accepted by the IEEE International Workshop on Machine + Learning for Signal Processing (MLSP) +
+
+
+
+
+ + ☆ 3D Reconstruction with Spatial Memory + + +
+ We present Spann3R, a novel approach for dense 3D reconstruction from ordered +or unordered image collections. Built on the DUSt3R paradigm, Spann3R uses a +transformer-based architecture to directly regress pointmaps from images +without any prior knowledge of the scene or camera parameters. Unlike DUSt3R, +which predicts per image-pair pointmaps each expressed in its local coordinate +frame, Spann3R can predict per-image pointmaps expressed in a global coordinate +system, thus eliminating the need for optimization-based global alignment. The +key idea of Spann3R is to manage an external spatial memory that learns to keep +track of all previous relevant 3D information. Spann3R then queries this +spatial memory to predict the 3D structure of the next frame in a global +coordinate system. Taking advantage of DUSt3R's pre-trained weights, and +further fine-tuning on a subset of datasets, Spann3R shows competitive +performance and generalization ability on various unseen datasets and can +process ordered image collections in real time. Project page: +\url{https://hengyiwang.github.io/projects/spanner} + +
+
+ comment: Project page: \url{https://hengyiwang.github.io/projects/spanner} +
+
+
+
+
+ + ♻ ☆ HER2 and FISH Status Prediction in Breast Biopsy H&E-Stained Images + Using Deep Learning + + +
+ The current standard for detecting human epidermal growth factor receptor 2 +(HER2) status in breast cancer patients relies on HER2 amplification, +identified through fluorescence in situ hybridization (FISH) or +immunohistochemistry (IHC). However, hematoxylin and eosin (H\&E) tumor stains +are more widely available, and accurately predicting HER2 status using H\&E +could reduce costs and expedite treatment selection. Deep Learning algorithms +for H&E have shown effectiveness in predicting various cancer features and +clinical outcomes, including moderate success in HER2 status prediction. In +this work, we employed a customized weak supervision classification technique +combined with MoCo-v2 contrastive learning to predict HER2 status. We trained +our pipeline on 182 publicly available H&E Whole Slide Images (WSIs) from The +Cancer Genome Atlas (TCGA), for which annotations by the pathology team at Yale +School of Medicine are publicly available. Our pipeline achieved an Area Under +the Curve (AUC) of 0.85 across four different test folds. Additionally, we +tested our model on 44 H&E slides from the TCGA-BRCA dataset, which had an HER2 +score of 2+ and included corresponding HER2 status and FISH test results. These +cases are considered equivocal for IHC, requiring an expensive FISH test on +their IHC slides for disambiguation. Our pipeline demonstrated an AUC of 0.81 +on these challenging H&E slides. Reducing the need for FISH test can have +significant implications in cancer treatment equity for underserved +populations. + +
+
+
+
+
+ + ♻ ☆ SCP: Soft Conditional Prompt Learning for Aerial Video Action + Recognition IROS2024 + + +
+ We present a new learning approach, Soft Conditional Prompt Learning (SCP), +which leverages the strengths of prompt learning for aerial video action +recognition. Our approach is designed to predict the action of each agent by +helping the models focus on the descriptions or instructions associated with +actions in the input videos for aerial/robot visual perception. Our formulation +supports various prompts, including learnable prompts, auxiliary visual +information, and large vision models to improve the recognition performance. We +present a soft conditional prompt method that learns to dynamically generate +prompts from a pool of prompt experts under different video inputs. By sharing +the same objective with the task, our proposed SCP can optimize prompts that +guide the model's predictions while explicitly learning input-invariant (prompt +experts pool) and input-specific (data-dependent) prompt knowledge. In +practice, we observe a 3.17-10.2% accuracy improvement on the aerial video +datasets (Okutama, NECDrone), which consist of scenes with single-agent and +multi-agent actions. We further evaluate our approach on ground camera videos +to verify the effectiveness and generalization and achieve a 1.0-3.6% +improvement on dataset SSV2. We integrate our method into the ROS2 as well. + +
+
+ comment: IROS2024 +
+
+
+
+
+ + ♻ ☆ Examining Pathological Bias in a Generative Adversarial Network + Discriminator: A Case Study on a StyleGAN3 Model + + +
+ Generative adversarial networks (GANs) generate photorealistic faces that are +often indistinguishable by humans from real faces. While biases in machine +learning models are often assumed to be due to biases in training data, we find +pathological internal color and luminance biases in the discriminator of a +pre-trained StyleGAN3-r model that are not explicable by the training data. We +also find that the discriminator systematically stratifies scores by both +image- and face-level qualities and that this disproportionately affects images +across gender, race, and other categories. We examine axes common in research +on stereotyping in social psychology. + +
+
+
+
+
+ + ♻ ☆ Infusion: internal diffusion for inpainting of dynamic textures and + complex motion + + +
+ Video inpainting is the task of filling a region in a video in a visually
+convincing manner. It is very challenging due to the high dimensionality of the
+data and the temporal consistency required for obtaining convincing results.
+Recently, diffusion models have shown impressive results in modeling complex
+data distributions, including images and videos. Such models remain nonetheless
+very expensive to train and to perform inference with, which strongly reduces
+their applicability to videos and yields unreasonable computational loads. We
+show that in the case of video inpainting, thanks to the highly auto-similar
+nature of videos, the training data of a diffusion model can be restricted to
+the input video and still produce very satisfying results. This leads us to
+adopt an internal learning approach, which also allows us to greatly reduce the
+neural network size, to about three orders of magnitude smaller than current
+diffusion models used for image inpainting. We also introduce a new method for
+efficient training and inference of diffusion models in the context of internal
+learning, by splitting the diffusion process into different learning intervals
+corresponding to different noise levels of the diffusion process. To the best
+of our knowledge, this is the first video inpainting method based purely on
+diffusion. Other methods require additional components such as optical flow
+estimation, which limits their performance in the case of dynamic textures and
+complex motions. We show qualitative and quantitative results, demonstrating
+that our method reaches state-of-the-art performance in the case of dynamic
+textures and complex dynamic backgrounds.
+
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Provable Probabilistic Imaging using Score-Based Generative Priors + + +
+ Estimating high-quality images while also quantifying their uncertainty are +two desired features in an image reconstruction algorithm for solving ill-posed +inverse problems. In this paper, we propose plug-and-play Monte Carlo (PMC) as +a principled framework for characterizing the space of possible solutions to a +general inverse problem. PMC is able to incorporate expressive score-based +generative priors for high-quality image reconstruction while also performing +uncertainty quantification via posterior sampling. In particular, we develop +two PMC algorithms that can be viewed as the sampling analogues of the +traditional plug-and-play priors (PnP) and regularization by denoising (RED) +algorithms. To improve the sampling efficiency, we introduce weighted annealing +into these PMC algorithms, further developing two additional annealed PMC +algorithms (APMC). We establish a theoretical analysis for characterizing the +convergence behavior of PMC algorithms. Our analysis provides non-asymptotic +stationarity guarantees in terms of the Fisher information, fully compatible +with the joint presence of weighted annealing, potentially non-log-concave +likelihoods, and imperfect score networks. We demonstrate the performance of +the PMC algorithms on multiple representative inverse problems with both linear +and nonlinear forward models. Experimental results show that PMC significantly +improves reconstruction quality and enables high-fidelity uncertainty +quantification. + +
+
+
+
+
+ + ♻ ☆ Imperceptible Protection against Style Imitation from Diffusion Models + + +
+ Recent progress in diffusion models has profoundly enhanced the fidelity of +image generation, but it has raised concerns about copyright infringements. +While prior methods have introduced adversarial perturbations to prevent style +imitation, most are accompanied by the degradation of artworks' visual quality. +Recognizing the importance of maintaining this, we introduce a visually +improved protection method while preserving its protection capability. To this +end, we devise a perceptual map to highlight areas sensitive to human eyes, +guided by instance-aware refinement, which refines the protection intensity +accordingly. We also introduce a difficulty-aware protection by predicting how +difficult the artwork is to protect and dynamically adjusting the intensity +based on this. Lastly, we integrate a perceptual constraints bank to further +improve the imperceptibility. Results show that our method substantially +elevates the quality of the protected image without compromising on protection +efficacy. + +
+
+
+
+
+ + ♻ ☆ u-LLaVA: Unifying Multi-Modal Tasks via Large Language Model + + +
+ Recent advancements in multi-modal large language models (MLLMs) have led to +substantial improvements in visual understanding, primarily driven by +sophisticated modality alignment strategies. However, predominant approaches +prioritize global or regional comprehension, with less focus on fine-grained, +pixel-level tasks. To address this gap, we introduce u-LLaVA, an innovative +unifying multi-task framework that integrates pixel, regional, and global +features to refine the perceptual faculties of MLLMs. We commence by leveraging +an efficient modality alignment approach, harnessing both image and video +datasets to bolster the model's foundational understanding across diverse +visual contexts. Subsequently, a joint instruction tuning method with +task-specific projectors and decoders for end-to-end downstream training is +presented. Furthermore, this work contributes a novel mask-based multi-task +dataset comprising 277K samples, crafted to challenge and assess the +fine-grained perception capabilities of MLLMs. The overall framework is simple, +effective, and achieves state-of-the-art performance across multiple +benchmarks. We also make our model, data, and code publicly accessible at +https://github.com/OPPOMKLab/u-LLaVA. + +
+
+
+
+
+ + ♻ ☆ Automated Real-World Sustainability Data Generation from Images of + Buildings + + +
+ When data on building features is unavailable, the task of determining how to +improve that building in terms of carbon emissions becomes infeasible. We show +that from only a set of images, a Large Language Model with appropriate prompt +engineering and domain knowledge can successfully estimate a range of building +features relevant for sustainability calculations. We compare our novel +image-to-data method with a ground truth comprising real building data for 47 +apartments and achieve accuracy better than a human performing the same task. +We also demonstrate that the method can generate tailored recommendations to +the owner on how best to improve their properties and discuss methods to scale +the approach. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ When Multi-Task Learning Meets Partial Supervision: A Computer Vision + Review + + +
+ Multi-Task Learning (MTL) aims to learn multiple tasks simultaneously while +exploiting their mutual relationships. By using shared resources to +simultaneously calculate multiple outputs, this learning paradigm has the +potential to have lower memory requirements and inference times compared to the +traditional approach of using separate methods for each task. Previous work in +MTL has mainly focused on fully-supervised methods, as task relationships can +not only be leveraged to lower the level of data-dependency of those methods +but they can also improve performance. However, MTL introduces a set of +challenges due to a complex optimisation scheme and a higher labeling +requirement. This review focuses on how MTL could be utilised under different +partial supervision settings to address these challenges. First, this review +analyses how MTL traditionally uses different parameter sharing techniques to +transfer knowledge in between tasks. Second, it presents the different +challenges arising from such a multi-objective optimisation scheme. Third, it +introduces how task groupings can be achieved by analysing task relationships. +Fourth, it focuses on how partially supervised methods applied to MTL can +tackle the aforementioned challenges. Lastly, this review presents the +available datasets, tools and benchmarking results of such methods. + +
+
+ comment: Accepted by Proceedings of the IEEE +
+
+
+
+
+ + ♻ ☆ Research on the Spatial Data Intelligent Foundation Model + + +
+ This report focuses on spatial data intelligent large models, delving into +the principles, methods, and cutting-edge applications of these models. It +provides an in-depth discussion on the definition, development history, current +status, and trends of spatial data intelligent large models, as well as the +challenges they face. The report systematically elucidates the key technologies +of spatial data intelligent large models and their applications in urban +environments, aerospace remote sensing, geography, transportation, and other +scenarios. Additionally, it summarizes the latest application cases of spatial +data intelligent large models in themes such as urban development, multimodal +systems, remote sensing, smart transportation, and resource environments. +Finally, the report concludes with an overview and outlook on the development +prospects of spatial data intelligent large models. + +
+
+ comment: V1 and V2 are in Chinese language, other versions are in English +
+
+
+
+
+ + ♻ ☆ FRAME: A Modular Framework for Autonomous Map Merging: Advancements in + the Field + + +
+ In this article, a novel approach for merging 3D point cloud maps in the +context of egocentric multi-robot exploration is presented. Unlike traditional +methods, the proposed approach leverages state-of-the-art place recognition and +learned descriptors to efficiently detect overlap between maps, eliminating the +need for the time-consuming global feature extraction and feature matching +process. The estimated overlapping regions are used to calculate a homogeneous +rigid transform, which serves as an initial condition for the GICP point cloud +registration algorithm to refine the alignment between the maps. The advantages +of this approach include faster processing time, improved accuracy, and +increased robustness in challenging environments. Furthermore, the +effectiveness of the proposed framework is successfully demonstrated through +multiple field missions of robot exploration in a variety of different +underground environments. + +
+
+ comment: 28 pages, 24 figures. Accepted to the IEEE Transactions on Field + Robotics +
+
+
+
+
+ + ♻ ☆ Re-Nerfing: Improving Novel View Synthesis through Novel View Synthesis + + +
+ Recent neural rendering and reconstruction techniques, such as NeRFs or +Gaussian Splatting, have shown remarkable novel view synthesis capabilities but +require hundreds of images of the scene from diverse viewpoints to render +high-quality novel views. With fewer images available, these methods start to +fail since they can no longer correctly triangulate the underlying 3D geometry +and converge to a non-optimal solution. These failures can manifest as floaters +or blurry renderings in sparsely observed areas of the scene. In this paper, we +propose Re-Nerfing, a simple and general add-on approach that leverages novel +view synthesis itself to tackle this problem. Using an already trained NVS +method, we render novel views between existing ones and augment the training +data to optimize a second model. This introduces additional multi-view +constraints and allows the second model to converge to a better solution. With +Re-Nerfing we achieve significant improvements upon multiple pipelines based on +NeRF and Gaussian-Splatting in sparse view settings of the mip-NeRF 360 and +LLFF datasets. Notably, Re-Nerfing does not require prior knowledge or extra +supervision signals, making it a flexible and practical add-on. + +
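The add-on procedure amounts to a simple self-augmentation loop; the sketch below is schematic, with `train_fn` and `render_fn` as placeholder callables (assumptions) and naive pose interpolation standing in for proper SE(3) interpolation:

```python
import numpy as np

def re_nerfing(views, train_fn, render_fn, per_gap=3):
    """Schematic self-augmentation loop. `views` is a list of (pose, image)
    pairs; `train_fn(views) -> model` and `render_fn(model, pose) -> image` are
    placeholder callables. A first model is trained on the real views,
    synthetic views are rendered at poses interpolated between neighbouring
    cameras, and a second model is trained on the augmented set."""
    model_1 = train_fn(views)
    augmented = list(views)
    for (pose_a, _), (pose_b, _) in zip(views[:-1], views[1:]):
        for t in np.linspace(0.0, 1.0, per_gap + 2)[1:-1]:
            pose = (1 - t) * np.asarray(pose_a) + t * np.asarray(pose_b)  # naive lerp
            augmented.append((pose, render_fn(model_1, pose)))
    return train_fn(augmented)      # the second model sees extra multi-view constraints
```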
+
+ comment: Code will be released upon acceptance +
+
+
+
+
+ + ♻ ☆ FAST-LIVO2: Fast, Direct LiDAR-Inertial-Visual Odometry + + +
+ This paper proposes FAST-LIVO2: a fast, direct LiDAR-inertial-visual odometry +framework to achieve accurate and robust state estimation in SLAM tasks and +provide great potential in real-time, onboard robotic applications. FAST-LIVO2 +fuses the IMU, LiDAR and image measurements efficiently through an ESIKF. To +address the dimension mismatch between the heterogeneous LiDAR and image +measurements, we use a sequential update strategy in the Kalman filter. To +enhance the efficiency, we use direct methods for both the visual and LiDAR +fusion, where the LiDAR module registers raw points without extracting edge or +plane features and the visual module minimizes direct photometric errors +without extracting ORB or FAST corner features. The fusion of both visual and +LiDAR measurements is based on a single unified voxel map where the LiDAR +module constructs the geometric structure for registering new LiDAR scans and +the visual module attaches image patches to the LiDAR points. To enhance the +accuracy of image alignment, we use plane priors from the LiDAR points in the +voxel map (and even refine the plane prior) and update the reference patch +dynamically after new images are aligned. Furthermore, to enhance the +robustness of image alignment, FAST-LIVO2 employs an on-demanding raycast +operation and estimates the image exposure time in real time. Lastly, we detail +three applications of FAST-LIVO2: UAV onboard navigation demonstrating the +system's computation efficiency for real-time onboard navigation, airborne +mapping showcasing the system's mapping accuracy, and 3D model rendering +(mesh-based and NeRF-based) underscoring the suitability of our reconstructed +dense map for subsequent rendering tasks. We open source our code, dataset and +application on GitHub to benefit the robotics community. + +
+
+ comment: 30 pages, 31 figures, due to the limitation that 'The abstract field + cannot exceed 1,920 characters', the abstract presented here is shorter than + the one in the PDF file +
+
+
+
+
+ + ♻ ☆ Automated Label Unification for Multi-Dataset Semantic Segmentation with + GNNs + + +
+ Deep supervised models possess significant capability to assimilate extensive +training data, thereby presenting an opportunity to enhance model performance +through training on multiple datasets. However, conflicts arising from +different label spaces among datasets may adversely affect model performance. +In this paper, we propose a novel approach to automatically construct a unified +label space across multiple datasets using graph neural networks. This enables +semantic segmentation models to be trained simultaneously on multiple datasets, +resulting in performance improvements. Unlike existing methods, our approach +facilitates seamless training without the need for additional manual +reannotation or taxonomy reconciliation. This significantly enhances the +efficiency and effectiveness of multi-dataset segmentation model training. The +results demonstrate that our method significantly outperforms other +multi-dataset training methods when trained on seven datasets simultaneously, +and achieves state-of-the-art performance on the WildDash 2 benchmark. + +
+
+
+
+
+ + ♻ ☆ AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and + Results + + +
+ Video quality assessment (VQA) is a crucial task in the development of video +compression standards, as it directly impacts the viewer experience. This paper +presents the results of the Compressed Video Quality Assessment challenge, held +in conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV +2024. The challenge aimed to evaluate the performance of VQA methods on a +diverse dataset of 459 videos, encoded with 14 codecs of various compression +standards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a +comprehensive collection of compression artifacts. To measure the methods +performance, we employed traditional correlation coefficients between their +predictions and subjective scores, which were collected via large-scale +crowdsourced pairwise human comparisons. For training purposes, participants +were provided with the Compressed Video Quality Assessment Dataset (CVQAD), a +previously developed dataset of 1022 videos. Up to 30 participating teams +registered for the challenge, while we report the results of 6 teams, which +submitted valid final solutions and code for reproducing the results. Moreover, +we calculated and present the performance of state-of-the-art VQA methods on +the developed dataset, providing a comprehensive benchmark for future research. +The dataset, results, and online leaderboard are publicly available at +https://challenges.videoprocessing.ai/challenges/compressedvideo-quality-assessment.html. + +
+
+
+
+
+ + ♻ ☆ DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping + + +
+ Recently, significant progress has been achieved in sensing real large-scale
+outdoor 3D environments, particularly by using modern acquisition equipment
+such as LiDAR sensors. Unfortunately, they are fundamentally limited in their
+ability to produce dense, complete 3D scenes. To address this issue, recent
+learning-based methods integrate neural implicit representations and
+optimizable feature grids to approximate surfaces of 3D scenes. However,
+naively fitting samples along raw LiDAR rays leads to noisy 3D mapping results
+due to the nature of sparse, conflicting LiDAR measurements. In this work, we
+instead depart from fitting LiDAR data exactly, letting the network optimize a
+non-metric monotonic implicit field defined in 3D space. To fit our field, we
+design a learning system integrating a monotonicity loss that enables
+optimizing neural monotonic fields and leverages recent progress in large-scale
+3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as
+captured by multiple quantitative and perceptual measures and visual results
+obtained for the Mai City, Newer College, and KITTI benchmarks. The code of our
+approach will be made publicly available.
+
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ FERGI: Automatic Annotation of User Preferences for Text-to-Image + Generation from Spontaneous Facial Expression Reaction + + +
+ Researchers have proposed to use data of human preference feedback to +fine-tune text-to-image generative models. However, the scalability of human +feedback collection has been limited by its reliance on manual annotation. +Therefore, we develop and test a method to automatically score user preferences +from their spontaneous facial expression reaction to the generated images. We +collect a dataset of Facial Expression Reaction to Generated Images (FERGI) and +show that the activations of multiple facial action units (AUs) are highly +correlated with user evaluations of the generated images. We develop an FAU-Net +(Facial Action Units Neural Network), which receives inputs from an AU +estimation model, to automatically score user preferences for text-to-image +generation based on their facial expression reactions, which is complementary +to the pre-trained scoring models based on the input text prompts and generated +images. Integrating our FAU-Net valence score with the pre-trained scoring +models improves their consistency with human preferences. This method of +automatic annotation with facial expression analysis can be potentially +generalized to other generation tasks. The code is available at +https://github.com/ShuangquanFeng/FERGI, and the dataset is also available at +the same link for research purposes. + +
+
+
+
+
+ + ♻ ☆ CMTA: Cross-Modal Temporal Alignment for Event-guided Video Deblurring ECCV2024 + + +
+ Video deblurring aims to enhance the quality of restored results in +motion-blurred videos by effectively gathering information from adjacent video +frames to compensate for the insufficient data in a single blurred frame. +However, when faced with consecutively severe motion blur situations, +frame-based video deblurring methods often fail to find accurate temporal +correspondence among neighboring video frames, leading to diminished +performance. To address this limitation, we aim to solve the video deblurring +task by leveraging an event camera with micro-second temporal resolution. To +fully exploit the dense temporal resolution of the event camera, we propose two +modules: 1) Intra-frame feature enhancement operates within the exposure time +of a single blurred frame, iteratively enhancing cross-modality features in a +recurrent manner to better utilize the rich temporal information of events, 2) +Inter-frame temporal feature alignment gathers valuable long-range temporal +information to target frames, aggregating sharp features leveraging the +advantages of the events. In addition, we present a novel dataset composed of +real-world blurred RGB videos, corresponding sharp videos, and event data. This +dataset serves as a valuable resource for evaluating event-guided deblurring +methods. We demonstrate that our proposed methods outperform state-of-the-art +frame-based and event-based motion deblurring methods through extensive +experiments conducted on both synthetic and real-world deblurring datasets. The +code and dataset are available at https://github.com/intelpro/CMTA. + +
+
+ comment: Accepted in ECCV2024 +
+
+
+
+
+ + ♻ ☆ Training-Free Action Recognition and Goal Inference with Dynamic Frame + Selection + + +
+ We introduce VidTFS, a Training-free, open-vocabulary video goal and action +inference framework that combines the frozen vision foundational model (VFM) +and large language model (LLM) with a novel dynamic Frame Selection module. Our +experiments demonstrate that the proposed frame selection module improves the +performance of the framework significantly. We validate the performance of the +proposed VidTFS on four widely used video datasets, including CrossTask, COIN, +UCF101, and ActivityNet, covering goal inference and action recognition tasks +under open-vocabulary settings without requiring any training or fine-tuning. +The results show that VidTFS outperforms pretrained and instruction-tuned +multimodal language models that directly stack LLM and VFM for downstream video +inference tasks. Our VidTFS with its adaptability shows the future potential +for generalizing to new training-free video inference tasks. + +
+
+
+
+
+ + ♻ ☆ Unrecognizable Yet Identifiable: Image Distortion with Preserved + Embeddings + + +
+ Biometric authentication systems play a crucial role in modern security
+systems. However, maintaining the balance between the privacy and integrity of
+stored biometric derivative data while achieving high recognition accuracy is
+often challenging. Addressing this issue, we introduce an innovative image
+transformation technique that effectively renders facial images unrecognizable
+to the eye while maintaining their identifiability by neural network models,
+which allows the distorted photo version to be stored for further verification.
+While initially intended for biometrics systems, the proposed methodology can
+be used in various artificial intelligence applications to distort the visual
+data and keep the derived features close. By experimenting with the widely used
+LFW and MNIST datasets, we show that it is possible to build a distortion that
+changes the image content by more than 70% while maintaining the same
+recognition accuracy. We compare our method with previous state-of-the-art
+approaches. We publicly release the source code.
+
+
+
+
+
+ + ♻ ☆ How Physics and Background Attributes Impact Video Transformers in + Robotic Manipulation: A Case Study on Planar Pushing IROS 2024 + + +
+ As model and dataset sizes continue to scale in robot learning, the need to +understand how the composition and properties of a dataset affect model +performance becomes increasingly urgent to ensure cost-effective data +collection and model performance. In this work, we empirically investigate how +physics attributes (color, friction coefficient, shape) and scene background +characteristics, such as the complexity and dynamics of interactions with +background objects, influence the performance of Video Transformers in +predicting planar pushing trajectories. We investigate three primary questions: +How do physics attributes and background scene characteristics influence model +performance? What kind of changes in attributes are most detrimental to model +generalization? What proportion of fine-tuning data is required to adapt models +to novel scenarios? To facilitate this research, we present +CloudGripper-Push-1K, a large real-world vision-based robot pushing dataset +comprising 1278 hours and 460,000 videos of planar pushing interactions with +objects with different physics and background attributes. We also propose Video +Occlusion Transformer (VOT), a generic modular video-transformer-based +trajectory prediction framework which features 3 choices of 2D-spatial encoders +as the subject of our case study. The dataset and source code are available at +https://cloudgripper.org. + +
+
+ comment: IEEE/RSJ IROS 2024 +
+
+
+
+
+ + ♻ ☆ Evidential Deep Partial Multi-View Classification With Discount Fusion + + +
+ Incomplete multi-view data classification poses significant challenges due to +the common issue of missing views in real-world scenarios. Despite +advancements, existing methods often fail to provide reliable predictions, +largely due to the uncertainty of missing views and the inconsistent quality of +imputed data. To tackle these problems, we propose a novel framework called +Evidential Deep Partial Multi-View Classification (EDP-MVC). Initially, we use +K-means imputation to address missing views, creating a complete set of +multi-view data. However, the potential conflicts and uncertainties within this +imputed data can affect the reliability of downstream inferences. To manage +this, we introduce a Conflict-Aware Evidential Fusion Network (CAEFN), which +dynamically adjusts based on the reliability of the evidence, ensuring +trustworthy discount fusion and producing reliable inference outcomes. +Comprehensive experiments on various benchmark datasets reveal EDP-MVC not only +matches but often surpasses the performance of state-of-the-art methods. + +
+
+ comment: Ongoing work. 13 pages, 3 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Urdu Digital Text Word Optical Character Recognition Using Permuted Auto + Regressive Sequence Modeling + + +
+ This research paper presents a novel word-level Optical Character Recognition +(OCR) model developed specifically for digital Urdu text. The model utilizes +transformer-based architectures and attention mechanisms to address the unique +challenges of recognizing Urdu script, which includes handling a diverse range +of text styles, fonts, and variations. Trained on a comprehensive dataset of +approximately 160,000 Urdu text images, the model incorporates a permuted +autoregressive sequence (PARSeq) architecture. This design enables +context-aware inference and iterative refinement by leveraging bidirectional +context information, significantly enhancing its ability to accurately +recognize Urdu characters. The model achieves a character error rate (CER) of +0.178, highlighting its effectiveness and precision in real-world applications. +However, the model has some limitations, such as difficulties with blurred +images, non-horizontal orientations, and the presence of trailing punctuation +marks, which can introduce noise into the recognition process. Addressing these +challenges will be a key focus of future work. Future research will aim to +further refine the model through advanced data augmentation techniques, +optimization of hyperparameters, and the integration of context-aware language +models, ultimately enhancing the model's performance and robustness in Urdu +text recognition. + +
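+ For reference, the character error rate (CER) quoted above is the edit
+distance between the predicted and reference strings, normalized by the
+reference length; a CER of 0.178 means roughly 17.8% of characters are wrong.
+A minimal implementation:
+
+def character_error_rate(reference: str, hypothesis: str) -> float:
+    m, n = len(reference), len(hypothesis)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+    for i in range(m + 1):
+        dp[i][0] = i
+    for j in range(n + 1):
+        dp[0][j] = j
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
+            dp[i][j] = min(dp[i - 1][j] + 1,        # deletion
+                           dp[i][j - 1] + 1,        # insertion
+                           dp[i - 1][j - 1] + cost) # substitution
+    return dp[m][n] / max(m, 1)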
+
+
+
+
+ + ♻ ☆ When ControlNet Meets Inexplicit Masks: A Case Study of ControlNet on + its Contour-following Ability + + +
+ ControlNet excels at creating content that closely matches precise contours
+in user-provided masks. However, when these masks contain noise, as frequently
+happens with non-expert users, the output includes unwanted artifacts. This
+paper first highlights the crucial role of controlling the impact of these
+inexplicit masks with diverse deterioration levels through in-depth analysis.
+Subsequently, to enhance controllability with inexplicit masks, an advanced
+Shape-aware ControlNet consisting of a deterioration estimator and a
+shape-prior modulation block is devised. The deterioration estimator assesses
+the deterioration factor of the provided masks. This factor is then utilized in
+the modulation block to adaptively modulate the model's contour-following
+ability, which helps it ignore the noisy parts of inexplicit masks. Extensive
+experiments prove its effectiveness in encouraging ControlNet to interpret
+inaccurate spatial conditions robustly rather than blindly following the given
+contours. We showcase application scenarios such as modifying shape priors and
+composable shape-controllable generation. Code will be released soon.
+
+
+ comment: Accepted by ACM-MM 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Computer Vision based Activity Recognition and Fall + Detection of the Elderly: a Systematic Review + + +
+ As the percentage of elderly people in developed countries increases
+worldwide, the healthcare of this group is a growing concern, especially with
+respect to preserving their autonomy. To this end, many studies have been
+published on Ambient Assisted Living (AAL) systems, which help address the
+concerns raised by elderly people living independently. In this study, a
+systematic review of the literature is presented on fall detection and Human
+Activity Recognition (HAR) for the elderly, the two main tasks for guaranteeing
+the safety of elderly people living alone. Reflecting the current trend in how
+these tasks are tackled, the review focuses on Deep Learning (DL) based
+approaches applied to computer vision data. In addition, resources such as DL
+models, datasets, and hardware (e.g., depth or thermal cameras) are gathered
+from the reviewed studies and provided for reference in future studies.
+Strengths and weaknesses of existing approaches are also discussed and, based
+on them, our recommendations for future work are provided.
+
+
+
+
+
+ + ♻ ☆ LLaVA-VSD: Large Language-and-Vision Assistant for Visual Spatial + Description + + +
+ Visual Spatial Description (VSD) aims to generate texts that describe the +spatial relationships between objects within images. Traditional visual spatial +relationship classification (VSRC) methods typically output the spatial +relationship between two objects in an image, often neglecting world knowledge +and lacking general language capabilities. In this paper, we propose a Large +Language-and-Vision Assistant for Visual Spatial Description, named LLaVA-VSD, +which is designed for the classification, description, and open-ended +description of visual spatial relationships. Specifically, the model first +constructs a VSD instruction-following dataset using given figure-caption pairs +for the three tasks. It then employs LoRA to fine-tune a Large Language and +Vision Assistant for VSD, which has 13 billion parameters and supports +high-resolution images. Finally, a large language model (Qwen-2) is used to +refine the generated sentences, enhancing their diversity and accuracy. +LLaVA-VSD demonstrates excellent multimodal conversational capabilities and can +follow open-ended instructions to assist with inquiries about object +relationships in images. + +
+
+ comment: We have discovered a significant error in the paper that affects the + main conclusions. To ensure the accuracy of our research, we have decided to + withdraw this paper and will resubmit it after making the necessary + corrections +
+
+
+
+
+ + ♻ ☆ Solid Waste Detection, Monitoring and Mapping in Remote Sensing Images: + A Survey + + +
+ The detection and characterization of illegal solid waste disposal sites are +essential for environmental protection, particularly for mitigating pollution +and health hazards. Improperly managed landfills contaminate soil and +groundwater via rainwater infiltration, posing threats to both animals and +humans. Traditional landfill identification approaches, such as on-site +inspections, are time-consuming and expensive. Remote sensing is a +cost-effective solution for the identification and monitoring of solid waste +disposal sites that enables broad coverage and repeated acquisitions over time. +Earth Observation (EO) satellites, equipped with an array of sensors and +imaging capabilities, have been providing high-resolution data for several +decades. Researchers proposed specialized techniques that leverage remote +sensing imagery to perform a range of tasks such as waste site detection, +dumping site monitoring, and assessment of suitable locations for new +landfills. This review aims to provide a detailed illustration of the most +relevant proposals for the detection and monitoring of solid waste sites by +describing and comparing the approaches, the implemented techniques, and the +employed data. Furthermore, since the data sources are of the utmost importance +for developing an effective solid waste detection model, a comprehensive +overview of the satellites and publicly available data sets is presented. +Finally, this paper identifies the open issues in the state-of-the-art and +discusses the relevant research directions for reducing the costs and improving +the effectiveness of novel solid waste detection methods. + +
+
+
+
+
+ + ♻ ☆ TokenPacker: Efficient Visual Projector for Multimodal LLM + + +
+ The visual projector serves as an essential bridge between the visual encoder
+and the Large Language Model (LLM) in a Multimodal LLM (MLLM). Typically, MLLMs
+adopt a simple MLP to preserve all visual contexts via a one-to-one
+transformation. However, the visual tokens are redundant and grow considerably
+when dealing with high-resolution images, significantly impairing the
+efficiency of MLLMs. Some recent works have introduced resamplers or
+abstractors to reduce the number of resulting visual tokens. Unfortunately,
+they fail to capture finer details and undermine the visual reasoning
+capabilities of MLLMs. In this work, we propose a novel visual projector, which
+adopts a coarse-to-fine scheme to inject enriched characteristics into the
+condensed visual tokens. Specifically, we first interpolate the visual features
+into a low-resolution point query, providing the overall visual representation
+as the foundation. Then, we introduce a region-to-point injection module that
+utilizes high-resolution, multi-level region-based cues as fine-grained
+reference keys and values, allowing them to be fully absorbed within the
+corresponding local context region. This step effectively updates the coarse
+point query, transforming it into an enriched one for subsequent LLM reasoning.
+Extensive experiments demonstrate that our approach compresses the visual
+tokens by 75%~89%, while achieving comparable or even better performance across
+diverse benchmarks with significantly higher efficiency. The source code can be
+found at https://github.com/CircleRadon/TokenPacker.
+
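+ The coarse-to-fine compression described above can be pictured with a small
+module: pool the high-resolution visual tokens into a few coarse queries, then
+let those queries cross-attend to the original fine-grained tokens. This is a
+rough, hedged sketch of the general pattern, not TokenPacker's actual
+architecture; all dimensions and the pooling factor are assumptions.
+
+import torch
+import torch.nn as nn
+
+class CoarseToFineProjector(nn.Module):
+    def __init__(self, dim=1024, pool=3):
+        super().__init__()
+        self.pool = nn.AvgPool2d(pool)                       # e.g. 24x24 -> 8x8 tokens
+        self.attn = nn.MultiheadAttention(dim, num_heads=8, batch_first=True)
+        self.out = nn.Linear(dim, dim)
+
+    def forward(self, vis_tokens, grid_size):
+        """vis_tokens: (B, grid_size*grid_size, dim) high-resolution visual features."""
+        b, n, d = vis_tokens.shape
+        grid = vis_tokens.transpose(1, 2).reshape(b, d, grid_size, grid_size)
+        coarse = self.pool(grid).flatten(2).transpose(1, 2)  # (B, n/pool^2, dim) queries
+        # Fine-grained tokens act as keys/values that enrich the coarse point queries.
+        enriched, _ = self.attn(coarse, vis_tokens, vis_tokens)
+        return self.out(coarse + enriched)                   # condensed tokens for the LLM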
+
+ comment: 16 pages, Codes:https://github.com/CircleRadon/TokenPacker +
+
+
+
+
+ + ♻ ☆ Unveiling the Human-like Similarities of Automatic Facial Expression + Recognition: An Empirical Exploration through Explainable AI + + +
+ Facial expression recognition is vital for human behavior analysis, and deep
+learning has enabled models that can outperform humans. However, it is unclear
+how closely they mimic human processing. This study aims to explore the
+similarity between deep neural networks and human perception by comparing
+twelve different networks, including both general object classifiers and
+FER-specific models. We employ an innovative global explainable AI method to
+generate heatmaps, revealing crucial facial regions for the twelve networks
+trained on six facial expressions. We assess these results both quantitatively
+and qualitatively, comparing them to ground-truth masks based on Friesen and
+Ekman's descriptions, as well as among themselves. We use Intersection over
+Union (IoU) and normalized correlation coefficients for comparisons. We
+generate 72 heatmaps to highlight critical regions for each expression and
+architecture. Qualitatively, models with pre-trained weights show more
+similarity in heatmaps compared to those without pre-training. Specifically,
+eye and nose areas influence certain facial expressions, while the mouth is
+consistently important across all models and expressions. Quantitatively, we
+find low average IoU values (avg. 0.2702) across all expressions and
+architectures. The best-performing architecture averages 0.3269, while the
+worst-performing one averages 0.2066. Dendrograms, built with the normalized
+correlation coefficient, reveal two main clusters for most expressions: models
+with pre-training and models without pre-training. Findings suggest limited
+alignment between human and AI facial expression recognition, with network
+architectures influencing the similarity, as similar architectures prioritize
+similar facial regions.
+
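+ The IoU scores reported above (average 0.2702) compare a model's attention
+heatmap against a ground-truth facial-region mask after binarization. A minimal
+sketch of that comparison, with the binarization threshold chosen purely for
+illustration:
+
+import numpy as np
+
+def heatmap_iou(heatmap: np.ndarray, mask: np.ndarray, threshold: float = 0.5) -> float:
+    """Binarize a heatmap at a fraction of its maximum and compute IoU with a mask."""
+    h_bin = heatmap >= threshold * heatmap.max()
+    m_bin = mask > 0
+    union = np.logical_or(h_bin, m_bin).sum()
+    if union == 0:
+        return 0.0
+    return float(np.logical_and(h_bin, m_bin).sum()) / float(union)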
+
+ comment: Multimed Tools Appl (2024) +
+
+
+
+
+ + ♻ ☆ DocLayLLM: An Efficient and Effective Multi-modal Extension of Large + Language Models for Text-rich Document Understanding + + +
+ Text-rich document understanding (TDU) refers to analyzing and comprehending +documents containing substantial textual content. With the rapid evolution of +large language models (LLMs), they have been widely leveraged for TDU due to +their remarkable versatility and generalization. In this paper, we introduce +DocLayLLM, an efficient and effective multi-modal extension of LLMs +specifically designed for TDU. By integrating visual patch tokens and 2D +positional tokens into LLMs and encoding the document content using the LLMs +themselves, we fully take advantage of the document comprehension capability of +LLMs and enhance their perception of OCR information. We have also deeply +considered the role of the chain-of-thought (CoT) and innovatively proposed the +techniques of CoT Pre-training and CoT Annealing. Our DocLayLLM can achieve +remarkable performances with lightweight training settings, showcasing its +efficiency and effectiveness. Experimental results demonstrate that our +DocLayLLM surpasses existing OCR-dependent methods and also outperforms +OCR-free competitors. + +
+
+
+
+
+ + ♻ ☆ Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention + + +
+ The Transformer architecture has revolutionized deep learning through its
+Self-Attention mechanism, which effectively captures contextual information.
+However, the memory footprint of Self-Attention presents significant challenges
+for long-sequence tasks. Grouped Query Attention (GQA) addresses this issue by
+grouping queries and mean-pooling the corresponding key-value heads, reducing
+the overall number of parameters and memory requirements in a flexible manner
+without compromising model accuracy. In this work, we introduce enhancements to
+GQA, focusing on two novel approaches that deviate from the static nature of
+grouping: Key-Distributed GQA (KDGQA) and Dynamic Key-Distributed GQA (DGQA),
+which leverage information from the norms of the key heads to inform query
+allocation. Specifically, KDGQA looks at the ratios of the norms of the key
+heads during each forward pass, while DGQA examines the ratios of the norms as
+they evolve through training. Additionally, we present Perturbed GQA (PGQA) as
+a case study, which introduces variability in (static) group formation by
+subtracting noise from the attention maps. Our experiments with up-trained
+Vision Transformers for image classification on datasets such as CIFAR-10,
+CIFAR-100, Food101, and Tiny ImageNet demonstrate the promise of these variants
+in improving upon the original GQA through more informed and adaptive grouping
+mechanisms: specifically, ViT-L experiences accuracy gains of up to 8% when
+utilizing DGQA in comparison to GQA and other variants. We further analyze the
+impact of the number of Key-Value Heads on performance, underscoring the
+importance of utilizing query-key affinities. Code is available on GitHub.
+
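+ The key-norm-driven grouping sketched above can be illustrated in a few lines:
+instead of GQA's fixed, uniform query groups, query heads are allocated to
+key-value heads in proportion to the norms of their keys. This is a hedged
+sketch of the general idea only; the exact allocation rule in KDGQA/DGQA may
+differ, and the shapes here are assumptions.
+
+import torch
+
+def allocate_queries_by_key_norm(keys: torch.Tensor, num_query_heads: int) -> torch.Tensor:
+    """keys: (batch, num_kv_heads, seq_len, head_dim). Returns query-group sizes per KV head."""
+    # One scalar "importance" per key-value head: the mean L2 norm of its key vectors.
+    key_norms = keys.norm(dim=-1).mean(dim=(0, 2))            # (num_kv_heads,)
+    ratios = key_norms / key_norms.sum()
+    sizes = torch.floor(ratios * num_query_heads).long()
+    # Hand out any leftover query heads to the KV heads with the largest norms.
+    remainder = num_query_heads - int(sizes.sum())
+    if remainder > 0:
+        _, order = torch.sort(key_norms, descending=True)
+        sizes[order[:remainder]] += 1
+    return sizes  # sizes[i] query heads share key-value head i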
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Adapting Segment Anything Model to Multi-modal Salient Object Detection + with Semantic Feature Fusion Guidance + + +
+ Although most existing multi-modal salient object detection (SOD) methods
+demonstrate effectiveness through training models from scratch, the limited
+multi-modal data hinders these methods from reaching optimality. In this paper,
+we propose a novel framework to explore and exploit the powerful feature
+representation and zero-shot generalization ability of the pre-trained Segment
+Anything Model (SAM) for multi-modal SOD. Despite being a recent vision
+foundation model, driving the class-agnostic SAM to comprehend and detect
+salient objects accurately is non-trivial, especially in challenging scenes. To
+this end, we develop SAM with semantic feature fusion guidance (Sammese), which
+incorporates multi-modal saliency-specific knowledge into SAM to adapt it to
+multi-modal SOD tasks. However, it is difficult for SAM, trained on
+single-modal data, to directly mine the complementary benefits of multi-modal
+inputs and comprehensively utilize them to achieve accurate saliency
+prediction. To address these issues, we first design a multi-modal
+complementary fusion module to extract robust multi-modal semantic features by
+integrating information from visible and thermal or depth image pairs. Then, we
+feed the extracted multi-modal semantic features into both the SAM image
+encoder and mask decoder for fine-tuning and prompting, respectively.
+Specifically, in the image encoder, a multi-modal adapter is proposed to adapt
+the single-modal SAM to multi-modal information. In the mask decoder, a
+semantic-geometric prompt generation strategy is proposed to produce
+corresponding embeddings with various saliency cues. Extensive experiments on
+both RGB-D and RGB-T SOD benchmarks show the effectiveness of the proposed
+framework.
+
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ HAIR: Hypernetworks-based All-in-One Image Restoration + + +
+ Image restoration aims to recover a high-quality clean image from its
+degraded version. Recent progress in image restoration has demonstrated the
+effectiveness of All-in-One image restoration models in addressing various
+degradations simultaneously. However, these existing methods typically utilize
+the same parameters to tackle images with different degradation types, thus
+forcing the model to balance the performance between different tasks and
+limiting its performance on each task. To alleviate this issue, we propose
+HAIR, a Hypernetworks-based All-in-One Image Restoration method that
+dynamically generates parameters based on input images. Specifically, HAIR
+consists of two main components, i.e., Classifier and Hyper Selecting Net
+(HSN). The Classifier is a simple image classification network used to generate
+a Global Information Vector (GIV) that contains the degradation information of
+the input image, and the HSN is a simple fully-connected neural network that
+receives the GIV and outputs parameters for the corresponding modules.
+Extensive experiments demonstrate that HAIR can significantly improve the
+performance of existing image restoration models in a plug-and-play manner,
+both in single-task and all-in-one settings. Notably, our innovative model,
+Res-HAIR, which integrates HAIR into the well-known Restormer, can obtain
+superior or comparable performance compared with current state-of-the-art
+methods. Moreover, we theoretically demonstrate that our proposed HAIR requires
+fewer parameters in contrast to the prevalent All-in-One methodologies. The
+code is available at https://github.com/toummHus/HAIR.
+
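+ The Classifier-plus-HSN pattern described above is essentially a hypernetwork:
+a degradation-aware vector is mapped by a small fully-connected net to the
+weights of a restoration module. The following is an illustrative toy sketch of
+that pattern only; the module sizes and the single generated conv layer are
+assumptions, not HAIR's actual design.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class TinyHyperRestorer(nn.Module):
+    def __init__(self, giv_dim=64, channels=16, ksize=3):
+        super().__init__()
+        self.channels, self.ksize = channels, ksize
+        # Classifier: summarizes the degraded input into a Global Information Vector.
+        self.classifier = nn.Sequential(
+            nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
+            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, giv_dim))
+        # Hyper Selecting Net: emits the weights of one conv layer from the GIV.
+        n_params = channels * channels * ksize * ksize
+        self.hsn = nn.Sequential(nn.Linear(giv_dim, 128), nn.ReLU(),
+                                 nn.Linear(128, n_params))
+        self.stem = nn.Conv2d(3, channels, 3, padding=1)
+        self.head = nn.Conv2d(channels, 3, 3, padding=1)
+
+    def forward(self, x):
+        giv = self.classifier(x)                                   # (B, giv_dim)
+        w = self.hsn(giv).view(-1, self.channels, self.channels, self.ksize, self.ksize)
+        feats = self.stem(x)
+        out = []
+        for b in range(x.size(0)):                                 # per-sample generated weights
+            out.append(F.conv2d(feats[b:b + 1], w[b], padding=self.ksize // 2))
+        return self.head(torch.cat(out, dim=0))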
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Boost Your NeRF: A Model-Agnostic Mixture of Experts Framework for High + Quality and Efficient Rendering ECCV 2024 + + +
+ Since the introduction of NeRFs, considerable attention has been focused on
+improving their training and inference times, leading to the development of
+Fast-NeRF models. Despite demonstrating impressive rendering speed and
+quality, the rapid convergence of such models poses challenges for further
+improving reconstruction quality. Common strategies to improve rendering
+quality involve augmenting model parameters or increasing the number of sampled
+points. However, these computationally intensive approaches encounter
+limitations in achieving significant quality enhancements. This study
+introduces a model-agnostic framework inspired by Sparsely-Gated Mixture of
+Experts to enhance rendering quality without escalating computational
+complexity. Our approach enables specialization in rendering different scene
+components by employing a mixture of experts with varying resolutions. We
+present a novel gate formulation designed to maximize expert capabilities and
+propose a resolution-based routing technique to effectively induce sparsity and
+decompose scenes. Our work significantly improves reconstruction quality while
+maintaining competitive performance.
+
+
+ comment: The paper has been accepted to the ECCV 2024 conference +
+
+
+
+
+ + ♻ ☆ Enhancing Quantitative Image Synthesis through Pretraining and + Resolution Scaling for Bone Mineral Density Estimation from a Plain X-ray + Image MICCAI + + +
+ While most vision tasks are essentially visual in nature (for recognition),
+some important tasks, especially in the medical field, also require
+quantitative analysis (for quantification) using quantitative images. Unlike in
+visual analysis, pixel values in quantitative images correspond to physical
+metrics measured by specific devices (e.g., a depth image). However, recent
+work has shown that it is sometimes possible to synthesize accurate
+quantitative values from visual ones (e.g., depth from visual cues or defocus).
+This research aims to improve quantitative image synthesis (QIS) by exploring
+pretraining and image resolution scaling. We propose a benchmark for evaluating
+pretraining performance using the task of QIS-based bone mineral density (BMD)
+estimation from plain X-ray images, where the synthesized quantitative image is
+used to derive BMD. Our results show that appropriate pretraining can improve
+QIS performance, significantly raising the correlation of BMD estimation from
+0.820 to 0.898, while other strategies do not help or even hinder it. Scaling
+up the resolution can further boost the correlation to 0.923, a significant
+improvement over conventional methods. Future work will include exploring more
+pretraining strategies and validating them on other image synthesis tasks.
+
+
+ comment: SASHIMI, 2024 (MICCAI workshop). 13 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ NOVUM: Neural Object Volumes for Robust Object Classification ECCV 2024 + + +
+ Discriminative models for object classification typically learn image-based +representations that do not capture the compositional and 3D nature of objects. +In this work, we show that explicitly integrating 3D compositional object +representations into deep networks for image classification leads to a largely +enhanced generalization in out-of-distribution scenarios. In particular, we +introduce a novel architecture, referred to as NOVUM, that consists of a +feature extractor and a neural object volume for every target object class. +Each neural object volume is a composition of 3D Gaussians that emit feature +vectors. This compositional object representation allows for a highly robust +and fast estimation of the object class by independently matching the features +of the 3D Gaussians of each category to features extracted from an input image. +Additionally, the object pose can be estimated via inverse rendering of the +corresponding neural object volume. To enable the classification of objects, +the neural features at each 3D Gaussian are trained discriminatively to be +distinct from (i) the features of 3D Gaussians in other categories, (ii) +features of other 3D Gaussians of the same object, and (iii) the background +features. Our experiments show that NOVUM offers intriguing advantages over +standard architectures due to the 3D compositional structure of the object +representation, namely: (1) An exceptional robustness across a spectrum of +real-world and synthetic out-of-distribution shifts and (2) an enhanced human +interpretability compared to standard models, all while maintaining real-time +inference and a competitive accuracy on in-distribution data. + +
+
+ comment: 14 pages, 4 figures, accepted at ECCV 2024, code is accessible at + https://github.com/GenIntel/NOVUM +
+
+
+
+
+ + ♻ ☆ Brain3D: Generating 3D Objects from fMRI + + +
+ Understanding the hidden mechanisms behind human visual perception is a
+fundamental question in neuroscience. To that end, investigating the neural
+responses of human mental activity, such as through functional Magnetic
+Resonance Imaging (fMRI), has been a significant research vehicle. However,
+analyzing fMRI signals is challenging, costly, daunting, and demanding in terms
+of professional training. Despite remarkable progress in fMRI analysis,
+existing approaches are limited to generating 2D images and remain far from
+being biologically meaningful and practically useful. Motivated by this
+insight, we propose to generate visually plausible and functionally more
+comprehensive 3D outputs decoded from brain signals, enabling more
+sophisticated modeling of fMRI data. Conceptually, we reformulate this task as
+an fMRI-conditioned 3D object generation problem. We design a novel 3D object
+representation learning method, Brain3D, that takes as input the fMRI data of a
+subject who was presented with a 2D image, and yields as output the
+corresponding 3D object images. The key capabilities of this model include
+tackling the noise with high-level semantic signals and a two-stage
+architecture design for progressive high-level information integration.
+Extensive experiments validate the superior capability of our model over
+previous state-of-the-art 3D object generation methods. Importantly, we show
+that our model captures the distinct functionalities of each region of the
+human vision system as well as their intricate interplay relationships,
+aligning remarkably with the established discoveries in neuroscience. Further,
+preliminary evaluations indicate that Brain3D can successfully identify the
+disordered brain regions in simulated scenarios, such as V1, V2, V3, V4, and
+the medial temporal lobe (MTL) within the human visual system. Our data and
+code will be available at https://brain-3d.github.io/.
+
+
+ comment: 20 pages, 11 figures, project page: https://brain-3d.github.io/ +
+
+
+
+
+ + ♻ ☆ DualAnoDiff: Dual-Interrelated Diffusion Model for Few-Shot Anomaly + Image Generation + + +
+ The performance of anomaly inspection in industrial manufacturing is +constrained by the scarcity of anomaly data. To overcome this challenge, +researchers have started employing anomaly generation approaches to augment the +anomaly dataset. However, existing anomaly generation methods suffer from +limited diversity in the generated anomalies and struggle to achieve a seamless +blending of this anomaly with the original image. In this paper, we overcome +these challenges from a new perspective, simultaneously generating a pair of +the overall image and the corresponding anomaly part. We propose DualAnoDiff, a +novel diffusion-based few-shot anomaly image generation model, which can +generate diverse and realistic anomaly images by using a dual-interrelated +diffusion model, where one of them is employed to generate the whole image +while the other one generates the anomaly part. Moreover, we extract background +and shape information to mitigate the distortion and blurriness phenomenon in +few-shot image generation. Extensive experiments demonstrate the superiority of +our proposed model over state-of-the-art methods in terms of both realism and +diversity. Overall, our approach significantly improves the performance of +downstream anomaly detection tasks, including anomaly detection, anomaly +localization, and anomaly classification tasks. + +
+
+ comment: Code: https://github.com/yinyjin/DualAnoDiff +
+
+
+
+
+ + ♻ ☆ Lightweight High-Speed Photography Built on Coded Exposure and Implicit + Neural Representation of Videos + + +
+ The demand for compact cameras capable of recording high-speed scenes with
+high resolution is steadily increasing. However, achieving such capabilities
+often entails high bandwidth requirements, resulting in bulky, heavy systems
+unsuitable for low-capacity platforms. To address this challenge, leveraging a
+coded exposure setup to encode a frame sequence into a blurry snapshot and
+subsequently retrieve the latent sharp video presents a lightweight solution.
+Nevertheless, restoring motion from blur remains a formidable challenge due to
+the inherent ill-posedness of motion blur decomposition, the intrinsic
+ambiguity in motion direction, and the diverse motions present in natural
+videos. In this study, we propose a novel approach to address these challenges
+by combining the classical coded exposure imaging technique with the emerging
+implicit neural representation for videos. We strategically embed motion
+direction cues into the blurry image during the imaging process. Additionally,
+we develop a novel implicit neural representation based blur decomposition
+network to sequentially extract the latent video frames from the blurry image,
+leveraging the embedded motion direction cues. To validate the effectiveness
+and efficiency of our proposed framework, we conduct extensive experiments
+using benchmark datasets and real-captured blurry images. The results
+demonstrate that our approach significantly outperforms existing methods in
+terms of both quality and flexibility. The code for our work is available at
+https://github.com/zhihongz/BDINR.
+
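+ To make the "encode a frame sequence into a blurry snapshot" step concrete,
+the toy simulation below forms a coded-exposure image as the code-weighted
+temporal average of sharp frames. This is an assumption-laden illustration of
+coded exposure in general, not the paper's imaging pipeline or its learned
+code.
+
+import numpy as np
+
+def coded_exposure_snapshot(frames: np.ndarray, code: np.ndarray) -> np.ndarray:
+    """frames: (T, H, W) or (T, H, W, C) floats; code: (T,) binary shutter pattern."""
+    weights = code.astype(frames.dtype)
+    # Frames captured while the coded shutter is open are averaged into one blurry image.
+    return np.tensordot(weights, frames, axes=(0, 0)) / max(weights.sum(), 1.0)
+
+rng = np.random.default_rng(0)
+video = rng.random((16, 64, 64))            # 16 sharp frames of a fast-moving scene
+code = rng.integers(0, 2, size=16)          # per-frame open/closed exposure code
+blurry = coded_exposure_snapshot(video, code)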
+
+ comment: Accepted by IJCV +
+
+
+
+
+ + ♻ ☆ Structural Attention: Rethinking Transformer for Unpaired Medical Image + Synthesis MICCAI + + +
+ Unpaired medical image synthesis aims to provide complementary information
+for accurate clinical diagnostics and to address challenges in obtaining
+aligned multi-modal medical scans. Transformer-based models excel in imaging
+translation tasks thanks to their ability to capture long-range dependencies.
+Although effective in supervised training settings, their performance falters
+in unpaired image synthesis, particularly in synthesizing structural details.
+This paper empirically demonstrates that, lacking strong inductive biases,
+Transformers can converge to non-optimal solutions in the absence of paired
+data. To address this, we introduce the UNet Structured Transformer (UNest), a
+novel architecture incorporating structural inductive biases for unpaired
+medical image synthesis. We leverage the foundational Segment-Anything Model to
+precisely extract the foreground structure and perform structural attention
+within the main anatomy. This guides the model to learn key anatomical regions,
+thus improving structural synthesis under the lack of supervision in unpaired
+training. Evaluated on two public datasets, spanning three modalities, i.e.,
+MR, CT, and PET, UNest improves recent methods by up to 19.30% across six
+medical image synthesis tasks. Our code is released at
+https://github.com/HieuPhan33/MICCAI2024-UNest.
+
+
+ comment: MICCAI version before camera ready +
+
+
+
+
+ + ♻ ☆ xGen-MM (BLIP-3): A Family of Open Large Multimodal Models + + +
+ This report introduces xGen-MM (also known as BLIP-3), a framework for +developing Large Multimodal Models (LMMs). The framework comprises meticulously +curated datasets, a training recipe, model architectures, and a resulting suite +of LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen +initiative on foundation AI models. Our models undergo rigorous evaluation +across a range of tasks, including both single and multi-image benchmarks. Our +pre-trained base model exhibits strong in-context learning capabilities and the +instruction-tuned model demonstrates competitive performance among open-source +LMMs with similar model sizes. In addition, we introduce a safety-tuned model +with DPO, aiming to mitigate harmful behaviors such as hallucinations and +improve safety. We open-source our models, curated large-scale datasets, and +our fine-tuning codebase to facilitate further advancements in LMM research. +Associated resources will be available on our project page above. + +
+
+
+
+
+ + ♻ ☆ Classification Matters: Improving Video Action Detection with + Class-Specific Attention ECCV 2024 + + +
+ Video action detection (VAD) aims to detect actors and classify their actions
+in a video. We find that VAD suffers more from classification than from
+localization of actors. Hence, we analyze how prevailing methods form features
+for classification and find that they prioritize actor regions, yet often
+overlook the essential contextual information necessary for accurate
+classification. Accordingly, we propose to reduce the bias toward the actor and
+encourage paying attention to the context that is relevant to each action
+class. By assigning a class-dedicated query to each action class, our model can
+dynamically determine where to focus for effective classification. The proposed
+model demonstrates superior performance on three challenging benchmarks with
+significantly fewer parameters and less computation.
+
+
+ comment: 31 pages, accepted to ECCV 2024 (oral) +
+
+
+
+
+ + ♻ ☆ Drone Referring Localization: An Efficient Heterogeneous Spatial Feature + Interaction Method For UAV Self-Localization + + +
+ Image retrieval (IR) has emerged as a promising approach for
+self-localization in unmanned aerial vehicles (UAVs). However, IR-based methods
+face several challenges: 1) pre- and post-processing incur significant
+computational and storage overhead; 2) the lack of interaction between
+dual-source features impairs precise spatial perception. In this paper, we
+propose an efficient heterogeneous spatial feature interaction method, termed
+Drone Referring Localization (DRL), which aims to localize UAV-view images
+within satellite imagery. Unlike conventional methods that treat different data
+sources in isolation, followed by cosine similarity computations, DRL
+facilitates the learnable interaction of heterogeneous features. To implement
+the proposed DRL, we design two transformer-based frameworks, Post-Fusion and
+Mix-Fusion, enabling end-to-end training and inference. Furthermore, we
+introduce random scale cropping and weight balance loss techniques to augment
+paired data and optimize the balance between positive and negative sample
+weights. Additionally, we construct a new dataset, UL14, and establish a
+benchmark tailored to the DRL framework. Compared to traditional IR methods,
+DRL achieves superior localization accuracy (MA@20 +9.4%) while significantly
+reducing computational time (to 1/7) and storage overhead (to 1/3). The dataset
+and code are publicly available at https://github.com/Dmmm1997/DRL.
+
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ MolNexTR: A Generalized Deep Learning Model for Molecular Image + Recognition + + +
+ In the field of chemical structure recognition, the task of converting
+molecular images into machine-readable data formats such as SMILES strings
+remains a significant challenge, primarily due to the varied drawing styles and
+conventions prevalent in chemical literature. To bridge this gap, we propose
+MolNexTR, a novel image-to-graph deep learning model that fuses the strengths
+of ConvNext, a powerful Convolutional Neural Network variant, and the Vision
+TRansformer. This integration facilitates a more detailed extraction of both
+local and global features from molecular images. MolNexTR can predict atoms and
+bonds simultaneously and understand their layout rules. It also excels at
+flexibly integrating symbolic chemistry principles to discern chirality and
+decipher abbreviated structures. We further incorporate a series of advanced
+algorithms, including an improved data augmentation module, an image
+contamination module, and a post-processing module for producing the final
+SMILES output. These modules cooperate to enhance the model's robustness to the
+diverse styles of molecular images found in real literature. On our test sets,
+MolNexTR demonstrates superior performance, achieving an accuracy rate of
+81-97%, marking a significant advancement in the domain of molecular structure
+recognition.
+
+
+
+
+
+ + ♻ ☆ Phase Matching for Out-of-Distribution Generalization + + +
+ The Fourier transform, an explicit decomposition method for visual signals, +has been employed to explain the out-of-distribution generalization behaviors +of Deep Neural Networks (DNNs). Previous studies indicate that the amplitude +spectrum is susceptible to the disturbance caused by distribution shifts, +whereas the phase spectrum preserves highly-structured spatial information that +is crucial for robust visual representation learning. Inspired by this insight, +this paper is dedicated to clarifying the relationships between Domain +Generalization (DG) and the frequency components. Specifically, we provide +distribution analysis and empirical experiments for the frequency components. +Based on these observations, we propose a Phase Matching approach, termed +PhaMa, to address DG problems. To this end, PhaMa introduces perturbations on +the amplitude spectrum and establishes spatial relationships to match the phase +components with patch contrastive learning. Experiments on multiple benchmarks +demonstrate that our proposed method achieves state-of-the-art performance in +domain generalization and out-of-distribution robustness tasks. Beyond vanilla +analysis and experiments, we further clarify the relationships between the +Fourier components and DG problems by introducing a Fourier-based Structural +Causal Model (SCM). + +
+
+
+
+
+ + ♻ ☆ SGNet: Salient Geometric Network for Point Cloud Registration + + +
+ Point Cloud Registration (PCR) is a critical and challenging task in computer +vision. One of the primary difficulties in PCR is identifying salient and +meaningful points that exhibit consistent semantic and geometric properties +across different scans. Previous methods have encountered challenges with +ambiguous matching due to the similarity among patch blocks throughout the +entire point cloud and the lack of consideration for efficient global geometric +consistency. To address these issues, we propose a new framework that includes +several novel techniques. Firstly, we introduce a semantic-aware geometric +encoder that combines object-level and patch-level semantic information. This +encoder significantly improves registration recall by reducing ambiguity in +patch-level superpoint matching. Additionally, we incorporate a prior knowledge +approach that utilizes an intrinsic shape signature to identify salient points. +This enables us to extract the most salient super points and meaningful dense +points in the scene. Secondly, we introduce an innovative transformer that +encodes High-Order (HO) geometric features. These features are crucial for +identifying salient points within initial overlap regions while considering +global high-order geometric consistency. To optimize this high-order +transformer further, we introduce an anchor node selection strategy. By +encoding inter-frame triangle or polyhedron consistency features based on these +anchor nodes, we can effectively learn high-order geometric features of salient +super points. These high-order features are then propagated to dense points and +utilized by a Sinkhorn matching module to identify key correspondences for +successful registration. In our experiments conducted on well-known datasets +such as 3DMatch/3DLoMatch and KITTI, our approach has shown promising results, +highlighting the effectiveness of our novel method. + +
+
+
+
+
+ + ♻ ☆ Fine-Grained Building Function Recognition from Street-View Images via + Geometry-Aware Semi-Supervised Learning + + +
+ In this work, we propose a geometry-aware semi-supervised method for +fine-grained building function recognition. This method leverages the geometric +relationships between multi-source data to improve the accuracy of pseudo +labels in semi-supervised learning, extending the task's scope and making it +applicable to cross-categorization systems of building function recognition. +Firstly, we design an online semi-supervised pre-training stage, which +facilitates the precise acquisition of building facade location information in +street-view images. In the second stage, we propose a geometry-aware coarse +annotation generation module. This module effectively combines GIS data and +street-view data based on the geometric relationships, improving the accuracy +of pseudo annotations. In the third stage, we combine the newly generated +coarse annotations with the existing labeled dataset to achieve fine-grained +functional recognition of buildings across multiple cities at a large scale. +Extensive experiments demonstrate that our proposed framework exhibits superior +performance in fine-grained functional recognition of buildings. Within the +same categorization system, it achieves improvements of 7.6% and 4.8% compared +to fully-supervised methods and state-of-the-art semi-supervised methods, +respectively. Additionally, our method also performs well in cross-city tasks, +i.e., extending the model trained on OmniCity (New York) to new areas (i.e., +Los Angeles and Boston). This study provides a novel solution for the +fine-grained function recognition of large-scale buildings across multiple +cities, offering essential data for understanding urban infrastructure +planning, human activity patterns, and the interactions between humans and +buildings. + +
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ♻ ☆ Multi-weather Cross-view Geo-localization Using Denoising Diffusion + Models ACM MM24 + + +
+ Cross-view geo-localization in GNSS-denied environments aims to determine an +unknown location by matching drone-view images with the correct geo-tagged +satellite-view images from a large gallery. Recent research shows that learning +discriminative image representations under specific weather conditions can +significantly enhance performance. However, the frequent occurrence of unseen +extreme weather conditions hinders progress. This paper introduces MCGF, a +Multi-weather Cross-view Geo-localization Framework designed to dynamically +adapt to unseen weather conditions. MCGF establishes a joint optimization +between image restoration and geo-localization using denoising diffusion +models. For image restoration, MCGF incorporates a shared encoder and a +lightweight restoration module to help the backbone eliminate weather-specific +information. For geo-localization, MCGF uses EVA-02 as a backbone for feature +extraction, with cross-entropy loss for training and cosine distance for +testing. Extensive experiments on University160k-WX demonstrate that MCGF +achieves competitive results for geo-localization in varying weather +conditions. + +
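+ Since the geo-localization above is trained with cross-entropy but evaluated
+by cosine distance between drone-view and satellite-view embeddings, the test
+step reduces to a nearest-neighbor ranking. A minimal sketch of that retrieval
+(the embedding shapes are assumptions):
+
+import numpy as np
+
+def rank_gallery(query_emb: np.ndarray, gallery_embs: np.ndarray) -> np.ndarray:
+    """query_emb: (D,) drone-view feature; gallery_embs: (N, D) satellite features.
+    Returns gallery indices sorted from best match to worst."""
+    q = query_emb / np.linalg.norm(query_emb)
+    g = gallery_embs / np.linalg.norm(gallery_embs, axis=1, keepdims=True)
+    cosine_distance = 1.0 - g @ q                 # (N,)
+    return np.argsort(cosine_distance)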
+
+ comment: Accepted by ACM MM24 workshop +
+
+
+
+
+ + ♻ ☆ VHAKG: A Multi-modal Knowledge Graph Based on Synchronized Multi-view + Videos of Daily Activities CIKM2024 + + +
+ Multi-modal knowledge graphs (MMKGs), which ground various non-symbolic data +(e.g., images and videos) into symbols, have attracted attention as resources +enabling knowledge processing and machine learning across modalities. However, +the construction of MMKGs for videos consisting of multiple events, such as +daily activities, is still in the early stages. In this paper, we construct an +MMKG based on synchronized multi-view simulated videos of daily activities. +Besides representing the content of daily life videos as event-centric +knowledge, our MMKG also includes frame-by-frame fine-grained changes, such as +bounding boxes within video frames. In addition, we provide support tools for +querying our MMKG. As an application example, we demonstrate that our MMKG +facilitates benchmarking vision-language models by providing the necessary +vision-language datasets for a tailored task. + +
+
+ comment: 5 pages, 4 figures, accepted by CIKM2024 Resource Track +
+
+
+
+
+ + ♻ ☆ Customize-A-Video: One-Shot Motion Customization of Text-to-Video + Diffusion Models ECCV 2024 + + +
+ Image customization has been extensively studied in text-to-image (T2I) +diffusion models, leading to impressive outcomes and applications. With the +emergence of text-to-video (T2V) diffusion models, its temporal counterpart, +motion customization, has not yet been well investigated. To address the +challenge of one-shot video motion customization, we propose Customize-A-Video +that models the motion from a single reference video and adapts it to new +subjects and scenes with both spatial and temporal varieties. It leverages +low-rank adaptation (LoRA) on temporal attention layers to tailor the +pre-trained T2V diffusion model for specific motion modeling. To disentangle +the spatial and temporal information during training, we introduce a novel +concept of appearance absorbers that detach the original appearance from the +reference video prior to motion learning. The proposed modules are trained in a +staged pipeline and inferred in a plug-and-play fashion, enabling easy +extensions to various downstream tasks such as custom video generation and +editing, video appearance customization and multiple motion combination. Our +project page can be found at https://customize-a-video.github.io. + +
+
+ comment: Accepted by ECCV 2024. Project page: + https://customize-a-video.github.io +
+
+
+
+
+ + ♻ ☆ AutoInst: Automatic Instance-Based Segmentation of LiDAR 3D Scans IROS + + +
+ Recently, progress in acquisition equipment such as LiDAR sensors has enabled +sensing increasingly spacious outdoor 3D environments. Making sense of such 3D +acquisitions requires fine-grained scene understanding, such as constructing +instance-based 3D scene segmentations. Commonly, a neural network is trained +for this task; however, this requires access to a large, densely annotated +dataset, which is widely known to be challenging to obtain. To address this +issue, in this work we propose to predict instance segmentations for 3D scenes +in an unsupervised way, without relying on ground-truth annotations. To this +end, we construct a learning framework consisting of two components: (1) a +pseudo-annotation scheme for generating initial unsupervised pseudo-labels; and +(2) a self-training algorithm for instance segmentation to fit robust, accurate +instances from initial noisy proposals. To enable generating 3D instance mask +proposals, we construct a weighted proxy-graph by connecting 3D points with +edges integrating multi-modal image- and point-based self-supervised features, +and perform graph-cuts to isolate individual pseudo-instances. We then build on +a state-of-the-art point-based architecture and train a 3D instance +segmentation model, resulting in significant refinement of initial proposals. +To scale to arbitrary complexity 3D scenes, we design our algorithm to operate +on local 3D point chunks and construct a merging step to generate scene-level +instance segmentations. Experiments on the challenging SemanticKITTI benchmark +demonstrate the potential of our approach, where it attains 13.3% higher +Average Precision and 9.1% higher F1 score compared to the best-performing +baseline. The code will be made publicly available at +https://github.com/artonson/autoinst. + +
+
+ comment: 8 pages, 7 figures, to be published in IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS) 2024 +
+
+
+
+
+ + ♻ ☆ AnomalousPatchCore: Exploring the Use of Anomalous Samples in Industrial + Anomaly Detection ECCV + + +
+ Visual inspection, or industrial anomaly detection, is one of the most common
+quality control types in manufacturing. The task is to identify the presence of
+an anomaly given an image, e.g., a missing component on an image of a circuit
+board, for subsequent manual inspection. While industrial anomaly detection has
+seen a surge in recent years, most anomaly detection methods still utilize
+knowledge only from normal samples, failing to leverage the information from
+the frequently available anomalous samples. Additionally, they heavily rely on
+very general feature extractors pre-trained on common image classification
+datasets. In this paper, we address these shortcomings and propose the new
+anomaly detection system AnomalousPatchCore (APC) based on a feature extractor
+fine-tuned with normal and anomalous in-domain samples and a subsequent memory
+bank for identifying unusual features. To fine-tune the feature extractor in
+APC, we propose three auxiliary tasks that address the different aspects of
+anomaly detection (classification vs. localization) and mitigate the effect of
+the imbalance between normal and anomalous samples. Our extensive evaluation on
+the MVTec dataset shows that APC outperforms state-of-the-art systems in
+detecting anomalies, which is especially important in industrial anomaly
+detection given the subsequent manual inspection. In detailed ablation studies,
+we further investigate the properties of our APC.
+
+
+ comment: Accepted at the 2nd workshop on Vision-based InduStrial InspectiON + (VISION) @ ECCV +
+
+
+
+
+ + ♻ ☆ MMASD+: A Novel Dataset for Privacy-Preserving Behavior Analysis of + Children with Autism Spectrum Disorder + + +
+ Autism spectrum disorder (ASD) is characterized by significant challenges in
+social interaction and comprehending communication signals. Recently,
+therapeutic interventions for ASD have increasingly utilized deep
+learning-powered computer vision techniques to monitor individual progress over
+time. These models are trained on private, non-public datasets from the autism
+community, creating challenges in comparing results across different models due
+to privacy-preserving data-sharing issues. This work introduces MMASD+, an
+enhanced version of the novel open-source dataset called Multimodal ASD
+(MMASD). MMASD+ consists of diverse data modalities, including 3D-Skeleton, 3D
+Body Mesh, and Optical Flow data. It integrates the capabilities of Yolov8 and
+Deep SORT algorithms to distinguish between the therapist and children,
+addressing a significant barrier in the original dataset. Additionally, a
+Multimodal Transformer framework is proposed to predict 11 action types and the
+presence of ASD. This framework achieves an accuracy of 95.03% for predicting
+action types and 96.42% for predicting ASD presence, demonstrating over a 10%
+improvement compared to models trained on single data modalities. These
+findings highlight the advantages of integrating multiple data modalities
+within the Multimodal Transformer framework.
+
+
+
+
+
+ + ♻ ☆ Field-of-View Extension for Brain Diffusion MRI via Deep Generative + Models + + +
+ Purpose: In diffusion MRI (dMRI), the volumetric and bundle analyses of
+whole-brain tissue microstructure and connectivity can be severely impeded by
+an incomplete field-of-view (FOV). This work aims to develop a method for
+imputing the missing slices directly from existing dMRI scans with an
+incomplete FOV. We hypothesize that the imputed image with complete FOV can
+improve the whole-brain tractography for corrupted data with incomplete FOV.
+Therefore, our approach provides a desirable alternative to discarding the
+valuable dMRI data, enabling subsequent tractography analyses that would
+otherwise be challenging or unattainable with corrupted data. Approach: We
+propose a framework based on a deep generative model that estimates the absent
+brain regions in dMRI scans with incomplete FOV. The model is capable of
+learning both the diffusion characteristics in diffusion-weighted images (DWI)
+and the anatomical features evident in the corresponding structural images for
+efficiently imputing missing slices of DWI outside of incomplete FOV. Results:
+For evaluating the imputed slices, on the WRAP dataset the proposed framework
+achieved PSNR(b=0) = 22.397, SSIM(b=0) = 0.905, PSNR(b=1300) = 22.479,
+SSIM(b=1300) = 0.893; on the NACC dataset it achieved PSNR(b=0) = 21.304,
+SSIM(b=0) = 0.892, PSNR(b=1300) = 21.599, SSIM(b=1300) = 0.877. The proposed
+framework improved the tractography accuracy, as demonstrated by an increased
+average Dice score for 72 tracts (p < 0.001) on both the WRAP and NACC
+datasets. Conclusions: Results suggest that the proposed framework achieved
+sufficient imputation performance in dMRI data with incomplete FOV for
+improving whole-brain tractography, thereby repairing the corrupted data. Our
+approach achieved more accurate whole-brain tractography results with extended
+and complete FOV and reduced the uncertainty when analyzing bundles associated
+with Alzheimer's Disease.
+
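+ For reference, the PSNR values quoted above (in dB) follow the standard
+definition below; the choice of data range depends on how the images are
+normalized and is an assumption here.
+
+import numpy as np
+
+def psnr(reference: np.ndarray, estimate: np.ndarray, data_range: float = 1.0) -> float:
+    """Peak signal-to-noise ratio in dB between a reference and an estimated image."""
+    mse = np.mean((reference.astype(np.float64) - estimate.astype(np.float64)) ** 2)
+    if mse == 0:
+        return float("inf")
+    return 10.0 * np.log10((data_range ** 2) / mse)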
+
+ comment: 20 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Biomedical Image Segmentation: A Systematic Literature Review of Deep + Learning Based Object Detection Methods + + +
+ Biomedical image segmentation plays a vital role in the diagnosis of diseases
+across various organs. Deep learning-based object detection methods are
+commonly used for such segmentation. There is extensive research on this topic;
+however, no standardized review of it exists. Existing surveys often lack a
+standardized approach or focus on broader segmentation techniques. In this
+paper, we conducted a systematic literature review (SLR), collecting and
+analysing 148 articles that explore deep learning object detection methods for
+biomedical image segmentation. We critically analyzed these methods, identified
+the key challenges, and discussed the future directions. From the selected
+articles we extracted results including the deep learning models, targeted
+imaging modalities, targeted diseases, and the metrics used to analyze the
+methods. The results are presented in tabular and/or chart form, organized into
+three major categories: two-stage detection models, one-stage detection models,
+and point-based detection models. Each article is individually analyzed along
+with its pros and cons. Finally, we discuss open challenges, potential
+benefits, and future research directions. This SLR aims to provide the research
+community with a quick yet deep understanding of these segmentation models,
+ultimately facilitating the development of more powerful solutions for
+biomedical image analysis.
+
+
+
+
+
+ + ♻ ☆ PhysPart: Physically Plausible Part Completion for Interactable Objects + + +
+ Interactable objects are ubiquitous in our daily lives. Recent advances in 3D +generative models make it possible to automate the modeling of these objects, +benefiting a range of applications from 3D printing to the creation of robot +simulation environments. However, while significant progress has been made in +modeling 3D shapes and appearances, modeling object physics, particularly for +interactable objects, remains challenging due to the physical constraints +imposed by inter-part motions. In this paper, we tackle the problem of +physically plausible part completion for interactable objects, aiming to +generate 3D parts that not only fit precisely into the object but also allow +smooth part motions. To this end, we propose a diffusion-based part generation +model that utilizes geometric conditioning through classifier-free guidance and +formulates physical constraints as a set of stability and mobility losses to +guide the sampling process. Additionally, we demonstrate the generation of +dependent parts, paving the way toward sequential part generation for objects +with complex part-whole hierarchies. Experimentally, we introduce a new metric +for measuring physical plausibility based on motion success rates. Our model +outperforms existing baselines over shape and physical metrics, especially +those that do not adequately model physical constraints. We also demonstrate +our applications in 3D printing, robot manipulation, and sequential part +generation, showing our strength in realistic tasks with the demand for high +physical plausibility. + +
+
+
+
+
+ + ♻ ☆ SurGen: Text-Guided Diffusion Model for Surgical Video Generation + + +
+ Diffusion-based video generation models have made significant strides, +producing outputs with improved visual fidelity, temporal coherence, and user +control. These advancements hold great promise for improving surgical education +by enabling more realistic, diverse, and interactive simulation environments. +In this study, we introduce SurGen, a text-guided diffusion model tailored for +surgical video synthesis, producing the highest resolution and longest duration +videos among existing surgical video generation models. We validate the visual +and temporal quality of the outputs using standard image and video generation +metrics. Additionally, we assess their alignment to the corresponding text +prompts through a deep learning classifier trained on surgical data. Our +results demonstrate the potential of diffusion models to serve as valuable +educational tools for surgical trainees. + +
+
+
+
+
+ + ♻ ☆ Interpretable Image Emotion Recognition: A Domain Adaptation Approach + Using Facial Expressions + + +
+ This paper proposes a feature-based domain adaptation technique for +identifying emotions in generic images, encompassing both facial and non-facial +objects, as well as non-human components. This approach addresses the challenge +of the limited availability of pre-trained models and well-annotated datasets +for Image Emotion Recognition (IER). Initially, a deep-learning-based Facial +Expression Recognition (FER) system is developed, classifying facial images +into discrete emotion classes. Maintaining the same network architecture, this +FER system is then adapted to recognize emotions in generic images through the +application of discrepancy loss, enabling the model to effectively learn IER +features while classifying emotions into categories such as 'happy,' 'sad,' +'hate,' and 'anger.' Additionally, a novel interpretability method, Divide and +Conquer based Shap (DnCShap), is introduced to elucidate the visual features +most relevant for emotion recognition. The proposed IER system demonstrated +emotion classification accuracies of 60.98% for the IAPSa dataset, 58.86% for +the ArtPhoto dataset, 69.13% for the FI dataset, and 58.06% for the EMOTIC +dataset. The system effectively identifies the important visual features +leading to specific emotion classifications and provides detailed embedding +plots to explain the predictions, enhancing the understanding and trust in +AI-driven emotion recognition systems. + +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing the sequence of historical interactions between users and items, +sequential recommendation models learn user intent and make predictions about +the next item of interest. Next to these item interactions, most systems also +have interactions with pages not related to specific items, for example +navigation pages, account pages, and pages for a specific category, which may +provide additional insights into the user's interests. However, while there are +several approaches to integrate additional information about items and users, +the topic of integrating non-item pages has been less explored. We use the +hypothesis testing framework HypTrails to show that there is indeed a +relationship between these non-item pages and the items of interest and fill +this gap by proposing various approaches for representing non-item pages (e.g., +based on their content) to use them as an additional information source for the +task of sequential next-item prediction. + We create a synthetic dataset with non-item pages highly related to the +subsequent item to show that the models are generally capable of learning from +these interactions, and subsequently evaluate the improvements gained by +including non-item pages in two real-world datasets. + We adapt eight popular sequential recommender models, covering CNN-, RNN- and +transformer-based architectures, to integrate non-item pages and investigate +the capabilities of these models to leverage their information for next item +prediction. We also analyze their behavior on noisy data and compare different +item representation strategies. + Our results show that non-item pages are a valuable source of information, +but representing such a page well is the key to successfully leveraging them. +The inclusion of non-item pages increases next-item prediction performance in +all examined model architectures, though to varying degrees. + +
+
+ comment: 36 pages, 19 figures; Work in Progress +
+
+
+
+
+ + ☆ Knowledge Navigator: LLM-guided Browsing Framework for Exploratory + Search in Scientific Literature + + +
+ The exponential growth of scientific literature necessitates advanced tools +for effective knowledge exploration. We present Knowledge Navigator, a system +designed to enhance exploratory search abilities by organizing and structuring +the retrieved documents from broad topical queries into a navigable, two-level +hierarchy of named and descriptive scientific topics and subtopics. This +structured organization provides an overall view of the research themes in a +domain, while also enabling iterative search and deeper knowledge discovery +within specific subtopics by allowing users to refine their focus and retrieve +additional relevant documents. Knowledge Navigator combines LLM capabilities +with cluster-based methods to enable an effective browsing method. We +demonstrate our approach's effectiveness through automatic and manual +evaluations on two novel benchmarks, CLUSTREC-COVID and SCITOC. Our code, +prompts, and benchmarks are made publicly available. + +
+
+
+
+
+ + ☆ Evaluating Named Entity Recognition Using Few-Shot Prompting with Large + Language Models + + +
+ This paper evaluates Few-Shot Prompting with Large Language Models for Named +Entity Recognition (NER). Traditional NER systems rely on extensive labeled +datasets, which are costly and time-consuming to obtain. Few-Shot Prompting or +in-context learning enables models to recognize entities with minimal examples. +We assess state-of-the-art models like GPT-4 in NER tasks, comparing their +few-shot performance to fully supervised benchmarks. Results show that while +there is a performance gap, large models excel in adapting to new entity types +and domains with very limited data. We also explore the effects of prompt +engineering, guided output format and context length on performance. This study +underscores Few-Shot Learning's potential to reduce the need for large labeled +datasets, enhancing NER scalability and accessibility. + +
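+ To make the few-shot prompting setup concrete, here is a minimal, hypothetical sketch of how such a prompt could be assembled and its output parsed; the prompt wording, entity types, and the chat-completion call it would feed are placeholders, not the paper's exact protocol.
+
+def build_ner_prompt(examples, sentence,
+                     entity_types=("PERSON", "LOCATION", "ORGANIZATION")):
+    """Assemble a few-shot NER prompt from (text, [(span, type), ...]) demos."""
+    lines = [f"Extract entities of types {', '.join(entity_types)}. "
+             "Answer with one 'span -> type' pair per line."]
+    for text, ents in examples:
+        lines.append(f"Sentence: {text}")
+        lines.append("Entities:")
+        lines.extend(f"{span} -> {etype}" for span, etype in ents)
+    lines.append(f"Sentence: {sentence}")
+    lines.append("Entities:")
+    return "\n".join(lines)
+
+def parse_entities(raw_completion):
+    """Parse 'span -> type' lines returned by the model back into tuples."""
+    pairs = []
+    for line in raw_completion.strip().splitlines():
+        if "->" in line:
+            span, etype = line.split("->", 1)
+            pairs.append((span.strip(), etype.strip()))
+    return pairs
+
+# Toy usage: one demonstration, one query sentence.
+demo = [("Ada Lovelace lived in London.",
+         [("Ada Lovelace", "PERSON"), ("London", "LOCATION")])]
+prompt = build_ner_prompt(demo, "Grace Hopper joined the US Navy.")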
+
+ comment: Github repo: https://github.com/GEODE-project/ner-llm +
+
+
+
+
+ + ☆ Interactive Agents: Simulating Counselor-Client Psychological Counseling + via Role-Playing LLM-to-LLM Interactions + + +
+ Virtual counselors powered by large language models (LLMs) aim to create +interactive support systems that effectively assist clients struggling with +mental health challenges. To replicate counselor-client conversations, +researchers have built an online mental health platform that allows +professional counselors to provide clients with text-based counseling services +for about an hour per session. Notwithstanding its effectiveness, challenges +exist as human annotation is time-consuming, cost-intensive, privacy-protected, +and not scalable. To address this issue and investigate the applicability of +LLMs in psychological counseling conversation simulation, we propose a +framework that employs two LLMs via role-playing for simulating +counselor-client interactions. Our framework involves two LLMs, one acting as a +client equipped with a specific and real-life user profile and the other +playing the role of an experienced counselor, generating professional responses +using integrative therapy techniques. We implement both the counselor and the +client by zero-shot prompting the GPT-4 model. In order to assess the +effectiveness of LLMs in simulating counselor-client interactions and +understand the disparities between LLM- and human-generated conversations, we +evaluate the synthetic data from various perspectives. We begin by assessing +the client's performance through automatic evaluations. Next, we analyze and +compare the disparities between dialogues generated by the LLM and those +generated by professional counselors. Furthermore, we conduct extensive +experiments to thoroughly examine the performance of our LLM-based counselor +trained with synthetic interactive dialogues by benchmarking against +state-of-the-art models for mental health. + +
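+ The two-LLM role-play loop described above can be pictured with the following minimal sketch; generate() stands in for whatever chat-completion call is used (the abstract mentions zero-shot prompting of GPT-4), and the system prompts are illustrative rather than the authors' actual instructions.
+
+def simulate_session(generate, client_profile, turns=10):
+    """Alternate between a client LLM and a counselor LLM for `turns` exchanges.
+    `generate(system_prompt, history)` is a placeholder for any chat API call."""
+    counselor_sys = ("You are an experienced counselor using integrative "
+                     "therapy techniques. Respond professionally and empathetically.")
+    client_sys = (f"You are a counseling client with this profile: {client_profile}. "
+                  "Describe your concerns naturally, one message at a time.")
+    history = []
+    for _ in range(turns):
+        client_msg = generate(client_sys, history)
+        history.append(("client", client_msg))
+        counselor_msg = generate(counselor_sys, history)
+        history.append(("counselor", counselor_msg))
+    return history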
+
+
+
+
+ + ☆ PDSR: A Privacy-Preserving Diversified Service Recommendation Method on + Distributed Data + + +
+ The last decade has witnessed tremendous growth in service computing, and +efficient service recommendation methods are desired to recommend high-quality +services to users. It is well known that collaborative filtering is one of the +most popular methods for QoS-based service recommendation, and many existing +proposals focus on improving recommendation accuracy, i.e., recommending +high-quality yet redundant services. Nevertheless, users may have different +requirements on QoS, and hence diversified recommendation has been attracting +increasing attention in recent years to fulfill users' diverse demands and to +explore potential services. Unfortunately, recommendation performance +relies on a large volume of data (e.g., QoS data), whereas the data may be +distributed across multiple platforms. Therefore, to enable data sharing across +the different platforms for diversified service recommendation, we propose a +Privacy-preserving Diversified Service Recommendation (PDSR) method. +Specifically, we leverage the Locality-Sensitive Hashing (LSH) mechanism to +enable privacy-preserving data sharing across different platforms and to +construct a service similarity graph. Based on the similarity +graph, we propose a novel accuracy-diversity metric and design a +$2$-approximation algorithm to select $K$ services to recommend by maximizing +the accuracy-diversity measure. Extensive experiments on real datasets are +conducted to verify the efficacy of our PDSR method. + +
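+ As a hedged illustration of how LSH can support privacy-preserving similarity estimation across platforms, the sketch below uses random-hyperplane signatures: each platform hashes its QoS vectors locally and only exchanges the bit signatures. The specific hash family and parameters are assumptions; the abstract does not state which LSH variant PDSR uses.
+
+import numpy as np
+
+def lsh_signatures(qos_matrix, n_bits=32, seed=0):
+    """Hash each service's QoS vector into an n_bits sign pattern using random
+    hyperplanes. Platforms can exchange these signatures instead of raw QoS data."""
+    rng = np.random.default_rng(seed)
+    planes = rng.standard_normal((qos_matrix.shape[1], n_bits))
+    return (qos_matrix @ planes > 0).astype(np.uint8)
+
+def estimated_similarity(sig_a, sig_b):
+    """Fraction of matching bits approximates the angular similarity of the
+    underlying QoS vectors and can weight edges in a service similarity graph."""
+    return float((sig_a == sig_b).mean())
+
+# Both platforms must agree on the same random planes (same seed, same QoS dimension).
+platform1 = lsh_signatures(np.random.rand(5, 8))
+platform2 = lsh_signatures(np.random.rand(5, 8))
+print(estimated_similarity(platform1[0], platform2[0]))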
+
+
+
+
+ + ☆ CAPER: Enhancing Career Trajectory Prediction using Temporal Knowledge + Graph and Ternary Relationship + + +
+ The problem of career trajectory prediction (CTP) aims to predict one's +future employer or job position. While several CTP methods have been developed +for this problem, we posit that none of these methods (1) jointly considers the +mutual ternary dependency between three key units (i.e., user, position, and +company) of a career and (2) captures the characteristic shifts of key units in +career over time, leading to an inaccurate understanding of the job movement +patterns in the labor market. To address the above challenges, we propose a +novel solution, named CAPER, that solves the challenges via sophisticated +temporal knowledge graph (TKG) modeling. It enables the utilization of a +graph-structured knowledge base with rich expressiveness, effectively +preserving the changes in job movement patterns. Furthermore, we devise an +extrapolated career reasoning task on TKG for a realistic evaluation. The +experiments on a real-world career trajectory dataset demonstrate that CAPER +consistently and significantly outperforms four baselines, two recent TKG +reasoning methods, and five state-of-the-art CTP methods in predicting one's +future companies and positions, i.e., on average yielding 6.80% and 34.58% more +accurate predictions, respectively. + +
+
+
+
+
+ + ☆ Lyrically Speaking: Exploring the Link Between Lyrical Emotions, Themes + and Depression Risk + + +
+ Lyrics play a crucial role in affecting and reinforcing emotional states by +providing meaning and emotional connotations that interact with the acoustic +properties of the music. Specific lyrical themes and emotions may intensify +existing negative states in listeners and may lead to undesirable outcomes, +especially in listeners with mood disorders such as depression. Hence, it is +important for such individuals to be mindful of their listening strategies. In +this study, we examine online music consumption of individuals at risk of +depression in light of lyrical themes and emotions. Lyrics obtained from the +listening histories of 541 Last.fm users, divided into At-Risk and No-Risk +based on their mental well-being scores, were analyzed using natural language +processing techniques. Statistical analyses of the results revealed that +individuals at risk for depression prefer songs with lyrics associated with low +valence and low arousal. Additionally, lyrics associated with themes of denial, +self-reference, and ambivalence were preferred. In contrast, themes such as +liberation, familiarity, and activity are not as favored. This study opens up +the possibility of an approach to assessing depression risk from the digital +footprint of individuals and potentially developing personalized recommendation +systems. + +
+
+ comment: Accepted at the 25th International Society for Music Information + Retrieval Conference (ISMIR) 2024, San Francisco, United States +
+
+
+
+
+ + ☆ Efficient $k$-NN Search in IoT Data: Overlap Optimization in Tree-Based + Indexing Structures + + +
+ The proliferation of interconnected devices in the Internet of Things (IoT) +has led to an exponential increase in data, commonly known as Big IoT Data. +Efficient retrieval of this heterogeneous data demands a robust indexing +mechanism for effective organization. However, a significant challenge remains: +the overlap in data space partitions during index construction. This overlap +increases node accesses during search and retrieval, resulting in higher +resource consumption and performance bottlenecks, and it impedes system +scalability. To +address this issue, we propose three innovative heuristics designed to quantify +and strategically reduce data space partition overlap. The volume-based method +(VBM) offers a detailed assessment by calculating the intersection volume +between partitions, providing deeper insights into spatial relationships. The +distance-based method (DBM) enhances efficiency by using the distance between +partition centers and radii to evaluate overlap, offering a streamlined yet +accurate approach. Finally, the object-based method (OBM) provides a practical +solution by counting objects across multiple partitions, delivering an +intuitive understanding of data space dynamics. Experimental results +demonstrate the effectiveness of these methods in reducing search time, +underscoring their potential to improve data space partitioning and enhance +overall system performance. + +
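+ The distance-based and object-based heuristics can be sketched in a few lines for ball-shaped partitions; this toy version (with made-up scoring) is only meant to convey the idea, not the exact formulas in the paper.
+
+import numpy as np
+
+def dbm_overlap(center_a, radius_a, center_b, radius_b):
+    """Distance-based heuristic for two ball-shaped partitions: positive when
+    the balls intersect (center distance below the sum of radii), zero otherwise."""
+    d = np.linalg.norm(np.asarray(center_a) - np.asarray(center_b))
+    return max(0.0, (radius_a + radius_b) - d)
+
+def obm_overlap(partitions):
+    """Object-based heuristic: count objects that fall inside more than one
+    partition. Each partition is a (center, radius, objects) tuple."""
+    count = 0
+    for _, _, objects in partitions:
+        for obj in objects:
+            hits = sum(np.linalg.norm(np.asarray(obj) - np.asarray(c)) <= r
+                       for c, r, _ in partitions)
+            if hits > 1:
+                count += 1
+    return count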
+
+ comment: 28 pages, 21 figures, 1 table +
+
+
+
+
+ + ☆ An Extremely Data-efficient and Generative LLM-based Reinforcement + Learning Agent for Recommenders + + +
+ Recent advancements in large language models (LLMs) have enabled +understanding of webpage contexts, product details, and human instructions. +Utilizing LLMs as the foundational architecture for either reward models or +policies in reinforcement learning has gained popularity -- a notable +achievement is the success of InstructGPT. RL algorithms have been instrumental +in maximizing long-term customer satisfaction and avoiding short-term, myopic +goals in industrial recommender systems, which often rely on deep learning +models to predict immediate clicks or purchases. + In this project, several RL methods are implemented and evaluated using the +WebShop benchmark environment, data, simulator, and pre-trained model +checkpoints. The goal is to train an RL agent to maximize the purchase reward +given a detailed human instruction describing a desired product. The RL agents +are developed by fine-tuning a pre-trained BERT model with various objectives, +learning from preferences without a reward model, and employing contemporary +training techniques such as Proximal Policy Optimization (PPO), as used in +InstructGPT, and Direct Preference Optimization (DPO). This report also +evaluates the RL agents trained using generative trajectories. Evaluations were +conducted using Thompson sampling in the WebShop simulator environment. + The simulated online experiments demonstrate that agents trained on generated +trajectories exhibit task performance comparable to those trained on human +trajectories, demonstrating an extremely low-cost, data-efficient way of +training reinforcement learning agents. Also, with +limited training time (<2 hours), without utilizing any images, a DPO agent +achieved a 19% success rate after approximately 3000 steps or 30 minutes of +training on T4 GPUs, compared to a PPO agent, which reached a 15% success rate. + +
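+ Since the report compares PPO and DPO agents, a short reminder of the DPO objective may help; this is a standard formulation of the DPO loss written as a generic sketch with placeholder tensors, not the project's actual training code.
+
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(policy_chosen_logps, policy_rejected_logps,
+             ref_chosen_logps, ref_rejected_logps, beta=0.1):
+    """Direct Preference Optimization loss on a batch of preference pairs.
+    Inputs are summed log-probabilities of the chosen/rejected trajectories
+    under the policy being trained and a frozen reference model."""
+    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
+    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
+    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
+
+# Toy usage with random log-probs standing in for model outputs.
+batch = torch.randn(4), torch.randn(4), torch.randn(4), torch.randn(4)
+print(dpo_loss(*batch))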
+
+
+
+
+ + ♻ ☆ Contextual Bandit with Herding Effects: Algorithms and Recommendation + Applications PRICAI 2024 + + +
+ Contextual bandits serve as a fundamental algorithmic framework for +optimizing recommendation decisions online. Though extensive attention has been +paid to tailoring contextual bandits for recommendation applications, the +"herding effects" in user feedback have been ignored. These herding effects +bias user feedback toward historical ratings, breaking down the assumption of +unbiased feedback inherent in contextual bandits. This paper develops a novel +variant of the contextual bandit that is tailored to address the feedback bias +caused by the herding effects. A user feedback model is formulated to capture +this feedback bias. We design the TS-Conf (Thompson Sampling under Conformity) +algorithm, which employs posterior sampling to balance the exploration and +exploitation tradeoff. We prove an upper bound for the regret of the algorithm, +revealing the impact of herding effects on learning speed. Extensive +experiments on datasets demonstrate that TS-Conf outperforms four benchmark +algorithms. Analysis reveals that TS-Conf effectively mitigates the negative +impact of herding effects, resulting in faster learning and improved +recommendation accuracy. + +
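+ To give a flavour of posterior sampling in a contextual bandit, here is a toy linear Thompson sampling agent with a placeholder hook where herding-biased feedback could be corrected before the posterior update; the actual TS-Conf feedback model and update are not specified in this abstract, so the de-biasing line is purely illustrative.
+
+import numpy as np
+
+class LinearThompsonSampling:
+    """Toy linear contextual bandit with Gaussian posterior sampling."""
+    def __init__(self, dim, noise=1.0, prior=1.0):
+        self.A = np.eye(dim) / prior      # posterior precision
+        self.b = np.zeros(dim)            # precision-weighted mean
+        self.noise = noise
+
+    def choose(self, contexts):
+        mean = np.linalg.solve(self.A, self.b)
+        theta = np.random.multivariate_normal(mean, np.linalg.inv(self.A))
+        return int(np.argmax(contexts @ theta))   # contexts: (n_arms, dim)
+
+    def update(self, context, reward, historical_rating=None, conformity=0.0):
+        # Placeholder de-biasing: shrink the observed reward back toward an
+        # unbiased estimate when it was nudged by a displayed historical rating.
+        if historical_rating is not None and 0.0 < conformity < 1.0:
+            reward = (reward - conformity * historical_rating) / (1.0 - conformity)
+        self.A += np.outer(context, context) / self.noise
+        self.b += context * reward / self.noise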
+
+ comment: Published as a conference paper at PRICAI 2024 +
+
+
+
+
+ + ♻ ☆ PASH at TREC 2021 Deep Learning Track: Generative Enhanced Model for + Multi-stage Ranking + + +
+ This paper describes the PASH participation in the TREC 2021 Deep Learning Track. +In the recall stage, we adopt a scheme combining sparse and dense retrieval +methods. In the multi-stage ranking phase, point-wise and pair-wise ranking +strategies are used one after another, based on a model continually pre-trained +on general knowledge and document-level data. Compared to the TREC 2020 Deep +Learning Track, we additionally introduce the generative model T5 to further +enhance performance. + +
+
+ comment: TREC 2021 +
+
+
+
+
+ + ♻ ☆ WeKnow-RAG: An Adaptive Approach for Retrieval-Augmented Generation + Integrating Web Search and Knowledge Graphs KDD + + +
+ Large Language Models (LLMs) have greatly contributed to the development of +adaptive intelligent agents and are positioned as an important way to achieve +Artificial General Intelligence (AGI). However, LLMs are prone to produce +factually incorrect information and often produce "phantom" content that +undermines their reliability, which poses a serious challenge for their +deployment in real-world scenarios. Enhancing LLMs by combining external +databases and information retrieval mechanisms is an effective path. To address +the above challenges, we propose a new approach called WeKnow-RAG, which +integrates Web search and Knowledge Graphs into a "Retrieval-Augmented +Generation (RAG)" system. First, the accuracy and reliability of LLM responses +are improved by combining the structured representation of Knowledge Graphs +with the flexibility of dense vector retrieval. WeKnow-RAG then utilizes +domain-specific knowledge graphs to satisfy a variety of queries and domains, +thereby improving performance on factual information and complex reasoning +tasks by employing multi-stage web page retrieval techniques using both sparse +and dense retrieval methods. Our approach effectively balances the efficiency +and accuracy of information retrieval, thus improving the overall retrieval +process. Finally, we also integrate a self-assessment mechanism for the LLM to +evaluate the trustworthiness of the answers it generates. Our approach proves +its outstanding effectiveness in a wide range of offline experiments and online +submissions. + +
+
+ comment: 8 pages, 2 figures, technical report for 3rd place in Task 3 of Meta + KDD Cup 2024 CRAG Challenge +
+
+
+
+
+ + ♻ ☆ From Data Creator to Data Reuser: Distance Matters + + +
+ Sharing research data is necessary, but not sufficient, for data reuse. Open +science policies focus more heavily on data sharing than on reuse, yet both are +complex, labor-intensive, expensive, and require infrastructure investments by +multiple stakeholders. The value of data reuse lies in relationships between +creators and reusers. By addressing knowledge exchange, rather than mere +transactions between stakeholders, investments in data management and knowledge +infrastructures can be made more wisely. Drawing upon empirical studies of data +sharing and reuse, we develop the theoretical construct of distance between +data creator and data reuser, identifying six distance dimensions that +influence the ability to transfer knowledge effectively: domain, methods, +collaboration, curation, purposes, and time and temporality. We address the +social and socio-technical aspects of these dimensions, exploring ways in which +they may decrease -- or increase -- distances between creators and reusers. Our +theoretical framing of the distance between data creators and prospective +reusers leads to recommendations to four categories of stakeholders on how to +make data sharing and reuse more effective: data creators, data reusers, data +archivists, and funding agencies. 'It takes a village' to share research data +-- and a village to reuse data. Our aim is to provoke new research questions, +new research, and new investments in effective and efficient circulation of +research data; and to identify criteria for investments at each stage of data +and research life cycles. + +
+
+ comment: 74 pages, double-spaced, consisting of Table of Contents, Abstract, + 45 page narrative, 1 box, 1 figure, 1 table, 27 pages references. Original + work +
+
+
+
+
+
+
+
+ + Machine Learning 119 + +
+
+
+ + ☆ Q-MRS: A Deep Learning Framework for Quantitative Magnetic Resonance + Spectra Analysis + + +
+ Magnetic resonance spectroscopy (MRS) is an established technique for +studying tissue metabolism, particularly in central nervous system disorders. +While powerful and versatile, MRS is often limited by challenges associated +with data quality, processing, and quantification. Existing MRS quantification +methods face difficulties in balancing model complexity and reproducibility +during spectral modeling, often falling into the trap of either +oversimplification or over-parameterization. To address these limitations, this +study introduces a deep learning (DL) framework that employs transfer learning, +in which the model is pre-trained on simulated datasets before it undergoes +fine-tuning on in vivo data. The proposed framework showed promising +performance when applied to the Philips dataset from the BIG GABA repository +and represents an exciting advancement in MRS data analysis. + +
+
+ comment: 8 pages, 4 figures, and 3 tables for the main body; 9 pages, 4 + figures, and 3 tables for the supplementary material +
+
+
+
+
+ + ☆ Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of + Encoders + + +
+ The ability to accurately interpret complex visual information is a crucial +topic of multimodal large language models (MLLMs). Recent work indicates that +enhanced visual perception significantly reduces hallucinations and improves +performance on resolution-sensitive tasks, such as optical character +recognition and document analysis. A number of recent MLLMs achieve this goal +using a mixture of vision encoders. Despite their success, there is a lack of +systematic comparisons and detailed ablation studies addressing critical +aspects, such as expert selection and the integration of multiple vision +experts. This study provides an extensive exploration of the design space for +MLLMs using a mixture of vision encoders and resolutions. Our findings reveal +several underlying principles common to various existing strategies, leading to +a streamlined yet effective design approach. We discover that simply +concatenating visual tokens from a set of complementary vision encoders is as +effective as more complex mixing architectures or strategies. We additionally +introduce Pre-Alignment to bridge the gap between vision-focused encoders and +language tokens, enhancing model coherence. The resulting family of MLLMs, +Eagle, surpasses other leading open-source models on major MLLM benchmarks. +Models and code: https://github.com/NVlabs/Eagle + +
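+ The "simply concatenating visual tokens" finding can be pictured with a small sketch: tokens from complementary encoders are joined along the channel dimension and projected into the language model's embedding space. The dimensions and the assumption that all encoders emit the same number of tokens are illustrative, not taken from the paper.
+
+import torch
+import torch.nn as nn
+
+class ConcatVisionMixer(nn.Module):
+    """Fuse tokens from several vision encoders by channel-wise concatenation,
+    then project to the language model's embedding width. Assumes each encoder
+    returns (batch, num_tokens, dim) with the same num_tokens."""
+    def __init__(self, encoder_dims, llm_dim):
+        super().__init__()
+        self.proj = nn.Linear(sum(encoder_dims), llm_dim)
+
+    def forward(self, token_lists):
+        fused = torch.cat(token_lists, dim=-1)   # (B, N, sum of encoder dims)
+        return self.proj(fused)                  # (B, N, llm_dim)
+
+# Toy usage with two fake encoders producing 196 tokens each.
+mixer = ConcatVisionMixer([768, 1024], llm_dim=4096)
+tokens = [torch.randn(2, 196, 768), torch.randn(2, 196, 1024)]
+print(mixer(tokens).shape)   # torch.Size([2, 196, 4096])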
+
+ comment: Github: https://github.com/NVlabs/Eagle, HuggingFace: + https://huggingface.co/NVEagle +
+
+
+
+
+ + ☆ Mamba or Transformer for Time Series Forecasting? Mixture of Universals + (MoU) Is All You Need + + +
+ Time series forecasting requires balancing short-term and long-term +dependencies for accurate predictions. Existing methods mainly focus on +long-term dependency modeling, neglecting the complexities of short-term +dynamics, which may hinder performance. Transformers are superior in modeling +long-term dependencies but are criticized for their quadratic computational +cost. Mamba provides a near-linear alternative but is reported to be less +effective in long-term time series forecasting due to potential information +loss. Current +architectures fall short in offering both high efficiency and strong +performance for long-term dependency modeling. To address these challenges, we +introduce Mixture of Universals (MoU), a versatile model to capture both +short-term and long-term dependencies for enhancing performance in time series +forecasting. MoU is composed of two novel designs: Mixture of Feature +Extractors (MoF), an adaptive method designed to improve time series patch +representations for short-term dependency, and Mixture of Architectures (MoA), +which hierarchically integrates Mamba, FeedForward, Convolution, and +Self-Attention architectures in a specialized order to model long-term +dependency from a hybrid perspective. The proposed approach achieves +state-of-the-art performance while maintaining relatively low computational +costs. Extensive experiments on seven real-world datasets demonstrate the +superiority of MoU. Code is available at https://github.com/lunaaa95/mou/. + +
+
+ comment: Code at https://github.com/lunaaa95/mou/ +
+
+
+
+
+ + ☆ ClimDetect: A Benchmark Dataset for Climate Change Detection and + Attribution + + +
+ Detecting and attributing temperature increases due to climate change is +crucial for understanding global warming and guiding adaptation strategies. The +complexity of distinguishing human-induced climate signals from natural +variability has challenged traditional detection and attribution (D&A) +approaches, which seek to identify specific "fingerprints" in climate response +variables. Deep learning offers potential for discerning these complex patterns +in expansive spatial datasets. However, a lack of standard protocols has +hindered consistent comparisons across studies. We introduce ClimDetect, a +standardized dataset of over 816k daily climate snapshots, designed to enhance +model accuracy in identifying climate change signals. ClimDetect integrates +various input and target variables used in past research, ensuring +comparability and consistency. We also explore the application of vision +transformers (ViT) to climate data, a novel and modernizing approach in this +context. Our open-access data and code serve as a benchmark for advancing +climate science through improved model evaluations. ClimDetect is publicly +accessible via the Hugging Face dataset repository at: +https://huggingface.co/datasets/ClimDetect/ClimDetect. + +
+
+
+
+
+ + ☆ CoGen: Learning from Feedback with Coupled Comprehension and Generation + + +
+ Systems with both language comprehension and generation capabilities can +benefit from the tight connection between the two. This work studies coupling +comprehension and generation with focus on continually learning from +interaction with users. We propose techniques to tightly integrate the two +capabilities for both learning and inference. We situate our studies in +two-player reference games, and deploy various models for thousands of +interactions with human users, while learning from interaction feedback +signals. We show dramatic improvements in performance over time, with +comprehension-generation coupling leading to performance improvements up to 26% +in absolute terms and up to 17% higher accuracies compared to a non-coupled +system. Our analysis also shows coupling has substantial qualitative impact on +the system's language, making it significantly more human-like. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ Stability of Primal-Dual Gradient Flow Dynamics for Multi-Block Convex + Optimization Problems + + +
+ We examine stability properties of primal-dual gradient flow dynamics for +composite convex optimization problems with multiple, possibly nonsmooth, terms +in the objective function under the generalized consensus constraint. The +proposed dynamics are based on the proximal augmented Lagrangian and they +provide a viable alternative to ADMM which faces significant challenges from +both analysis and implementation viewpoints in large-scale multi-block +scenarios. In contrast to customized algorithms with individualized convergence +guarantees, we provide a systematic approach for solving a broad class of +challenging composite optimization problems. We leverage various structural +properties to establish global (exponential) convergence guarantees for the +proposed dynamics. Our assumptions are much weaker than those required to prove +(exponential) stability of various primal-dual dynamics as well as (linear) +convergence of discrete-time methods, e.g., standard two-block and multi-block +ADMM and EXTRA algorithms. Finally, we show necessity of some of our structural +assumptions for exponential stability and provide computational experiments to +demonstrate the convenience of the proposed dynamics for parallel and +distributed computing applications. + +
+
+ comment: 31 pages; 4 figures +
+
+
+
+
+ + ☆ Efficient Slice Anomaly Detection Network for 3D Brain MRI Volume + + +
+ Current anomaly detection methods excel with benchmark industrial data but +struggle with natural images and medical data due to varying definitions of +'normal' and 'abnormal.' This makes accurate identification of deviations in +these fields particularly challenging. Especially for 3D brain MRI data, all +the state-of-the-art models are reconstruction-based with 3D convolutional +neural networks, which are memory-intensive and time-consuming and produce +noisy outputs that require further post-processing. We propose a framework +called Simple Slice-based Network (SimpleSliceNet), which utilizes a model +pre-trained on ImageNet and fine-tuned on a separate MRI dataset as a 2D slice +feature extractor to reduce computational cost. We aggregate the extracted +features to perform anomaly detection tasks on 3D brain MRI volumes. Our model +integrates a conditional normalizing flow to calculate the log likelihood of +features and employs the Semi-Push-Pull Mechanism to enhance anomaly detection +accuracy. The results indicate improved performance, showcasing our model's +remarkable adaptability and effectiveness when addressing the challenges that +exist in brain MRI data. In addition, for large-scale 3D brain volumes, our +model SimpleSliceNet outperforms the state-of-the-art 2D and 3D models in terms +of accuracy, memory usage and time consumption. Code is available at: +https://anonymous.4open.science/r/SimpleSliceNet-8EA3. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Generating Binary Species Range Maps + + +
+ Accurately predicting the geographic ranges of species is crucial for +assisting conservation efforts. Traditionally, range maps were manually created +by experts. However, species distribution models (SDMs) and, more recently, +deep learning-based variants offer a potential automated alternative. Deep +learning-based SDMs generate a continuous probability representing the +predicted presence of a species at a given location, which must be binarized by +setting per-species thresholds to obtain binary range maps. However, selecting +appropriate per-species thresholds to binarize these predictions is non-trivial +as different species can require distinct thresholds. In this work, we evaluate +different approaches for automatically identifying the best thresholds for +binarizing range maps using presence-only data. This includes approaches that +require the generation of additional pseudo-absence data, along with ones that +only require presence data. We also propose an extension of an existing +presence-only technique that is more robust to outliers. We perform a detailed +evaluation of different thresholding techniques on the tasks of binary range +estimation and large-scale fine-grained visual classification, and we +demonstrate improved performance over existing pseudo-absence free approaches +using our method. + +
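+ One simple presence-only thresholding strategy, shown below as a hedged sketch, picks a per-species threshold as a low quantile of the predicted scores at known presence locations, which tolerates a few outlying presences; this illustrates the general idea rather than the specific extension proposed in the paper.
+
+import numpy as np
+
+def presence_only_threshold(presence_scores, quantile=0.05):
+    """Per-species threshold: a low quantile of model scores at presence
+    locations, so a small fraction of outlying presences is ignored."""
+    return float(np.quantile(presence_scores, quantile))
+
+def binarize_range_map(score_grid, threshold):
+    """Turn a continuous suitability grid into a binary presence/absence map."""
+    return (score_grid >= threshold).astype(np.uint8)
+
+# Toy usage with synthetic predictions.
+scores_at_presences = np.random.beta(5, 2, size=200)
+thr = presence_only_threshold(scores_at_presences)
+range_map = binarize_range_map(np.random.rand(64, 64), thr)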
+
+
+
+
+ + ☆ Modeling and Analyzing the Influence of Non-Item Pages on Sequential + Next-Item Prediction + + +
+ Analyzing the sequence of historical interactions between users and items, +sequential recommendation models learn user intent and make predictions about +the next item of interest. Next to these item interactions, most systems also +have interactions with pages not related to specific items, for example +navigation pages, account pages, and pages for a specific category, which may +provide additional insights into the user's interests. However, while there are +several approaches to integrate additional information about items and users, +the topic of integrating non-item pages has been less explored. We use the +hypothesis testing framework HypTrails to show that there is indeed a +relationship between these non-item pages and the items of interest and fill +this gap by proposing various approaches for representing non-item pages (e.g., +based on their content) to use them as an additional information source for the +task of sequential next-item prediction. + We create a synthetic dataset with non-item pages highly related to the +subsequent item to show that the models are generally capable of learning from +these interactions, and subsequently evaluate the improvements gained by +including non-item pages in two real-world datasets. + We adapt eight popular sequential recommender models, covering CNN-, RNN- and +transformer-based architectures, to integrate non-item pages and investigate +the capabilities of these models to leverage their information for next item +prediction. We also analyze their behavior on noisy data and compare different +item representation strategies. + Our results show that non-item pages are a valuable source of information, +but representing such a page well is the key to successfully leveraging them. +The inclusion of non-item pages increases next-item prediction performance in +all examined model architectures, though to varying degrees. + +
+
+ comment: 36 pages, 19 figures; Work in Progress +
+
+
+
+
+ + ☆ Sigma Flows for Image and Data Labeling and Learning Structured + Prediction + + +
+ This paper introduces the sigma flow model for the prediction of structured +labelings of data observed on Riemannian manifolds, including Euclidean image +domains as special case. The approach combines the Laplace-Beltrami framework +for image denoising and enhancement, introduced by Sochen, Kimmel and Malladi +about 25 years ago, and the assignment flow approach introduced and studied by +the authors. + The sigma flow arises as Riemannian gradient flow of generalized harmonic +energies and thus is governed by a nonlinear geometric PDE which determines a +harmonic map from a closed Riemannian domain manifold to a statistical +manifold, equipped with the Fisher-Rao metric from information geometry. A +specific ingredient of the sigma flow is the mutual dependency of the +Riemannian metric of the domain manifold on the evolving state. This makes the +approach amenable to machine learning in a specific way, by realizing this +dependency through a mapping with compact time-variant parametrization that can +be learned from data. Proof of concept experiments demonstrate the expressivity +of the sigma flow model and prediction performance. + Structural similarities to transformer network architectures and networks +generated by the geometric integration of sigma flows are pointed out, which +highlights the connection to deep learning and, conversely, may stimulate the +use of geometric design principles for structured prediction in other areas of +scientific machine learning. + +
+
+ comment: 51 pages +
+
+
+
+
+ + ☆ Generalized Naive Bayes + + +
+ In this paper we introduce the so-called Generalized Naive Bayes structure as +an extension of the Naive Bayes structure. We give a new greedy algorithm that +finds a good-fitting Generalized Naive Bayes (GNB) probability distribution. We +prove that this fits the data at least as well as the probability distribution +determined by the classical Naive Bayes (NB). Then, under a not very +restrictive condition, we give a second algorithm for which we can prove that +it finds the optimal GNB probability distribution, i.e., the best-fitting +structure in the sense of KL divergence. Both algorithms are constructed to +maximize the information content and aim to minimize redundancy. Based on these +algorithms, new methods for feature selection are introduced. We discuss the +similarities and differences to other related algorithms in terms of structure, +methodology, and complexity. Experimental results show that the algorithms +introduced outperform the related algorithms in many cases. + +
+
+ comment: 44 pages, 19 figures +
+
+
+
+
+ + ☆ Multi-modal Adversarial Training for Zero-Shot Voice Cloning INTERSPEECH 2024 + + +
+ A text-to-speech (TTS) model trained to reconstruct speech given text tends +towards predictions that are close to the average characteristics of a dataset, +failing to model the variations that make human speech sound natural. This +problem is magnified for zero-shot voice cloning, a task that requires training +data with high variance in speaking styles. We build on recent works that +have used Generative Adversarial Networks (GANs) by proposing a Transformer +encoder-decoder architecture that conditionally discriminates between real and +generated speech features. The discriminator is used in a training pipeline +that improves both the acoustic and prosodic features of a TTS model. We +introduce our novel adversarial training technique by applying it to a +FastSpeech2 acoustic model and training on Libriheavy, a large multi-speaker +dataset, for the task of zero-shot voice cloning. Our model achieves +improvements over the baseline in terms of speech quality and speaker +similarity. Audio examples from our system are available online. + +
+
+ comment: Accepted at INTERSPEECH 2024 +
+
+
+
+
+ + ☆ MetaGFN: Exploring Distant Modes with Adapted Metadynamics for + Continuous GFlowNets + + +
+ Generative Flow Networks (GFlowNets) are a class of generative models that +sample objects in proportion to a specified reward function through a learned +policy. They can be trained either on-policy or off-policy, needing a balance +between exploration and exploitation for fast convergence to a target +distribution. While exploration strategies for discrete GFlowNets have been +studied, exploration in the continuous case remains to be investigated, despite +the potential for novel exploration algorithms due to the local connectedness +of continuous domains. Here, we introduce Adapted Metadynamics, a variant of +metadynamics that can be applied to arbitrary black-box reward functions on +continuous domains. We use Adapted Metadynamics as an exploration strategy for +continuous GFlowNets. We show three continuous domains where the resulting +algorithm, MetaGFN, accelerates convergence to the target distribution and +discovers more distant reward modes than previous off-policy exploration +strategies used for GFlowNets. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Nexus: Specialization meets Adaptability for Efficiently Training + Mixture of Experts + + +
+ Efficiency, specialization, and adaptability to new data distributions are +qualities that are hard to combine in current Large Language Models. The +Mixture of Experts (MoE) architecture has been the focus of significant +research because its inherent conditional computation enables such desirable +properties. In this work, we focus on "upcycling" dense expert models into an +MoE, aiming to improve specialization while also adding the ability to adapt to +new tasks easily. We introduce Nexus, an enhanced MoE architecture with +adaptive routing where the model learns to project expert embeddings from +domain representations. This approach allows Nexus to flexibly add new experts +after the initial upcycling through separately trained dense models, without +requiring large-scale MoE training for unseen data domains. Our experiments +show that Nexus achieves a relative gain of up to 2.1% over the baseline for +initial upcycling, and an 18.8% relative gain for extending the MoE with a new +expert by using limited finetuning data. This flexibility of Nexus is crucial +for enabling an open-source ecosystem where every user continuously assembles +their own MoE-mix according to their needs. + +
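+ A minimal sketch of the adaptive-routing idea, under the assumption that routing scores come from comparing token states with expert embeddings projected from per-expert domain vectors (so a new expert can be attached by supplying its domain vector); the layer sizes and top-1 gating shown here are illustrative, not the paper's configuration.
+
+import torch
+import torch.nn as nn
+
+class DomainProjectedRouter(nn.Module):
+    """Route tokens to experts by comparing token states with expert embeddings
+    projected from domain representations; appending a row to `domains` adds a
+    new expert without retraining the router."""
+    def __init__(self, hidden_dim, domain_dim, domain_vectors):
+        super().__init__()
+        self.project = nn.Linear(domain_dim, hidden_dim)
+        self.register_buffer("domains", domain_vectors)   # (num_experts, domain_dim)
+
+    def forward(self, hidden_states):
+        expert_emb = self.project(self.domains)            # (E, H)
+        logits = hidden_states @ expert_emb.t()             # (B, T, E)
+        weights = logits.softmax(dim=-1)
+        return weights.argmax(dim=-1), weights              # chosen expert, gates
+
+router = DomainProjectedRouter(hidden_dim=512, domain_dim=64,
+                               domain_vectors=torch.randn(4, 64))
+chosen, gates = router(torch.randn(2, 10, 512))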
+
+
+
+
+ + ☆ Airfoil Diffusion: Denoising Diffusion Model For Conditional Airfoil + Generation + + +
+ The design of aerodynamic shapes, such as airfoils, has traditionally +required significant computational resources and relied on predefined design +parameters, which limit the potential for novel shape synthesis. In this work, +we introduce a data-driven methodology for airfoil generation using a diffusion +model. Trained on a dataset of preexisting airfoils, our model can generate an +arbitrary number of new airfoils from random vectors, which can be conditioned +on specific aerodynamic performance metrics such as lift and drag, or geometric +criteria. Our results demonstrate that the diffusion model effectively produces +airfoil shapes with realistic aerodynamic properties, offering substantial +improvements in efficiency, flexibility, and the potential for discovering +innovative airfoil designs. This approach significantly expands the design +space, facilitating the synthesis of high-performance aerodynamic shapes that +transcend the limitations of traditional methods. + +
+
+ comment: 12 Pages, 6 figures +
+
+
+
+
+ + ☆ A New Method for Cross-Lingual-based Semantic Role Labeling + + +
+ Semantic role labeling is a crucial task in natural language processing, +enabling better comprehension of natural language. However, the lack of +annotated data in multiple languages has posed a challenge for researchers. To +address this, a deep learning algorithm based on model transfer has been +proposed. The algorithm utilizes a dataset consisting of the English portion of +CoNLL2009 and a corpus of semantic roles in Persian. To optimize the efficiency +of training, only ten percent of the training data from each language is +used. The results of the proposed model demonstrate significant improvements +compared to Niksirt et al.'s model. In monolingual mode, the proposed model +achieved a 2.05 percent improvement in F1-score, while in cross-lingual mode, +the improvement was even more substantial, reaching 6.23 percent. It is worth +noting that the compared model only trained two of the four stages of semantic +role labeling and employed gold data for the remaining two stages. This +suggests that the actual superiority of the proposed model surpasses the +reported numbers by a significant margin. The development of cross-lingual +methods for semantic role labeling holds promise, particularly in addressing +the scarcity of annotated data for various languages. These advancements pave +the way for further research in understanding and processing natural language +across different linguistic contexts. + +
+
+
+
+
+ + ☆ Bias in LLMs as Annotators: The Effect of Party Cues on Labelling + Decision by Large Language Models + + +
+ Human coders are biased. We test similar biases in Large Language Models +(LLMs) as annotators. By replicating an experiment run by Ennser-Jedenastik and +Meyer (2018), we find evidence that LLMs use political information, and +specifically party cues, to judge political statements. Not only do LLMs use +relevant information to contextualize whether a statement is positive, +negative, or neutral based on the party cue, they also reflect the biases of +the human-generated data upon which they have been trained. We also find that +unlike humans, who are only biased when faced with statements from extreme +parties, LLMs exhibit significant bias even when prompted with statements from +center-left and center-right parties. The implications of our findings are +discussed in the conclusion. + +
+
+
+
+
+ + ☆ The Role of Fibration Symmetries in Geometric Deep Learning + + +
+ Geometric Deep Learning (GDL) unifies a broad class of machine learning +techniques from the perspectives of symmetries, offering a framework for +introducing problem-specific inductive biases like Graph Neural Networks +(GNNs). However, the current formulation of GDL is limited to global symmetries +that are not often found in real-world problems. We propose to relax GDL to +allow for local symmetries, specifically fibration symmetries in graphs, to +leverage regularities of realistic instances. We show that GNNs apply the +inductive bias of fibration symmetries and derive a tighter upper bound for +their expressive power. Additionally, by identifying symmetries in networks, we +collapse network nodes, thereby increasing their computational efficiency +during both inference and training of deep neural networks. The mathematical +extension introduced here applies beyond graphs to manifolds, bundles, and +grids for the development of models with inductive biases induced by local +symmetries that can lead to better generalization. + +
+
+
+
+
+ + ☆ Robust Statistical Scaling of Outlier Scores: Improving the Quality of + Outlier Probabilities for Outliers (Extended Version) + + +
+ Outlier detection algorithms typically assign an outlier score to each +observation in a dataset, indicating the degree to which an observation is an +outlier. However, these scores are often not comparable across algorithms and +can be difficult for humans to interpret. Statistical scaling addresses this +problem by transforming outlier scores into outlier probabilities without using +ground-truth labels, thereby improving interpretability and comparability +across algorithms. However, the quality of this transformation can be different +for outliers and inliers. Missing outliers in scenarios where they are of +particular interest - such as healthcare, finance, or engineering - can be +costly or dangerous. Thus, ensuring good probabilities for outliers is +essential. This paper argues that statistical scaling, as commonly used in the +literature, does not produce equally good probabilities for outliers as for +inliers. Therefore, we propose robust statistical scaling, which uses robust +estimators to improve the probabilities for outliers. We evaluate several +variants of our method against other outlier score transformations for +real-world datasets and outlier detection algorithms, where it can improve the +probabilities for outliers. + +
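+ The contrast between standard and robust statistical scaling can be sketched directly: both map raw outlier scores to probabilities through a Gaussian CDF, but the robust variant estimates location and scale with the median and MAD so extreme outliers do not inflate the scale estimate and depress their own probabilities. The median/MAD pair used here is one natural robust choice, not necessarily the exact estimator evaluated in the paper.
+
+import numpy as np
+from scipy.stats import norm
+
+def statistical_scaling(scores):
+    """Standard statistical scaling: Gaussian CDF of z-scores (mean/std)."""
+    mu, sigma = scores.mean(), scores.std()
+    return norm.cdf((scores - mu) / (sigma + 1e-12))
+
+def robust_statistical_scaling(scores):
+    """Same idea with median and MAD, so a few extreme scores do not
+    dominate the location and scale estimates."""
+    med = np.median(scores)
+    mad = 1.4826 * np.median(np.abs(scores - med))  # consistent with std under normality
+    return norm.cdf((scores - med) / (mad + 1e-12))
+
+scores = np.concatenate([np.random.normal(0, 1, 1000), [8.0, 9.0, 12.0]])
+print(statistical_scaling(scores)[-3:], robust_statistical_scaling(scores)[-3:])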
+
+ comment: 15 pages, 4 figures, accepted for publication in SISAP 2024 +
+
+
+
+
+ + ☆ Retrieval-Augmented Instruction Tuning for Automated Process Engineering + Calculations : A Tool-Chaining Problem-Solving Framework with Attributable + Reflection ECML + + +
+ The current technology landscape lacks a foundational AI model for solving +process engineering calculations. In this work, we introduce a novel autonomous +agent framework leveraging Retrieval-Augmented Instruction-Tuning (RAIT) to +enhance open, customizable small code language models (SLMs) for these +calculations. By combining instruction tuned code SLMs with Retrieval-Augmented +Code Generation (RACG) using external tools, the agent generates, debugs, and +optimizes code from natural language specifications. Our approach addresses the +limitations of the current lack of a foundational AI model for specialized +process engineering tasks and offers benefits of explainability, knowledge +editing, and cost-effectiveness. Additionally, we curate custom datasets of +chemical and process engineering problems and solutions to overcome data +scarcity. Experimental results show that our framework matches the performance +of large-scale proprietary models on benchmark datasets, proving its +effectiveness and usability. + +
+
+ comment: Accepted for publication at ML4CCE workshop at ECML PKDD 2024. Please + find the link: https://ml4cce-ecml.com/#agenda +
+
+
+
+
+ + ☆ microYOLO: Towards Single-Shot Object Detection on Microcontrollers ECML + + +
+ This work-in-progress paper presents results on the feasibility of +single-shot object detection on microcontrollers using YOLO. Single-shot object +detectors like YOLO are widely used; however, due to their complexity, they +mainly run on larger GPU-based platforms. We present microYOLO, which can be +used on Cortex-M based microcontrollers, such as the OpenMV H7 R2, achieving +about 3.5 FPS when classifying 128x128 RGB images while using less than 800 KB +Flash and less than 350 KB RAM. Furthermore, we share experimental results for +three different object detection tasks, analyzing the accuracy of microYOLO on +them. + +
+
+ comment: Published at the ECML PKDD Conference 2023, at the 4th Workshop on + IoT, Edge, and Mobile for Embedded Machine Learning +
+
+
+
+
+ + ☆ Fusing Pruned and Backdoored Models: Optimal Transport-based Data-free + Backdoor Mitigation + + +
+ Backdoor attacks present a serious security threat to deep neural networks +(DNNs). Although numerous effective defense techniques have been proposed in +recent years, they inevitably rely on the availability of either clean or +poisoned data. In contrast, data-free defense techniques have evolved slowly +and still lag significantly in performance. To address this issue, different +from the traditional approach of pruning followed by fine-tuning, we propose a +novel data-free defense method named Optimal Transport-based Backdoor Repairing +(OTBR) in this work. This method, based on our findings on neuron weight +changes (NWCs) of random unlearning, uses optimal transport (OT)-based model +fusion to combine the advantages of both pruned and backdoored models. +Specifically, we first demonstrate our findings that the NWCs of random +unlearning are positively correlated with those of poison unlearning. Based on +this observation, we propose a random-unlearning NWC pruning technique to +eliminate the backdoor effect and obtain a backdoor-free pruned model. Then, +motivated by the OT-based model fusion, we propose the pruned-to-backdoored +OT-based fusion technique, which fuses pruned and backdoored models to combine +the advantages of both, resulting in a model that demonstrates high clean +accuracy and a low attack success rate. To our knowledge, this is the first +work to apply OT and model fusion techniques to backdoor defense. Extensive +experiments show that our method successfully defends against all seven +backdoor attacks across three benchmark datasets, outperforming both +state-of-the-art (SOTA) data-free and data-dependent methods. The code +implementation and Appendix are provided in the Supplementary Material. + +
+
+
+
+
+ + ☆ chemtrain: Learning Deep Potential Models via Automatic Differentiation + and Statistical Physics + + +
+ Neural Networks (NNs) are promising models for refining the accuracy of +molecular dynamics, potentially opening up new fields of application. Typically +trained bottom-up, atomistic NN potential models can reach first-principle +accuracy, while coarse-grained implicit solvent NN potentials surpass classical +continuum solvent models. However, overcoming the limitations of costly +generation of accurate reference data and data inefficiency of common bottom-up +training demands efficient incorporation of data from many sources. This paper +introduces the framework chemtrain to learn sophisticated NN potential models +through customizable training routines and advanced training algorithms. These +routines can combine multiple top-down and bottom-up algorithms, e.g., to +incorporate both experimental and simulation data or pre-train potentials with +less costly algorithms. chemtrain provides an object-oriented high-level +interface to simplify the creation of custom routines. On the lower level, +chemtrain relies on JAX to compute gradients and scale the computations to use +available resources. We demonstrate the simplicity and importance of combining +multiple algorithms in the examples of parametrizing an all-atomistic model of +titanium and a coarse-grained implicit solvent model of alanine dipeptide. + +
+
+ comment: Package source code published at http://github.com/tummfm/chemtrain +
+
+
+
+
+ + ☆ Automatic Differential Diagnosis using Transformer-Based Multi-Label + Sequence Classification + + +
+ As the field of artificial intelligence progresses, assistive technologies +are becoming more widely used across all industries. The healthcare industry is +no different, with numerous studies being done to develop assistive tools for +healthcare professionals. Automatic diagnostic systems are one such beneficial +tool that can assist with a variety of tasks, including collecting patient +information, analyzing test results, and diagnosing patients. However, the idea +of developing systems that can provide a differential diagnosis has been +largely overlooked in most of these research studies. In this study, we propose +a transformer-based approach for providing differential diagnoses based on a +patient's age, sex, medical history, and symptoms. We use the DDXPlus dataset, +which provides differential diagnosis information for patients based on 49 +disease types. Firstly, we propose a method to process the tabular patient data +from the dataset and engineer them into patient reports to make them suitable +for our research. In addition, we introduce two data modification modules to +diversify the training data and consequently improve the robustness of the +models. We approach the task as a multi-label classification problem and +conduct extensive experiments using four transformer models. All the models +displayed promising results by achieving over 97% F1 score on the held-out test +set. Moreover, we design additional behavioral tests to get a broader +understanding of the models. In particular, for one of our test cases, we +prepared a custom test set of 100 samples with the assistance of a doctor. The +results on the custom set showed that our proposed data modification modules +improved the model's generalization capabilities. We hope our findings will +provide future researchers with valuable insights and inspire them to develop +reliable systems for automatic differential diagnosis. + +
+
+ comment: 25 pages, 7 figures +
+
+
+
+
+ + ☆ Automated Mixture Analysis via Structural Evaluation + + +
+ The determination of chemical mixture components is vital to a multitude of +scientific fields. Oftentimes spectroscopic methods are employed to decipher +the composition of these mixtures. However, the sheer density of spectral +features present in spectroscopic databases can make unambiguous assignment to +individual species challenging. Yet, components of a mixture are commonly +chemically related due to environmental processes or shared precursor +molecules. Therefore, analysis of the chemical relevance of a molecule is +important when determining which species are present in a mixture. In this +paper, we combine machine-learning molecular embedding methods with a +graph-based ranking system to determine the likelihood of a molecule being +present in a mixture based on the other known species and/or chemical priors. +By incorporating this metric in a rotational spectroscopy mixture analysis +algorithm, we demonstrate that the mixture components can be identified with +extremely high accuracy (>97%) in an efficient manner. + +
+
+ comment: Accepted for publication in The Journal of Physical Chemistry A +
+
+
+
+
+ + ☆ Language Adaptation on a Tight Academic Compute Budget: Tokenizer + Swapping Works and Pure bfloat16 Is Enough ICML 2024 + + +
+ We investigate continued pretraining of LLMs for language adaptation on a +tight academic budget: a setting in which only a few GPUs can be used in +parallel, for a heavily constrained duration. We focus on adapting Mistral-7B +to German or Arabic and evaluate several techniques to improve efficiency and +effectiveness in this setting. Our German models adapted on this tight compute +budget underperform compared to the base Mistral-7B, while our Arabic models +outperform several baselines, showing that for sufficiently well-represented +languages, continued pretraining for specialization is not always helpful. Our +main findings focus on training precision and tokenizer swapping. Our results +show that pure bfloat16 training is a viable alternative to mixed-precision +training, while being much faster when only using a few GPUs. Swapping the +tokenizer for a specialized one yields more efficient tokenization and is +competitive with the original tokenizer, which already contains some German +tokens, but did not significantly increase performance for German. Code and +model weights are available on GitHub. + +
+
+ comment: WANT@ICML 2024 +
+
+
+
+
+ + ☆ Efficient LLM Scheduling by Learning to Rank + + +
+ In Large Language Model (LLM) inference, the output length of an LLM request +is typically regarded as not known a priori. Consequently, most LLM serving +systems employ a simple First-come-first-serve (FCFS) scheduling strategy, +leading to Head-Of-Line (HOL) blocking and reduced throughput and service +quality. In this paper, we reexamine this assumption -- we show that, although +predicting the exact generation length of each request is infeasible, it is +possible to predict the relative ranks of output lengths in a batch of +requests, using learning to rank. The ranking information offers valuable +guidance for scheduling requests. Building on this insight, we develop a novel +scheduler for LLM inference and serving that can approximate the +shortest-job-first (SJF) schedule better than existing approaches. We integrate +this scheduler with the state-of-the-art LLM serving system and show +significant performance improvement in several important applications: 2.8x +lower latency in chatbot serving and 6.5x higher throughput in synthetic data +generation. Our code is available at https://github.com/hao-ai-lab/vllm-ltr.git + +
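+ As a rough illustration of how such rank predictions could drive scheduling (a minimal sketch under assumed interfaces, not the vllm-ltr implementation; the Request fields and RankScheduler class are hypothetical):
+ from dataclasses import dataclass, field
+ import heapq, itertools
+ 
+ @dataclass(order=True)
+ class Request:
+     predicted_rank: float                     # lower = expected to finish sooner (ranking-model output)
+     req_id: int = field(compare=False)
+     prompt: str = field(compare=False)
+ 
+ class RankScheduler:
+     """Approximate shortest-job-first by serving the lowest predicted rank first."""
+     def __init__(self):
+         self._heap = []
+         self._ids = itertools.count()
+ 
+     def submit(self, prompt, predicted_rank):
+         heapq.heappush(self._heap, Request(predicted_rank, next(self._ids), prompt))
+ 
+     def next_batch(self, size):
+         # FCFS would pop in arrival order instead; popping by predicted rank cuts head-of-line blocking.
+         return [heapq.heappop(self._heap) for _ in range(min(size, len(self._heap)))]
+ Serving the lowest predicted rank first is what approximates shortest-job-first without ever knowing exact output lengths.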
+
+
+
+
+ + ☆ Implicit Regularization Paths of Weighted Neural Representations + + +
+ We study the implicit regularization effects induced by (observation) +weighting of pretrained features. For weight and feature matrices of bounded +operator norms that are infinitesimally free with respect to (normalized) trace +functionals, we derive equivalence paths connecting different weighting +matrices and ridge regularization levels. Specifically, we show that ridge +estimators trained on weighted features along the same path are asymptotically +equivalent when evaluated against test vectors of bounded norms. These paths +can be interpreted as matching the effective degrees of freedom of ridge +estimators fitted with weighted features. For the special case of subsampling +without replacement, our results apply to independently sampled random features +and kernel features and confirm recent conjectures (Conjectures 7 and 8) of the +authors on the existence of such paths in Patil et al. We also present an +additive risk decomposition for ensembles of weighted estimators and show that +the risks are equivalent along the paths when the ensemble size goes to +infinity. As a practical consequence of the path equivalences, we develop an +efficient cross-validation method for tuning and apply it to subsampled +pretrained representations across several models (e.g., ResNet-50) and datasets +(e.g., CIFAR-100). + +
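+ As background for the degrees-of-freedom matching mentioned above, the following is a standard ridge-regression identity (stated here as known context, not as a restatement of the paper's equivalence result): for a design matrix $X$ whose Gram matrix $X^\top X$ has eigenvalues $d_1, \dots, d_p$, the effective degrees of freedom of the ridge estimator at level $\lambda$ are
+ \[
+ \mathrm{df}(\lambda) \;=\; \operatorname{tr}\!\left[ X \left( X^\top X + \lambda I \right)^{-1} X^\top \right] \;=\; \sum_{i=1}^{p} \frac{d_i}{d_i + \lambda},
+ \]
+ and, informally, two weighted or regularized estimators are placed on the same path when such effective degrees of freedom agree.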
+
+ comment: 19 pages for main and 19 pages for appendix +
+
+
+
+
+ + ☆ wav2pos: Sound Source Localization using Masked Autoencoders + + +
+ We present a novel approach to the 3D sound source localization task for +distributed ad-hoc microphone arrays by formulating it as a set-to-set +regression problem. By training a multi-modal masked autoencoder model that +operates on audio recordings and microphone coordinates, we show that such a +formulation allows for accurate localization of the sound source, by +reconstructing coordinates masked in the input. Our approach is flexible in the +sense that a single model can be used with an arbitrary number of microphones, +even when a subset of audio recordings and microphone coordinates are missing. +We test our method on simulated and real-world recordings of music and speech +in indoor environments, and demonstrate competitive performance compared to +both classical and other learning based localization methods. + +
+
+ comment: IPIN 2024 +
+
+
+
+
+ + ☆ Harmonized Speculative Sampling + + +
+ Speculative sampling has proven to be an effective solution to accelerate +decoding from large language models, where the acceptance rate significantly +determines the performance. Most previous works on improving the acceptance +rate focus on aligned training and efficient decoding, implicitly paying less +attention to the linkage of training and decoding. In this work, we first +investigate the linkage of training and decoding for speculative sampling and +then propose a solution named HArmonized Speculative Sampling (HASS). HASS +improves the acceptance rate without extra inference overhead by harmonizing +training and decoding on their objectives and contexts. Experiments on three +LLaMA models demonstrate that HASS achieves 2.81x-3.65x wall-clock time speedup +ratio averaging across three datasets, which is 8%-15% faster than EAGLE-2. + +
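+ For readers unfamiliar with the baseline that HASS accelerates, here is the standard lossless speculative-sampling acceptance rule in a minimal form (generic background with array inputs assumed; none of the HASS-specific harmonization of training and decoding appears here):
+ import numpy as np
+ 
+ def accept_or_resample(p_target, p_draft, draft_token, rng=np.random.default_rng()):
+     # Accept the draft token with probability min(1, p_target / p_draft).
+     if rng.random() < min(1.0, p_target[draft_token] / p_draft[draft_token]):
+         return draft_token
+     # Otherwise resample from the residual max(p_target - p_draft, 0), which keeps
+     # the overall output distribution identical to sampling from the target model.
+     residual = np.maximum(p_target - p_draft, 0.0)
+     residual /= residual.sum()
+     return int(rng.choice(len(p_target), p=residual))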
+
+
+
+
+ + ☆ A Neural Material Point Method for Particle-based Simulations + + +
+ Mesh-free Lagrangian methods are widely used for simulating fluids, solids, +and their complex interactions due to their ability to handle large +deformations and topological changes. These physics simulators, however, +require substantial computational resources for accurate simulations. To +address these issues, deep learning emulators promise faster and scalable +simulations, yet they often remain expensive and difficult to train, limiting +their practical use. Inspired by the Material Point Method (MPM), we present +NeuralMPM, a neural emulation framework for particle-based simulations. +NeuralMPM interpolates Lagrangian particles onto a fixed-size grid, computes +updates on grid nodes using image-to-image neural networks, and interpolates +back to the particles. Similarly to MPM, NeuralMPM benefits from the regular +voxelized representation to simplify the computation of the state dynamics, +while avoiding the drawbacks of mesh-based Eulerian methods. We demonstrate the +advantages of NeuralMPM on several datasets, including fluid dynamics and +fluid-solid interactions. Compared to existing methods, NeuralMPM reduces +training times from days to hours, while achieving comparable or superior +long-term accuracy, making it a promising approach for practical forward and +inverse problems. A project page is available at https://neuralmpm.isach.be + +
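+ The particle-grid-particle round trip at the core of this design can be sketched in a few lines (a toy nearest-cell version with assumed shapes; NeuralMPM's actual interpolation kernels and the image-to-image network are not reproduced here):
+ import numpy as np
+ 
+ def particles_to_grid(pos, values, grid_size):
+     # pos: (n, 2) particle positions in [0, 1); nearest-cell scatter with averaging.
+     idx = np.clip((pos * grid_size).astype(int), 0, grid_size - 1)
+     grid = np.zeros((grid_size, grid_size))
+     counts = np.zeros_like(grid)
+     np.add.at(grid, (idx[:, 0], idx[:, 1]), values)
+     np.add.at(counts, (idx[:, 0], idx[:, 1]), 1.0)
+     return grid / np.maximum(counts, 1.0)
+ 
+ def grid_to_particles(grid, pos):
+     # Gather the (network-updated) grid values back onto the particles.
+     grid_size = grid.shape[0]
+     idx = np.clip((pos * grid_size).astype(int), 0, grid_size - 1)
+     return grid[idx[:, 0], idx[:, 1]]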
+
+
+
+
+ + ☆ Advanced POD-Based Performance Evaluation of Classifiers Applied to + Human Driver Lane Changing Prediction + + +
+ Machine learning (ML) classifiers serve as essential tools facilitating +classification and prediction across various domains. The performance of these +algorithms should be known to ensure their reliable application. In certain +fields, receiver operating characteristic and precision-recall curves are +frequently employed to assess machine learning algorithms without accounting +for the impact of process parameters. However, it may be essential to evaluate +the performance of these algorithms in relation to such parameters. As a +performance evaluation metric capable of considering the effects of process +parameters, this paper uses a modified probability of detection (POD) approach +to assess the reliability of ML-based algorithms. As an example, the POD-based +approach is employed to assess ML models used for predicting the lane changing +behavior of a vehicle driver. The time remaining to the predicted (and +therefore unknown) lane changing event is considered as the process parameter. The +hit/miss approach to POD is taken here and modified by considering the +probability of lane changing derived from ML algorithms at each time step, and +obtaining the final result of the analysis accordingly. This improves the +reliability of results compared to the standard hit/miss approach, which +considers the outcome of the classifiers as either 0 or 1, while also +simplifying evaluation compared to the â versus a approach. Performance +evaluation results of the proposed approach are compared with those obtained +with the standard hit/miss approach and a pre-developed â versus a approach +to validate the effectiveness of the proposed method. The comparison shows that +this method provides, on average, conservative behavior with the advantage of +enhancing the reliability of the hit/miss approach to POD while retaining its +simplicity. + 
+
+ comment: Manuscript: 8 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Autoregressive model path dependence near Ising criticality + + +
+ Autoregressive models are a class of generative model that probabilistically +predict the next output of a sequence based on previous inputs. The +autoregressive sequence is by definition one-dimensional (1D), which is natural +for language tasks and hence an important component of modern architectures +like recurrent neural networks (RNNs) and transformers. However, when language +models are used to predict outputs on physical systems that are not +intrinsically 1D, the question arises of which choice of autoregressive +sequence -- if any -- is optimal. In this paper, we study the reconstruction of +critical correlations in the two-dimensional (2D) Ising model, using RNNs and +transformers trained on binary spin data obtained near the thermal phase +transition. We compare the training performance for a number of different 1D +autoregressive sequences imposed on finite-size 2D lattices. We find that paths +with long 1D segments are more efficient at training the autoregressive models +compared to space-filling curves that better preserve the 2D locality. Our +results illustrate the potential importance in choosing the optimal +autoregressive sequence ordering when training modern language models for tasks +in physics. + +
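+ Two of the simplest 1D orderings one can impose on an L x L lattice look like this (an illustrative sketch, not the paper's code; space-filling curves such as Hilbert orderings are omitted):
+ def raster_path(L):
+     # Row-by-row ordering: long 1D segments, weak 2D locality.
+     return [(i, j) for i in range(L) for j in range(L)]
+ 
+ def snake_path(L):
+     # Zig-zag variant that keeps consecutive sites adjacent at row boundaries.
+     order = []
+     for i in range(L):
+         cols = range(L) if i % 2 == 0 else range(L - 1, -1, -1)
+         order.extend((i, j) for j in cols)
+     return order
+ 
+ def flatten(spins, path):
+     # spins indexed as spins[i][j]; returns the 1D token sequence fed to the model.
+     return [spins[i][j] for (i, j) in path]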
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Pixels to Prose: Understanding the art of Image Captioning + + +
+ In the era of evolving artificial intelligence, machines are increasingly +emulating human-like capabilities, including visual perception and linguistic +expression. Image captioning stands at the intersection of these domains, +enabling machines to interpret visual content and generate descriptive text. +This paper provides a thorough review of image captioning techniques, catering +to individuals entering the field of machine learning who seek a comprehensive +understanding of available options, from foundational methods to +state-of-the-art approaches. Beginning with an exploration of primitive +architectures, the review traces the evolution of image captioning models to +the latest cutting-edge solutions. By dissecting the components of these +architectures, readers gain insights into the underlying mechanisms and can +select suitable approaches tailored to specific problem requirements without +duplicating efforts. The paper also delves into the application of image +captioning in the medical domain, illuminating its significance in various +real-world scenarios. + Furthermore, the review offers guidance on evaluating the performance of +image captioning systems, highlighting key metrics for assessment. By +synthesizing theoretical concepts with practical application, this paper equips +readers with the knowledge needed to navigate the complex landscape of image +captioning and harness its potential for diverse applications in machine +learning and beyond. + +
+
+
+
+
+ + ☆ Evaluating Model Robustness Using Adaptive Sparse L0 Regularization + + +
+ Deep Neural Networks have demonstrated remarkable success in various domains +but remain susceptible to adversarial examples, which are slightly altered +inputs designed to induce misclassification. While adversarial attacks +typically optimize under Lp norm constraints, attacks based on the L0 norm, +prioritising input sparsity, are less studied due to their complex and +non-convex nature. These sparse adversarial examples challenge existing defenses by +altering a minimal subset of features, potentially uncovering more subtle DNN +weaknesses. However, current L0 norm attack methodologies face a trade-off +between accuracy and efficiency: either precise but computationally intensive, or +expedient but imprecise. This paper proposes a novel, scalable, and effective +approach to generate adversarial examples based on the L0 norm, aimed at +refining the robustness evaluation of DNNs against such perturbations. + 
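+ To make the L0 setting concrete, a toy perturbation that touches at most k input features can be written as follows (a deliberately naive gradient-based sketch with assumed tensor shapes, not the attack proposed in the paper):
+ import torch
+ import torch.nn.functional as F
+ 
+ def sparse_perturbation(model, x, y, k, eps=1.0):
+     # x: a single input tensor, y: its integer label; change at most k coordinates.
+     x = x.clone().requires_grad_(True)
+     loss = F.cross_entropy(model(x.unsqueeze(0)), y.view(1))
+     loss.backward()
+     grad = x.grad.flatten()
+     idx = torch.topk(grad.abs(), k).indices          # the k coordinates allowed to change
+     x_adv = x.detach().flatten().clone()
+     x_adv[idx] += eps * grad[idx].sign()
+     return x_adv.view_as(x).clamp(0.0, 1.0)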
+
+ comment: Accepted by the 20th International Conference on Advanced Data Mining + and Applications (ADMA 2024) +
+
+
+
+
+ + ☆ Towards reliable respiratory disease diagnosis based on cough sounds and + vision transformers + + +
+ Recent advancements in deep learning techniques have sparked performance +boosts in various real-world applications including disease diagnosis based on +multi-modal medical data. Cough sound data-based respiratory disease (e.g., +COVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also +attracted much attention. However, existing works usually utilise traditional +machine learning or deep models of moderate scales. On the other hand, the +developed approaches are trained and evaluated on small-scale data due to the +difficulty of curating and annotating clinical data on scale. To address these +issues in prior works, we create a unified framework to evaluate various deep +models from lightweight Convolutional Neural Networks (e.g., ResNet18) to +modern vision transformers and compare their performance in respiratory disease +classification. Based on the observations from such an extensive empirical +study, we propose a novel approach to cough-based disease classification based +on both self-supervised and supervised learning on a large-scale cough data +set. Experimental results demonstrate our proposed approach outperforms prior +arts consistently on two benchmark datasets for COVID-19 diagnosis and a +proprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%. + +
+
+
+
+
+ + ☆ Auxiliary-Loss-Free Load Balancing Strategy for Mixture-of-Experts + + +
+ For Mixture-of-Experts (MoE) models, an unbalanced expert load will lead to +routing collapse or increased computational overhead. Existing methods commonly +employ an auxiliary loss to encourage load balance, but a large auxiliary loss +will introduce non-negligible interference gradients into training and thus +impair the model performance. In order to control load balance while not +producing undesired gradients during training, we propose Loss-Free Balancing, +featured by an auxiliary-loss-free load balancing strategy. To be specific, +before the top-K routing decision, Loss-Free Balancing will first apply an +expert-wise bias to the routing scores of each expert. By dynamically updating +the bias of each expert according to its recent load, Loss-Free Balancing can +consistently maintain a balanced distribution of expert load. In addition, +since Loss-Free Balancing does not produce any interference gradients, it also +elevates the upper bound of model performance gained from MoE training. We +validate the performance of Loss-Free Balancing on MoE models with up to 3B +parameters trained on up to 200B tokens. Experimental results show that +Loss-Free Balancing achieves both better performance and better load balance +compared with traditional auxiliary-loss-controlled load balancing strategies. + +
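+ A minimal sketch of the bias-adjusted top-K routing idea reads as follows; the update rule, learning rate, and NumPy formulation are illustrative assumptions rather than the paper's exact procedure:
+ import numpy as np
+ 
+ class BiasedTopKRouter:
+     def __init__(self, n_experts, k, lr=1e-3):
+         self.bias = np.zeros(n_experts)
+         self.k, self.lr = k, lr
+ 
+     def route(self, scores):               # scores: (n_tokens, n_experts) gating logits
+         biased = scores + self.bias        # the bias affects expert selection only, not the gate weights
+         topk = np.argsort(-biased, axis=1)[:, :self.k]
+         # Nudge the bias down for overloaded experts and up for underloaded ones,
+         # so future routing stays balanced without an auxiliary loss term.
+         load = np.bincount(topk.ravel(), minlength=len(self.bias)).astype(float)
+         self.bias -= self.lr * (load - load.mean())
+         return topk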
+
+
+
+
+ + ☆ GANs Conditioning Methods: A Survey + + +
+ In recent years, Generative Adversarial Networks (GANs) have seen significant +advancements, leading to their widespread adoption across various fields. The +original GAN architecture enables the generation of images without any specific +control over the content, making it an unconditional generation process. +However, many practical applications require precise control over the generated +output, which has led to the development of conditional GANs (cGANs) that +incorporate explicit conditioning to guide the generation process. cGANs extend +the original framework by incorporating additional information (conditions), +enabling the generation of samples that adhere to that specific criteria. +Various conditioning methods have been proposed, each differing in how they +integrate the conditioning information into both the generator and the +discriminator networks. In this work, we review the conditioning methods +proposed for GANs, exploring the characteristics of each method and +highlighting their unique mechanisms and theoretical foundations. Furthermore, +we conduct a comparative analysis of these methods, evaluating their +performance on various image datasets. Through these analyses, we aim to +provide insights into the strengths and limitations of various conditioning +techniques, guiding future research and application in generative modeling. + +
+
+
+
+
+ + ☆ Comparison of Model Predictive Control and Proximal Policy Optimization + for a 1-DOF Helicopter System + + +
+ This study conducts a comparative analysis of Model Predictive Control (MPC) +and Proximal Policy Optimization (PPO), a Deep Reinforcement Learning (DRL) +algorithm, applied to a 1-Degree of Freedom (DOF) Quanser Aero 2 system. +Classical control techniques such as MPC and Linear Quadratic Regulator (LQR) +are widely used due to their theoretical foundation and practical +effectiveness. However, with advancements in computational techniques and +machine learning, DRL approaches like PPO have gained traction in solving +optimal control problems through environment interaction. This paper +systematically evaluates the dynamic response characteristics of PPO and MPC, +comparing their performance, computational resource consumption, and +implementation complexity. Experimental results show that while LQR achieves +the best steady-state accuracy, PPO excels in rise-time and adaptability, +making it a promising approach for applications requiring rapid response and +adaptability. Additionally, we have established a baseline for future +RL-related research on this specific testbed. We also discuss the strengths and +limitations of each control strategy, providing recommendations for selecting +appropriate controllers for real-world scenarios. + +
+
+ comment: Accepted at INDIN2024 +
+
+
+
+
+ + ☆ Convergent Differential Privacy Analysis for General Federated Learning: + the f-DP Perspective + + +
+ Federated learning (FL) is an efficient collaborative training paradigm +extensively developed with a focus on local privacy protection, and +differential privacy (DP) is a classical approach to capture and ensure the +reliability of local privacy. The powerful cooperation of FL and DP provides a +promising learning framework for large-scale private clients, juggling both +privacy securing and trustworthy learning. As the predominant algorithm of DP, +the noisy perturbation has been widely studied and incorporated into various +federated algorithms, theoretically proven to offer significant privacy +protections. However, existing analyses in noisy FL-DP mostly rely on the +composition theorem and cannot tightly quantify the privacy leakage challenges, +which is nearly tight for small numbers of communication rounds but yields an +arbitrarily loose and divergent bound under the large communication rounds. +This implies a counterintuitive judgment, suggesting that FL may not provide +adequate privacy protection during long-term training. To further investigate +the convergent privacy and reliability of the FL-DP framework, in this paper, +we comprehensively evaluate the worst privacy of two classical methods under +the non-convex and smooth objectives based on the f-DP analysis, i.e. +Noisy-FedAvg and Noisy-FedProx methods. With the aid of the +shifted-interpolation technique, we successfully prove that the worst privacy +of the Noisy-FedAvg method achieves a tight convergent lower bound. Moreover, +in the Noisy-FedProx method, with the regularization of the proxy term, the +worst privacy has a stable constant lower bound. Our analysis further provides +a solid theoretical foundation for the reliability of privacy protection in +FL-DP. Meanwhile, our conclusions can also be losslessly converted to other +classical DP analytical frameworks, e.g. $(\epsilon,\delta)$-DP and +R$\acute{\text{e}}$nyi-DP (RDP). + +
+
+
+
+
+ + ☆ CAPER: Enhancing Career Trajectory Prediction using Temporal Knowledge + Graph and Ternary Relationship + + +
+ The problem of career trajectory prediction (CTP) aims to predict one's +future employer or job position. While several CTP methods have been developed +for this problem, we posit that none of these methods (1) jointly considers the +mutual ternary dependency between three key units (i.e., user, position, and +company) of a career and (2) captures the characteristic shifts of key units in +career over time, leading to an inaccurate understanding of the job movement +patterns in the labor market. To address the above challenges, we propose a +novel solution, named as CAPER, that solves the challenges via sophisticated +temporal knowledge graph (TKG) modeling. It enables the utilization of a +graph-structured knowledge base with rich expressiveness, effectively +preserving the changes in job movement patterns. Furthermore, we devise an +extrapolated career reasoning task on TKG for a realistic evaluation. The +experiments on a real-world career trajectory dataset demonstrate that CAPER +consistently and significantly outperforms four baselines, two recent TKG +reasoning methods, and five state-of-the-art CTP methods in predicting one's +future companies and positions-i.e., on average, yielding 6.80% and 34.58% more +accurate predictions, respectively. + +
+
+
+
+
+ + ☆ Large-Scale Demand Prediction in Urban Rail using Multi-Graph Inductive + Representation Learning + + +
+ With the expansion of cities over time, URT (Urban Rail Transit) networks +have also grown significantly. Demand prediction plays an important role in +supporting planning, scheduling, fleet management, and other operational +decisions. In this study, we propose an Origin-Destination (OD) demand +prediction model called Multi-Graph Inductive Representation Learning +(mGraphSAGE) for large-scale URT networks under operational uncertainties. Our +main contributions are twofold: we enhance prediction results while ensuring +scalability for large networks by relying simultaneously on multiple graphs, +where each OD pair is a node on a graph and distinct OD relationships, such as +temporal and spatial correlations; we show the importance of including +operational uncertainties such as train delays and cancellations as inputs in +demand prediction for daily operations. The model is validated on three +different scales of the URT network in Copenhagen, Denmark. Experimental +results show that by leveraging information from neighboring ODs and learning +node representations via sampling and aggregation, mGraphSAGE is particularly +suitable for OD demand prediction in large-scale URT networks, outperforming +reference machine learning methods. Furthermore, during periods with train +cancellations and delays, the performance gap between mGraphSAGE and other +methods improves compared to normal operating conditions, demonstrating its +ability to leverage system reliability information for predicting OD demand +under uncertainty. + +
+
+ comment: 18 pages, 3 figures +
+
+
+
+
+ + ☆ Statistical QoS Provision in Business-Centric Networks + + +
+ More refined resource management and Quality of Service (QoS) provisioning is +a critical goal of wireless communication technologies. In this paper, we +propose a novel Business-Centric Network (BCN) aimed at enabling scalable QoS +provisioning, based on a cross-layer framework that captures the relationship +between application, transport parameters, and channels. We investigate both +continuous flow and event-driven flow models, presenting key QoS metrics such +as throughput, delay, and reliability. By jointly considering power and +bandwidth allocation, transmission parameters, and AP network topology across +layers, we optimize weighted resource efficiency with statistical QoS +provisioning. To address the coupling among parameters, we propose a novel deep +reinforcement learning (DRL) framework, which is Collaborative Optimization +among Heterogeneous Actors with Experience Sharing (COHA-ES). Power and +sub-channel (SC) Actors representing multiple APs are jointly optimized under +the unified guidance of a common critic. Additionally, we introduce a novel +multithreaded experience-sharing mechanism to accelerate training and enhance +rewards. Extensive comparative experiments validate the effectiveness of our +DRL framework in terms of convergence and efficiency. Moreover, comparative +analyses demonstrate the comprehensive advantages of the BCN structure in +enhancing both spectral and energy efficiency. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Grand canonical generative diffusion model for crystalline phases and + grain boundaries + + +
+ The diffusion model has emerged as a powerful tool for generating atomic +structures for materials science. This work calls attention to the deficiency +of current particle-based diffusion models, which represent atoms as a point +cloud, in generating even the simplest ordered crystalline structures. The +problem is attributed to particles being trapped in local minima during the +score-driven simulated annealing of the diffusion process, similar to the +physical process of force-driven simulated annealing. We develop a solution, +the grand canonical diffusion model, which adopts an alternative voxel-based +representation with continuous rather than fixed number of particles. The +method is applied towards generation of several common crystalline phases as +well as the technologically important and challenging problem of grain boundary +structures. + +
+
+
+
+
+ + ☆ Exploring Selective Layer Fine-Tuning in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for fine-tuning +foundation models using distributed data in a privacy-preserving manner. Under +limited computational resources, clients often find it more practical to +fine-tune a selected subset of layers, rather than the entire model, based on +their task-specific data. In this study, we provide a thorough theoretical +exploration of selective layer fine-tuning in FL, emphasizing a flexible +approach that allows the clients to adjust their selected layers according to +their local data and resources. We theoretically demonstrate that the layer +selection strategy has a significant impact on model convergence in two +critical aspects: the importance of selected layers and the heterogeneous +choices across clients. Drawing from these insights, we further propose a +strategic layer selection method that utilizes local gradients and regulates +layer selections across clients. The extensive experiments on both image and +text datasets demonstrate the effectiveness of the proposed strategy compared +with several baselines, highlighting its advances in identifying critical +layers that adapt to the client heterogeneity and training dynamics in FL. + +
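+ One plausible instantiation of gradient-guided layer selection on a single client might look like the sketch below; the loss interface is assumed, parameters stand in for layers, and the cross-client regulation described in the paper is omitted:
+ import torch
+ 
+ def select_layers_by_gradient(model, loss_fn, batch, m):
+     # Rank parameter groups (a proxy for layers) by local gradient norm and keep the top m trainable.
+     model.zero_grad()
+     loss_fn(model, batch).backward()
+     norms = {name: p.grad.norm().item()
+              for name, p in model.named_parameters() if p.grad is not None}
+     selected = sorted(norms, key=norms.get, reverse=True)[:m]
+     for name, p in model.named_parameters():
+         p.requires_grad_(name in selected)   # freeze everything except the chosen groups
+     return selected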
+
+
+
+
+ + ☆ Skills Regularized Task Decomposition for Multi-task Offline + Reinforcement Learning NeurIPS 2022 + + +
+ Reinforcement learning (RL) with diverse offline datasets can have the +advantage of leveraging the relation of multiple tasks and the common skills +learned across those tasks, hence allowing us to deal with real-world complex +problems efficiently in a data-driven way. In offline RL where only offline +data is used and online interaction with the environment is restricted, it is +yet difficult to achieve the optimal policy for multiple tasks, especially when +the data quality varies for the tasks. In this paper, we present a skill-based +multi-task RL technique on heterogeneous datasets that are generated by +behavior policies of different quality. To learn the shareable knowledge across +those datasets effectively, we employ a task decomposition method for which +common skills are jointly learned and used as guidance to reformulate a task in +shared and achievable subtasks. In this joint learning, we use Wasserstein +auto-encoder (WAE) to represent both skills and tasks on the same latent space +and use the quality-weighted loss as a regularization term to induce tasks to +be decomposed into subtasks that are more consistent with high-quality skills +than others. To improve the performance of offline RL agents learned on the +latent space, we also augment datasets with imaginary trajectories relevant to +high-quality skills for each task. Through experiments, we show that our +multi-task offline RL approach is robust to the mixed configurations of +different-quality datasets and it outperforms other state-of-the-art algorithms +for several robotic manipulation tasks and drone navigation tasks. + +
+
+ comment: 12 pages, 5 figures, accepted in NeurIPS 2022 +
+
+
+
+
+ + ☆ VFLIP: A Backdoor Defense for Vertical Federated Learning via + Identification and Purification ESORICS 2024 + + +
+ Vertical Federated Learning (VFL) focuses on handling vertically partitioned +data over FL participants. Recent studies have discovered a significant +vulnerability in VFL to backdoor attacks which specifically target the distinct +characteristics of VFL. Therefore, these attacks may neutralize existing +defense mechanisms designed primarily for Horizontal Federated Learning (HFL) +and deep neural networks. In this paper, we present the first backdoor defense, +called VFLIP, specialized for VFL. VFLIP employs the identification and +purification techniques that operate at the inference stage, consequently +improving the robustness against backdoor attacks to a great extent. VFLIP +first identifies backdoor-triggered embeddings by adopting a participant-wise +anomaly detection approach. Subsequently, VFLIP conducts purification which +removes the embeddings identified as malicious and reconstructs all the +embeddings based on the remaining embeddings. We conduct extensive experiments +on CIFAR10, CINIC10, Imagenette, NUS-WIDE, and BankMarketing to demonstrate +that VFLIP can effectively mitigate backdoor attacks in VFL. +https://github.com/blingcho/VFLIP-esorics24 + +
+
+ comment: Accepted by 29th European Symposium on Research in Computer Security + (ESORICS 2024) +
+
+
+
+
+ + ☆ Bayesian optimization of atomic structures with prior probabilities from + universal interatomic potentials + + +
+ The optimization of atomic structures plays a pivotal role in understanding +and designing materials with desired properties. However, conventional methods +often struggle with the formidable task of navigating the vast potential energy +surface, especially in high-dimensional spaces with numerous local minima. +Recent advancements in machine learning-driven surrogate models offer a +promising avenue for alleviating this computational burden. In this study, we +propose a novel approach that combines the strengths of universal machine +learning potentials with a Bayesian approach of the GOFEE/BEACON framework. By +leveraging the comprehensive chemical knowledge encoded in pretrained universal +machine learning potentials as a prior estimate of energy and forces, we enable +the Gaussian process to focus solely on capturing the intricate nuances of the +potential energy surface. We demonstrate the efficacy of our approach through +comparative analyses across diverse systems, including periodic bulk materials, +surface structures, and a cluster. + +
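+ The general pattern of using a pretrained potential as a prior mean so that the Gaussian process only models the residual can be sketched as follows (an assumed scikit-learn formulation, not the GOFEE/BEACON implementation):
+ import numpy as np
+ from sklearn.gaussian_process import GaussianProcessRegressor
+ from sklearn.gaussian_process.kernels import RBF
+ 
+ class ResidualGP:
+     def __init__(self, prior_energy):
+         self.prior = prior_energy           # callable(features) -> energy from a universal potential
+         self.gp = GaussianProcessRegressor(kernel=RBF(length_scale=1.0), normalize_y=True)
+ 
+     def fit(self, X, energies):
+         # The GP is trained only on the residual between reference energies and the prior.
+         self.gp.fit(X, energies - np.array([self.prior(x) for x in X]))
+ 
+     def predict(self, X):
+         resid, std = self.gp.predict(X, return_std=True)
+         prior = np.array([self.prior(x) for x in X])
+         return prior + resid, std           # the uncertainty guides acquisition in the Bayesian search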
+
+
+
+
+ + ☆ Boosting Lossless Speculative Decoding via Feature Sampling and Partial + Alignment Distillation AAAI 2025 + + +
+ Lossless speculative decoding accelerates target large language model (LLM) +inference by employing a lightweight draft model for generating tree-structured +candidates, which are subsequently verified in parallel by the target LLM. +Currently, effective approaches leverage feature-level rather than token-level +autoregression within the draft model to facilitate more straightforward +predictions and enhanced knowledge distillation. In this paper, we reassess +these approaches and propose FSPAD (Feature Sampling and Partial Alignment +Distillation for Lossless Speculative Decoding), which introduces two +straightforward and effective components within the existing framework to boost +lossless speculative decoding. Firstly, FSPAD utilizes token embeddings to +sample features of the target LLM in high-dimensional space before feeding them +into the draft model, due to the inherent uncertainty of the features +preventing the draft model from obtaining the specific token output by the +target LLM. Secondly, FSPAD introduces partial alignment distillation to weaken +the draft model's connection between features and logits, aiming to reduce the +conflict between feature alignment and logit confidence during training. Our +experiments include both greedy and non-greedy decoding on the largest and +smallest models from the Vicuna and LLaMA3-Instruct series, as well as tasks in +multi-turn conversation, translation, summarization, question answering, +mathematical reasoning, and retrieval-augmented generation. The results show +that FSPAD outperforms the state-of-the-art method across all the +aforementioned tasks and target LLMs. + +
+
+ comment: The work was not submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Latent Relationship Mining of Glaucoma Biomarkers: a TRI-LSTM based Deep + Learning + + +
+ In recent years, a significant amount of research has been conducted on +applying deep learning methods for glaucoma classification and detection. +However, the explainability of those established machine learning models +remains a big concern. In this research, in contrast, we draw on concepts from cognitive +science and study how ophthalmologists judge glaucoma detection. +Simulating experts' efforts, we propose a hierarchical decision-making system, +centered around a holistic set of carefully designed biomarker-oriented machine +learning models. While biomarkers represent the key indicators of how +ophthalmologists identify glaucoma, they usually exhibit latent +inter-relations. We thus construct a time series model, named TRI-LSTM, capable +of calculating and uncovering potential and latent relationships among various +biomarkers of glaucoma. Our model is among the first efforts to explore the +intrinsic connections among glaucoma biomarkers. We monitor temporal +relationships in patients' disease states over time and capture and retain +the progression of disease-relevant clinical information from prior visits, +thereby enriching the biomarkers' potential relationships. Extensive experiments +over a real-world dataset have demonstrated the effectiveness of the proposed +model. + 
+
+ comment: 9 pages, 4 images +
+
+
+
+
+ + ☆ A Novel Denoising Technique and Deep Learning Based Hybrid Wind Speed + Forecasting Model for Variable Terrain Conditions + + +
+ Wind flow can be highly unpredictable and can suffer substantial fluctuations +in speed and direction due to the shape and height of hills, mountains, and +valleys, making accurate wind speed (WS) forecasting essential in complex +terrain. This paper presents a novel and adaptive model for short-term +forecasting of WS. The paper's key contributions are as follows: (a) The +Partial Auto Correlation Function (PACF) is utilised to minimise the dimension +of the set of Intrinsic Mode Functions (IMF), hence reducing training time; (b) +The sample entropy (SampEn) was used to calculate the complexity of the reduced +set of IMFs. The proposed technique is adaptive since a specific Deep Learning +(DL) model-feature combination was chosen based on complexity; (c) A novel +bidirectional feature-LSTM framework for complicated IMFs has been suggested, +resulting in improved forecasting accuracy; (d) The proposed model shows +superior forecasting performance compared to the persistence, hybrid, Ensemble +empirical mode decomposition (EEMD), and Variational Mode Decomposition +(VMD)-based deep learning models. It has achieved the lowest variance in terms +of forecasting accuracy between simple and complex terrain conditions 0.70%. +Dimension reduction of IMF's and complexity-based model-feature selection helps +reduce the training time by 68.77% and improve forecasting quality by 58.58% on +average. + +
+
+
+
+
+ + ☆ SciLitLLM: How to Adapt LLMs for Scientific Literature Understanding + + +
+ Scientific literature understanding is crucial for extracting targeted +information and garnering insights, thereby significantly advancing scientific +discovery. Despite the remarkable success of Large Language Models (LLMs), they +face challenges in scientific literature understanding, primarily due to (1) a +lack of scientific knowledge and (2) unfamiliarity with specialized scientific +tasks. + To develop an LLM specialized in scientific literature understanding, we +propose a hybrid strategy that integrates continual pre-training (CPT) and +supervised fine-tuning (SFT), to simultaneously infuse scientific domain +knowledge and enhance instruction-following capabilities for domain-specific +tasks. In this process, we identify two key challenges: (1) constructing +high-quality CPT corpora, and (2) generating diverse SFT instructions. We +address these challenges through a meticulous pipeline, including PDF text +extraction, parsing content error correction, quality filtering, and synthetic +instruction creation. Applying this strategy, we present a suite of LLMs: +SciLitLLM, specialized in scientific literature understanding. These models +demonstrate promising performance on scientific literature understanding +benchmarks. + Our contributions are threefold: (1) We present an effective framework that +integrates CPT and SFT to adapt LLMs to scientific literature understanding, +which can also be easily adapted to other domains. (2) We propose an LLM-based +synthesis method to generate diverse and high-quality scientific instructions, +resulting in a new instruction set -- SciLitIns -- for supervised fine-tuning +in less-represented scientific domains. (3) SciLitLLM achieves promising +performance improvements on scientific literature understanding benchmarks. + 
+
+
+
+
+ + ☆ Improving Thompson Sampling via Information Relaxation for Budgeted + Multi-armed Bandits + + +
+ We consider a Bayesian budgeted multi-armed bandit problem, in which each arm +consumes a different amount of resources when selected and there is a budget +constraint on the total amount of resources that can be used. Budgeted Thompson +Sampling (BTS) offers a very effective heuristic to this problem, but its +arm-selection rule does not take into account the remaining budget information. +We adopt \textit{Information Relaxation Sampling} framework that generalizes +Thompson Sampling for classical $K$-armed bandit problems, and propose a series +of algorithms that are randomized like BTS but more carefully optimize their +decisions with respect to the budget constraint. In a one-to-one correspondence +with these algorithms, a series of performance benchmarks that improve the +conventional benchmark are also suggested. Our theoretical analysis and +simulation results show that our algorithms (and our benchmarks) make +incremental improvements over BTS (respectively, the conventional benchmark) +across various settings including a real-world example. + +
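+ For context, the BTS baseline that the proposed algorithms refine can be sketched as below, assuming Bernoulli rewards and costs for brevity; the information-relaxation variants additionally condition decisions on the remaining budget, which this sketch does not:
+ import numpy as np
+ 
+ def budgeted_thompson_sampling(pull, n_arms, budget, max_pulls=100000, rng=np.random.default_rng()):
+     r_a, r_b = np.ones(n_arms), np.ones(n_arms)   # Beta posteriors over arm rewards
+     c_a, c_b = np.ones(n_arms), np.ones(n_arms)   # Beta posteriors over arm costs
+     total_reward = 0.0
+     for _ in range(max_pulls):
+         if budget <= 0:
+             break
+         # Sample one plausible reward and cost per arm, then pull the best sampled ratio.
+         ratio = rng.beta(r_a, r_b) / np.maximum(rng.beta(c_a, c_b), 1e-9)
+         arm = int(np.argmax(ratio))
+         reward, cost = pull(arm)                   # environment callback, returns (reward, cost) in [0, 1]
+         r_a[arm] += reward; r_b[arm] += 1 - reward
+         c_a[arm] += cost;   c_b[arm] += 1 - cost
+         total_reward += reward
+         budget -= cost
+     return total_reward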
+
+ comment: accepted +
+
+
+
+
+ + ☆ Measuring the Reliability of Causal Probing Methods: Tradeoffs, + Limitations, and the Plight of Nullifying Interventions + + +
+ Causal probing is an approach to interpreting foundation models, such as +large language models, by training probes to recognize latent properties of +interest from embeddings, intervening on probes to modify this representation, +and analyzing the resulting changes in the model's behavior. While some recent +works have cast doubt on the theoretical basis of several leading causal +probing intervention methods, it has been unclear how to systematically and +empirically evaluate their effectiveness in practice. To address this problem, +we propose a general empirical analysis framework to evaluate the reliability +of causal probing interventions, formally defining and quantifying two key +causal probing desiderata: completeness (fully transforming the representation +of the target property) and selectivity (minimally impacting other properties). +Our formalism allows us to make the first direct comparisons between different +families of causal probing methods (e.g., linear vs. nonlinear or +counterfactual vs. nullifying interventions). We conduct extensive experiments +across several leading methods, finding that (1) there is an inherent tradeoff +between these criteria, and no method is able to consistently satisfy both at +once; and (2) across the board, nullifying interventions are always far less +complete than counterfactual interventions, indicating that nullifying methods +may not be an effective approach to causal probing. + +
+
+
+
+
+ + ☆ MODULI: Unlocking Preference Generalization via Diffusion Models for + Offline Multi-Objective Reinforcement Learning + + +
+ Multi-objective Reinforcement Learning (MORL) seeks to develop policies that +simultaneously optimize multiple conflicting objectives, but it requires +extensive online interactions. Offline MORL provides a promising solution by +training on pre-collected datasets to generalize to any preference upon +deployment. However, real-world offline datasets are often conservatively and +narrowly distributed, failing to comprehensively cover preferences, leading to +the emergence of out-of-distribution (OOD) preference areas. Existing offline +MORL algorithms exhibit poor generalization to OOD preferences, resulting in +policies that do not align with preferences. Leveraging the excellent +expressive and generalization capabilities of diffusion models, we propose +MODULI (Multi-objective Diffusion Planner with Sliding Guidance), which employs +a preference-conditioned diffusion model as a planner to generate trajectories +that align with various preferences and derive action for decision-making. To +achieve accurate generation, MODULI introduces two return normalization methods +under diverse preferences for refining guidance. To further enhance +generalization to OOD preferences, MODULI proposes a novel sliding guidance +mechanism, which involves training an additional slider adapter to capture the +direction of preference changes. Incorporating the slider, it transitions from +in-distribution (ID) preferences to generating OOD preferences, patching, and +extending the incomplete Pareto front. Extensive experiments on the D4MORL +benchmark demonstrate that our algorithm outperforms state-of-the-art Offline +MORL baselines, exhibiting excellent generalization to OOD preferences. + +
+
+ comment: 23 pages, 7 figures +
+
+
+
+
+ + ☆ Deep Learning to Predict Late-Onset Breast Cancer Metastasis: the Single + Hyperparameter Grid Search (SHGS) Strategy for Meta Tuning Concerning Deep + Feed-forward Neural Network + + +
+ While machine learning has advanced in medicine, its widespread use in +clinical applications, especially in predicting breast cancer metastasis, is +still limited. We have been dedicated to constructing a DFNN model to predict +breast cancer metastasis n years in advance. However, the challenge lies in +efficiently identifying optimal hyperparameter values through grid search, +given the constraints of time and resources. Issues such as the infinite +possibilities for continuous hyperparameters like l1 and l2, as well as the +time-consuming and costly process, further complicate the task. To address +these challenges, we developed Single Hyperparameter Grid Search (SHGS) +strategy, serving as a preselection method before grid search. Our experiments +with SHGS applied to DFNN models for breast cancer metastasis prediction focus +on analyzing eight target hyperparameters: epochs, batch size, dropout, L1, L2, +learning rate, decay, and momentum. We created three figures, each depicting +the experiment results obtained from three LSM-I-10-Plus-year datasets. These +figures illustrate the relationship between model performance and the target +hyperparameter values. For each hyperparameter, we analyzed whether changes in +this hyperparameter would affect model performance, examined if there were +specific patterns, and explored how to choose values for the particular +hyperparameter. Our experimental findings reveal that the optimal value of a +hyperparameter is not only dependent on the dataset but is also significantly +influenced by the settings of other hyperparameters. Additionally, our +experiments suggested some reduced range of values for a target hyperparameter, +which may be helpful for low-budget grid search. This approach serves as a +prior experience and foundation for subsequent use of grid search to enhance +model performance. + +
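+ The core loop of a single-hyperparameter sweep can be sketched in a few lines (hypothetical function and argument names; the paper's specific hyperparameters, datasets, and tolerance are not encoded here):
+ def single_hyperparameter_sweep(train_and_score, reference, name, candidates, tol=0.01):
+     # Vary one hyperparameter over its candidate grid while all others stay at reference values.
+     results = {}
+     for value in candidates:
+         config = {**reference, name: value}
+         results[value] = train_and_score(config)
+     best = max(results.values())
+     narrowed = [v for v, s in results.items() if s >= best - tol]
+     return results, narrowed   # the narrowed range feeds a later, cheaper full grid search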
+
+
+
+
+ + ☆ Remove Symmetries to Control Model Expressivity + + +
+ When symmetry is present in the loss function, the model is likely to be +trapped in a low-capacity state that is sometimes known as a "collapse." Being +trapped in these low-capacity states can be a major obstacle to training across +many scenarios where deep learning technology is applied. We first prove two +concrete mechanisms through which symmetries lead to reduced capacities and +ignored features during training. We then propose a simple and theoretically +justified algorithm, syre, to remove almost all symmetry-induced low-capacity +states in neural networks. The proposed method is shown to improve the training +of neural networks in scenarios when this type of entrapment is especially a +concern. A remarkable merit of the proposed method is that it is model-agnostic +and does not require any knowledge of the symmetry. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ CTRQNets & LQNets: Continuous Time Recurrent and Liquid Quantum Neural + Networks + + +
+ Neural networks have continued to gain prevalence in the modern era for their +ability to model complex data through pattern recognition and behavior +remodeling. However, the static construction of traditional neural networks +inhibits dynamic intelligence. This makes them inflexible to temporal changes +in data and unfit to capture complex dependencies. With the advent of quantum +technology, there has been significant progress in creating quantum algorithms. +In recent years, researchers have developed quantum neural networks that +leverage the capabilities of qubits to outperform classical networks. However, +their current formulation exhibits a static construction limiting the system's +dynamic intelligence. To address these weaknesses, we develop a Liquid Quantum +Neural Network (LQNet) and a Continuous Time Recurrent Quantum Neural Network +(CTRQNet). Both models demonstrate a significant improvement in accuracy +compared to existing quantum neural networks (QNNs), achieving accuracy +increases as high as 40\% on CIFAR 10 through binary classification. We propose +LQNets and CTRQNets might shine a light on quantum machine learning's black +box. + +
+
+
+
+
+ + ☆ PersonalizedUS: Interpretable Breast Cancer Risk Assessment with Local + Coverage Uncertainty Quantification + + +
+ Correctly assessing the malignancy of breast lesions identified during +ultrasound examinations is crucial for effective clinical decision-making. +However, the current "golden standard" relies on manual BI-RADS scoring by +clinicians, often leading to unnecessary biopsies and a significant mental +health burden on patients and their families. In this paper, we introduce +PersonalizedUS, an interpretable machine learning system that leverages recent +advances in conformal prediction to provide precise and personalized risk +estimates with local coverage guarantees and sensitivity, specificity, and +predictive values above 0.9 across various threshold levels. In particular, we +identify meaningful lesion subgroups where distribution-free, model-agnostic +conditional coverage holds, with approximately 90% of our prediction sets +containing only the ground truth in most lesion subgroups, thus explicitly +characterizing for which patients the model is most suitably applied. Moreover, +we make available a curated tabular dataset of 1936 biopsied breast lesions +from a recent observational multicenter study and benchmark the performance of +several state-of-the-art learning algorithms. We also report a successful case +study of the deployed system in the same multicenter context. Concrete clinical +benefits include up to a 65% reduction in requested biopsies among BI-RADS 4a +and 4b lesions, with minimal to no missed cancer cases. + +
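+ The kind of subgroup-calibrated conformal machinery described above can be sketched as follows (a generic group-conditional split-conformal recipe with assumed variable names, not the deployed system's code):
+ import numpy as np
+ 
+ def group_thresholds(probs_cal, labels_cal, groups_cal, alpha=0.1):
+     # probs_cal: (n, n_classes) predicted probabilities on a held-out calibration set.
+     thresholds = {}
+     for g in np.unique(groups_cal):
+         idx = np.where(groups_cal == g)[0]
+         nonconf = 1.0 - probs_cal[idx, labels_cal[idx]]   # nonconformity of the true class
+         n = len(idx)
+         level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
+         thresholds[g] = np.quantile(nonconf, level)
+     return thresholds
+ 
+ def prediction_set(probs, group, thresholds):
+     # All classes whose nonconformity falls below the subgroup's calibrated threshold.
+     return [c for c, p in enumerate(probs) if 1.0 - p <= thresholds[group]]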
+
+ comment: 9 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Certified Causal Defense with Generalizable Robustness AAAI + + +
+ While machine learning models have proven effective across various scenarios, +it is widely acknowledged that many models are vulnerable to adversarial +attacks. Recently, there have emerged numerous efforts in adversarial defense. +Among them, certified defense is well known for its theoretical guarantees +against arbitrary adversarial perturbations on input within a certain range +(e.g., $l_2$ ball). However, most existing works in this line struggle to +generalize their certified robustness in other data domains with distribution +shifts. This issue is rooted in the difficulty of eliminating the negative +impact of spurious correlations on robustness in different domains. To address +this problem, in this work, we propose a novel certified defense framework +GLEAN, which incorporates a causal perspective into the generalization problem +in certified defense. More specifically, our framework integrates a certifiable +causal factor learning component to disentangle the causal relations and +spurious correlations between input and label, and thereby exclude the negative +effect of spurious correlations on defense. On top of that, we design a +causally certified defense strategy to handle adversarial attacks on latent +causal factors. In this way, our framework is not only robust against malicious +noises on data in the training distribution but also can generalize its +robustness across domains with distribution shifts. Extensive experiments on +benchmark datasets validate the superiority of our framework in certified +robustness generalization in different data domains. Code is available in the +supplementary materials. + +
+
+ comment: Submitted to AAAI +
+
+
+
+
+ + ☆ Avoiding Generative Model Writer's Block With Embedding Nudging + + +
+ Generative image models have, since their introduction, become a global phenomenon. +From new art forms becoming possible to new vectors of abuse, many new capabilities +have become available. One of the challenging issues with generative models is +controlling the generation process, especially to prevent specific generation +classes or instances. There are several reasons why one may want to control +the output of generative models, ranging from privacy and safety concerns to +application limitations or user preferences. + To address memorization and privacy challenges, there has been considerable +research dedicated to filtering prompts or filtering the outputs of these +models. What all these solutions have in common is that at the end of the day +they stop the model from producing anything, hence limiting the usability of +the model. In this paper, we propose a method for addressing this usability +issue by making it possible to steer away from unwanted concepts (when detected +in the model's output) while still generating outputs. In particular, we focus on +latent diffusion image generative models and how one can prevent them from +generating particular images while generating similar images with limited +overhead. + We focus on mitigating issues like image memorization, demonstrating our +technique's effectiveness through qualitative and quantitative evaluations. Our +method successfully prevents the generation of memorized training images while +maintaining comparable image quality and relevance to the unmodified model. + 
+
+
+
+
+ + ☆ CardBench: A Benchmark for Learned Cardinality Estimation in Relational + Databases + + +
+ Cardinality estimation is crucial for enabling high query performance in +relational databases. Recently, learned cardinality estimation models have been +proposed to improve accuracy, but there is no systematic benchmark or dataset +that allows researchers to evaluate the progress made by new learned +approaches and even systematically develop new learned approaches. In this +paper, we are releasing a benchmark containing thousands of queries over 20 +distinct real-world databases for learned cardinality estimation. In contrast +to other initial benchmarks, our benchmark is much more diverse and can be used +for training and testing learned models systematically. Using this benchmark, +we explored whether learned cardinality estimation can be transferred to an +unseen dataset in a zero-shot manner. We trained GNN-based and +transformer-based models to study the problem in three setups: (1) +instance-based, (2) zero-shot, and (3) fine-tuned. Our results show that while +we get promising results for zero-shot cardinality estimation on simple single-table +queries, the accuracy drops as soon as we add joins. However, we show +that with fine-tuning, we can still utilize pre-trained models for cardinality +estimation, significantly reducing training overheads compared to instance-specific +models. We are open sourcing our scripts to collect statistics, +generate queries and training datasets to foster more extensive research, also +from the ML community, on the important problem of cardinality estimation and in +particular improve on recent directions such as pre-trained cardinality +estimation. + 
+
+
+
+
+ + ☆ Simulating realistic short tandem repeat capillary electrophoretic + signal using a generative adversarial network + + +
+ DNA profiles are made up from multiple series of electrophoretic signal +measuring fluorescence over time. Typically, human DNA analysts 'read' DNA +profiles using their experience to distinguish instrument noise, artefactual +signal, and signal corresponding to DNA fragments of interest. Recent work has +developed an artificial neural network, ANN, to carry out the task of +classifying fluorescence types into categories in DNA profile electrophoretic +signal. But the creation of the necessarily large amount of labelled training +data for the ANN is time consuming and expensive, and a limiting factor in the +ability to robustly train the ANN. If realistic, prelabelled, training data +could be simulated then this would remove the barrier to training an ANN with +high efficacy. Here we develop a generative adversarial network, GAN, modified +from the pix2pix GAN to achieve this task. With 1078 DNA profiles we train the +GAN and achieve the ability to simulate DNA profile information, and then use +the generator from the GAN as a 'realism filter' that applies the noise and +artefact elements exhibited in typical electrophoretic signal. + +
+
+ comment: 29 pages, 9 Figures +
+
+
+
+
+ + ☆ LeMON: Learning to Learn Multi-Operator Networks + + +
+ Single-operator learning involves training a deep neural network to learn a +specific operator, whereas recent work in multi-operator learning uses an +operator embedding structure to train a single neural network on data from +multiple operators. Thus, multi-operator learning is capable of predicting a +range of operators within one model. In this work, we propose pretraining and +fine-tuning strategies for solving PDEs using multi-operator learning. One key +aspect is that by increasing the number of families of operators used in +pretraining, a PDE foundation model can be fine-tuned to downstream tasks +involving new PDEs with a limited number of samples, thus outperforming single +operator neural networks. Specifically, a multi-operator learning model +pre-trained with data from diverse PDE families can predict unseen operators +after fine-tuning with only a limited number of operators from the new family, +enabling them to serve as a data-free PDE solver. We also show that the +proposed training and fine-tuning method is able to predict new operators in +zero-shot prediction without samples. Additionally, we introduce a PDE-agnostic +meta-learning algorithm to improve the adaptability of the model to various +PDEs by providing a better parameter initialization process. To address the +needs of applications with limited computing resources, we explore low-rank +adaptation methods that reduce computational costs while enhancing solver +accuracy. Lastly, by examining the scaling law with respect to the number of +operator families, we establish and highlight its potential for broad +adaptation in PDE-solving tasks. + +
+
+
+
+
+ + ☆ Free Lunch in the Forest: Functionally-Identical Pruning of Boosted Tree + Ensembles + + +
+ Tree ensembles, including boosting methods, are highly effective and widely +used for tabular data. However, large ensembles lack interpretability and +require longer inference times. We introduce a method to prune a tree ensemble +into a reduced version that is "functionally identical" to the original model. +In other words, our method guarantees that the prediction function stays +unchanged for any possible input. As a consequence, this pruning algorithm is +lossless for any aggregated metric. We formalize the problem of functionally +identical pruning on ensembles, introduce an exact optimization model, and +provide a fast yet highly effective method to prune large ensembles. Our +algorithm iteratively prunes considering a finite set of points, which is +incrementally augmented using an adversarial model. In multiple computational +experiments, we show that our approach is a "free lunch", significantly +reducing the ensemble size without altering the model's behavior. Thus, we can +preserve state-of-the-art performance at a fraction of the original model's +size. + +
+
+
+
+
+ + ☆ CLPNets: Coupled Lie-Poisson Neural Networks for Multi-Part Hamiltonian + Systems with Symmetries + + +
+ To accurately compute data-based prediction of Hamiltonian systems, +especially the long-term evolution of such systems, it is essential to utilize +methods that preserve the structure of the equations over time. We consider a +case that is particularly challenging for data-based methods: systems with +interacting parts that do not reduce to pure momentum evolution. Such systems +are essential in scientific computations. For example, any discretization of a +continuum elastic rod can be viewed as interacting elements that can move and +rotate in space, with each discrete element moving on the group of rotations +and translations $SE(3)$. + We develop a novel method of data-based computation and complete phase space +learning of such systems. We follow the original framework of \emph{SympNets} +(Jin et al, 2020) building the neural network from canonical phase space +mappings, and transformations that preserve the Lie-Poisson structure +(\emph{LPNets}) as in (Eldred et al, 2024). We derive a novel system of +mappings that are built into neural networks for coupled systems. We call such +networks Coupled Lie-Poisson Neural Networks, or \emph{CLPNets}. We consider +increasingly complex examples for the applications of CLPNets: rotation of two +rigid bodies about a common axis, the free rotation of two rigid bodies, and +finally the evolution of two connected and interacting $SE(3)$ components. Our +method preserves all Casimir invariants of each system to machine precision, +irrespective of the quality of the training data, and preserves energy to high +accuracy. Our method also shows good resistance to the curse of dimensionality, +requiring only a few thousand data points for all cases studied, with the +effective dimension varying from three to eighteen. Additionally, the method is +highly economical in memory requirements, requiring only about 200 parameters +for the most complex case considered. + +
+
+ comment: 52 pages, 9 figures +
+
+
+
+
+ + ☆ Does Data-Efficient Generalization Exacerbate Bias in Foundation Models? ECCV 2024 + + +
+ Foundation models have emerged as robust models with label efficiency in +diverse domains. In medical imaging, these models contribute to the advancement +of medical diagnoses due to the difficulty in obtaining labeled data. However, +it is unclear whether using a large amount of unlabeled data, biased by the +presence of sensitive attributes during pre-training, influences the fairness +of the model. This research examines the bias in the Foundation model +(RetFound) when it is applied to fine-tune the Brazilian Multilabel +Ophthalmological Dataset (BRSET), which has a different population than the +pre-training dataset. The model evaluation, in comparison with supervised +learning, shows that the Foundation Model has the potential to reduce the gap +between the maximum AUC and minimum AUC evaluations across gender and age +groups. However, in a data-efficient generalization, the model increases the +bias when the data amount decreases. These findings suggest that when deploying +a Foundation Model in real-life scenarios with limited data, the possibility of +fairness issues should be considered. + +
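A minimal sketch of the fairness gap reported above, i.e. the difference between the largest and smallest per-group AUC (illustrative code, not the paper's evaluation pipeline; the group labels and scores are synthetic).

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def auc_gap(y_true, y_score, group):
    # Per-group AUC and the max-min gap used as a simple fairness measure.
    aucs = {g: roc_auc_score(y_true[group == g], y_score[group == g])
            for g in np.unique(group)}
    return max(aucs.values()) - min(aucs.values()), aucs

# Toy example with two demographic groups.
rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, 1000)
y_score = y_true * 0.6 + rng.random(1000) * 0.7
group = rng.choice(["female", "male"], 1000)
gap, per_group = auc_gap(y_true, y_score, group)
print(gap, per_group)
```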
+
+ comment: Preprint of paper to be presented at Fairness and Ethics Towards + Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during + ECCV 2024 +
+
+
+
+
+ + ☆ Improving the Prediction of Individual Engagement in Recommendations + Using Cognitive Models + + +
+ For public health programs with limited resources, the ability to predict how +behaviors change over time and in response to interventions is crucial for +deciding when and to whom interventions should be allocated. Using data from a +real-world maternal health program, we demonstrate how a cognitive model based +on Instance-Based Learning (IBL) Theory can augment existing purely +computational approaches. Our findings show that, compared to general +time-series forecasters (e.g., LSTMs), IBL models, which reflect human +decision-making processes, better predict the dynamics of individuals' states. +Additionally, IBL provides estimates of the volatility in individuals' states +and their sensitivity to interventions, which can improve the efficiency of +training of other time series models. + +
+
+
+
+
+ + ♻ ☆ Embedded FPGA Developments in 130nm and 28nm CMOS for Machine Learning + in Particle Detector Readout + + +
+ Embedded field programmable gate array (eFPGA) technology allows the +implementation of reconfigurable logic within the design of an +application-specific integrated circuit (ASIC). This approach offers the low +power and efficiency of an ASIC along with the ease of FPGA configuration, +particularly beneficial for the use case of machine learning in the data +pipeline of next-generation collider experiments. An open-source framework +called "FABulous" was used to design eFPGAs using 130 nm and 28 nm CMOS +technology nodes, which were subsequently fabricated and verified through +testing. The capability of an eFPGA to act as a front-end readout chip was +assessed using simulation of high energy particles passing through a silicon +pixel sensor. A machine learning-based classifier, designed for reduction of +sensor data at the source, was synthesized and configured onto the eFPGA. A +successful proof-of-concept was demonstrated through reproduction of the +expected algorithm result on the eFPGA with perfect accuracy. Further +development of the eFPGA technology and its application to collider detector +readout is discussed. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Flextron: Many-in-One Flexible Large Language Model + + +
+ Training modern LLMs is extremely resource intensive, and customizing them +for various deployment scenarios characterized by limited compute and memory +resources through repeated training is impractical. In this paper, we introduce +Flextron, a network architecture and post-training model optimization framework +supporting flexible model deployment. The Flextron architecture utilizes a +nested elastic structure to rapidly adapt to specific user-defined latency and +accuracy targets during inference with no additional fine-tuning required. It +is also input-adaptive, and can automatically route tokens through its +sub-networks for improved performance and efficiency. We present a +sample-efficient training method and associated routing algorithms for +systematically transforming an existing trained LLM into a Flextron model. We +evaluate Flextron on the GPT-3 and LLama-2 family of LLMs, and demonstrate +superior performance over multiple end-to-end trained variants and other +state-of-the-art elastic networks, all with a single pretraining run that +consumes a mere 7.63% tokens compared to original pretraining. + +
+
+
+
+
+ + ♻ ☆ Examining Pathological Bias in a Generative Adversarial Network + Discriminator: A Case Study on a StyleGAN3 Model + + +
+ Generative adversarial networks (GANs) generate photorealistic faces that are +often indistinguishable by humans from real faces. While biases in machine +learning models are often assumed to be due to biases in training data, we find +pathological internal color and luminance biases in the discriminator of a +pre-trained StyleGAN3-r model that are not explicable by the training data. We +also find that the discriminator systematically stratifies scores by both +image- and face-level qualities and that this disproportionately affects images +across gender, race, and other categories. We examine axes common in research +on stereotyping in social psychology. + +
+
+
+
+
+ + ♻ ☆ GINN-KAN: Interpretability pipelining with applications in Physics + Informed Neural Networks + + +
+ Neural networks are powerful function approximators, yet their "black-box" +nature often renders them opaque and difficult to interpret. While many +post-hoc explanation methods exist, they typically fail to capture the +underlying reasoning processes of the networks. A truly interpretable neural +network would be trained similarly to conventional models using techniques such +as backpropagation, but additionally provide insights into the learned +input-output relationships. In this work, we introduce the concept of +interpretability pipelining, incorporating multiple interpretability +techniques to outperform each individual technique. To this end, we first +evaluate several architectures that promise such interpretability, with a +particular focus on two recent models selected for their potential to +incorporate interpretability into standard neural network architectures while +still leveraging backpropagation: the Growing Interpretable Neural Network +(GINN) and Kolmogorov Arnold Networks (KAN). We analyze the limitations and +strengths of each and introduce a novel interpretable neural network, GINN-KAN, +that synthesizes the advantages of both models. When tested on the Feynman +symbolic regression benchmark datasets, GINN-KAN outperforms both GINN and KAN. +To highlight the capabilities and the generalizability of this approach, we +position GINN-KAN as an alternative to conventional black-box networks in +Physics-Informed Neural Networks (PINNs). We expect this to have far-reaching +implications in the application of deep learning pipelines in the natural +sciences. Our experiments with this interpretable PINN on 15 different partial +differential equations demonstrate that GINN-KAN augmented PINNs outperform +PINNs with black-box networks in solving differential equations and surpass the +capabilities of both GINN and KAN. + 
+
+
+
+
+ + ♻ ☆ A Deep Learning Based Resource Allocator for Communication Systems with + Dynamic User Utility Demands + + +
+ Deep learning (DL) based resource allocation (RA) has recently gained +significant attention due to its performance efficiency. However, most related +studies assume an ideal case where the number of users and their utility +demands, e.g., data rate constraints, are fixed, and the designed DL-based RA +scheme exploits a policy trained only for these fixed parameters. Consequently, +computationally complex policy retraining is required whenever these parameters +change. In this paper, we introduce a DL-based resource allocator (ALCOR) that +allows users to adjust their utility demands freely, such as based on their +application layer requirements. ALCOR employs deep neural networks (DNNs) as +the policy in a time-sharing problem. The underlying optimization algorithm +iteratively optimizes the on-off status of users to satisfy their utility +demands in expectation. The policy performs unconstrained RA (URA)--RA without +considering user utility demands--among active users to maximize the sum +utility (SU) at each time instant. Depending on the chosen URA scheme, ALCOR +can perform RA in either a centralized or distributed scenario. Derived +convergence analyses provide guarantees for ALCOR's convergence, and numerical +experiments corroborate its effectiveness. + +
+
+
+
+
+ + ♻ ☆ Geometric Neural Network based on Phase Space for BCI-EEG decoding + + +
+ Objective: The integration of Deep Learning (DL) algorithms on brain signal +analysis is still in its nascent stages compared to their success in fields +like Computer Vision. This is particularly true for BCI, where the brain +activity is decoded to control external devices without requiring muscle +control. Electroencephalography (EEG) is a widely adopted choice for designing +BCI systems due to its non-invasive and cost-effective nature and excellent +temporal resolution. Still, it comes at the expense of limited training data, +poor signal-to-noise, and a large variability across and within-subject +recordings. Finally, setting up a BCI system with many electrodes takes a long +time, hindering the widespread adoption of reliable DL architectures in BCIs +outside research laboratories. To improve adoption, we need to improve user +comfort using, for instance, reliable algorithms that operate with few +electrodes. Approach: Our research aims to develop a DL algorithm that delivers +effective results with a limited number of electrodes. Taking advantage of the +Augmented Covariance Method and the framework of SPDNet, we propose the +Phase-SPDNet architecture and analyze its performance and the interpretability +of the results. The evaluation is conducted on 5-fold cross-validation, using +only three electrodes positioned above the Motor Cortex. The methodology was +tested on nearly 100 subjects from several open-source datasets using the +Mother Of All BCI Benchmark (MOABB) framework. Main results: The results of our +Phase-SPDNet demonstrate that the augmented approach combined with the SPDNet +significantly outperforms all the current state-of-the-art DL architecture in +MI decoding. Significance: This new architecture is explainable and with a low +number of trainable parameters. + +
+
+
+
+
+ + ♻ ☆ On-Device Training of Fully Quantized Deep Neural Networks on Cortex-M + Microcontrollers + + +
+ On-device training of DNNs allows models to adapt and fine-tune to newly +collected data or changing domains while deployed on microcontroller units +(MCUs). However, DNN training is a resource-intensive task, making the +implementation and execution of DNN training algorithms on MCUs challenging due +to low processor speeds, constrained throughput, limited floating-point +support, and memory constraints. In this work, we explore on-device training of +DNNs for Cortex-M MCUs. We present a method that enables efficient training of +DNNs completely in place on the MCU using fully quantized training (FQT) and +dynamic partial gradient updates. We demonstrate the feasibility of our +approach on multiple vision and time-series datasets and provide insights into +the tradeoff between training accuracy, memory overhead, energy, and latency on +real hardware. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Provable Probabilistic Imaging using Score-Based Generative Priors + + +
+ Estimating high-quality images while also quantifying their uncertainty are +two desired features in an image reconstruction algorithm for solving ill-posed +inverse problems. In this paper, we propose plug-and-play Monte Carlo (PMC) as +a principled framework for characterizing the space of possible solutions to a +general inverse problem. PMC is able to incorporate expressive score-based +generative priors for high-quality image reconstruction while also performing +uncertainty quantification via posterior sampling. In particular, we develop +two PMC algorithms that can be viewed as the sampling analogues of the +traditional plug-and-play priors (PnP) and regularization by denoising (RED) +algorithms. To improve the sampling efficiency, we introduce weighted annealing +into these PMC algorithms, further developing two additional annealed PMC +algorithms (APMC). We establish a theoretical analysis for characterizing the +convergence behavior of PMC algorithms. Our analysis provides non-asymptotic +stationarity guarantees in terms of the Fisher information, fully compatible +with the joint presence of weighted annealing, potentially non-log-concave +likelihoods, and imperfect score networks. We demonstrate the performance of +the PMC algorithms on multiple representative inverse problems with both linear +and nonlinear forward models. Experimental results show that PMC significantly +improves reconstruction quality and enables high-fidelity uncertainty +quantification. + +
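A toy sketch of posterior sampling with a plug-in score prior (not the authors' PMC/APMC algorithms): an unadjusted-Langevin-style update combines the data-fit gradient of a linear forward model with a prior score; here the learned score network is replaced by the analytic score of a standard Gaussian so the snippet is self-contained.

```python
import numpy as np

rng = np.random.default_rng(0)
n, m = 16, 8
A = rng.normal(size=(m, n))                 # linear forward model y = A x + noise
x_true = rng.normal(size=n)
sigma = 0.1
y = A @ x_true + sigma * rng.normal(size=m)

def score_prior(x):
    return -x                               # stand-in for a learned score network s_theta(x): score of N(0, I)

def grad_log_likelihood(x):
    return A.T @ (y - A @ x) / sigma**2

step = 1e-4
x = np.zeros(n)
samples = []
for k in range(20000):
    grad = grad_log_likelihood(x) + score_prior(x)
    x = x + step * grad + np.sqrt(2 * step) * rng.normal(size=n)
    if k > 10000:                           # discard burn-in, keep posterior samples
        samples.append(x.copy())
samples = np.array(samples)
print("posterior mean estimate:", samples.mean(axis=0)[:4])
```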
+
+
+
+
+ + ♻ ☆ Correlation recurrent units: A novel neural architecture for improving + the predictive performance of time-series data + + +
+ The time-series forecasting (TSF) problem is a traditional problem in the +field of artificial intelligence. Models such as Recurrent Neural Networks +(RNN), Long Short-Term Memory (LSTM), and Gated Recurrent Units (GRU) have +contributed to improving the predictive accuracy of TSF. Furthermore, model +structures have been proposed to combine time-series decomposition methods, +such as seasonal-trend decomposition using Loess (STL), to ensure improved +predictive accuracy. However, because this approach learns an +independent model for each component, it cannot learn the relationships between +time-series components. In this study, we propose a new neural architecture +called a correlation recurrent unit (CRU) that can perform time series +decomposition within a neural cell and learn correlations (autocorrelation and +correlation) between the decomposition components. The proposed neural +architecture was evaluated through comparative experiments with previous +studies using five univariate time-series datasets and four multivariate +time-series datasets. The results showed that long- and short-term predictive +performance was improved by more than 10%. The experimental results show that +the proposed CRU is an excellent method for TSF problems compared to other +neural architectures. + 
+
+
+
+
+ + ♻ ☆ RecurrentGemma: Moving Past Transformers for Efficient Open Language + Models + + +
+ We introduce RecurrentGemma, a family of open language models which uses +Google's novel Griffin architecture. Griffin combines linear recurrences with +local attention to achieve excellent performance on language. It has a +fixed-sized state, which reduces memory use and enables efficient inference on +long sequences. We provide two sizes of models, containing 2B and 9B +parameters, and provide pre-trained and instruction tuned variants for both. +Our models achieve comparable performance to similarly-sized Gemma baselines +despite being trained on fewer tokens. + +
+
+
+
+
+ + ♻ ☆ A Statistical Framework of Watermarks for Large Language Models: Pivot, + Detection Efficiency and Optimal Rules + + +
+ Since ChatGPT was introduced in November 2022, embedding (nearly) +unnoticeable statistical signals into text generated by large language models +(LLMs), also known as watermarking, has been used as a principled approach to +provable detection of LLM-generated text from its human-written counterpart. In +this paper, we introduce a general and flexible framework for reasoning about +the statistical efficiency of watermarks and designing powerful detection +rules. Inspired by the hypothesis testing formulation of watermark detection, +our framework starts by selecting a pivotal statistic of the text and a secret +key -- provided by the LLM to the verifier -- to enable controlling the false +positive rate (the error of mistakenly detecting human-written text as +LLM-generated). Next, this framework allows one to evaluate the power of +watermark detection rules by obtaining a closed-form expression of the +asymptotic false negative rate (the error of incorrectly classifying +LLM-generated text as human-written). Our framework further reduces the problem +of determining the optimal detection rule to solving a minimax optimization +program. We apply this framework to two representative watermarks -- one of +which has been internally implemented at OpenAI -- and obtain several findings +that can be instrumental in guiding the practice of implementing watermarks. In +particular, we derive optimal detection rules for these watermarks under our +framework. These theoretically derived detection rules are demonstrated to be +competitive and sometimes enjoy a higher power than existing detection +approaches through numerical experiments. + +
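As a toy illustration of detection with a pivotal statistic (an assumed Gumbel-style watermark, not necessarily either watermark analyzed in the paper): the verifier recomputes key-seeded pseudorandom numbers u_t for each token; under human-written text these are i.i.d. uniform, so the statistic below follows a known Gamma law, which pins the threshold at any desired false positive rate.

```python
import numpy as np
from scipy.stats import gamma

def detect(u, false_positive_rate=0.01):
    # Pivotal statistic: Gamma(n, 1)-distributed under the human-written null hypothesis.
    statistic = np.sum(-np.log(1.0 - u))
    threshold = gamma.ppf(1.0 - false_positive_rate, a=len(u))
    return statistic > threshold

rng = np.random.default_rng(0)
human_like = rng.uniform(size=200)                 # i.i.d. uniform under the null
watermarked = rng.beta(4, 1, size=200)             # skewed toward 1 by watermarked sampling
print(detect(human_like), detect(watermarked))     # expect: False, True
```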
+
+
+
+
+ + ♻ ☆ Guaranteed Coverage Prediction Intervals with Gaussian Process + Regression + + +
+ Gaussian Process Regression (GPR) is a popular regression method which, +unlike most Machine Learning techniques, provides estimates of uncertainty for +its predictions. These uncertainty estimates, however, are based on the +assumption that the model is well-specified, an assumption that is violated in +most practical applications, since the required knowledge is rarely available. +As a result, the produced uncertainty estimates can become very misleading; for +example, the prediction intervals (PIs) produced for the 95% confidence level +may cover much less than 95% of the true labels. To address this issue, this +paper introduces an extension of GPR based on a Machine Learning framework +called Conformal Prediction (CP). This extension guarantees the production of +PIs with the required coverage even when the model is completely misspecified. +The proposed approach combines the advantages of GPR with the valid coverage +guarantee of CP, while experimental results demonstrate its +superiority over existing methods. + 
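A minimal sketch of the combination described above (our illustration, not the paper's exact construction): split conformal prediction layered on top of scikit-learn's GaussianProcessRegressor, so the intervals attain the requested marginal coverage even if the GP is misspecified.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(300, 1))
y = np.sin(X[:, 0]) + 0.3 * rng.standard_t(df=3, size=300)   # heavy-tailed, non-Gaussian noise

X_train, y_train = X[:150], y[:150]
X_cal, y_cal = X[150:250], y[150:250]        # held-out calibration split
X_test = X[250:]

gp = GaussianProcessRegressor(alpha=0.1).fit(X_train, y_train)
mu_cal, std_cal = gp.predict(X_cal, return_std=True)
scores = np.abs(y_cal - mu_cal) / std_cal    # normalized conformity scores

alpha = 0.05
q = np.quantile(scores, np.ceil((1 - alpha) * (len(scores) + 1)) / len(scores))
mu, std = gp.predict(X_test, return_std=True)
lower, upper = mu - q * std, mu + q * std    # intervals with a distribution-free coverage guarantee
```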
+
+ comment: 12 pages. This article has been accepted for publication in IEEE + Transactions on Pattern Analysis and Machine Intelligence. This is the + author's version which has not been fully edited and content may change prior + to final publication. Citation information: DOI 10.1109/TPAMI.2024.3418214 +
+
+
+
+
+ + ♻ ☆ FRANC: A Lightweight Framework for High-Quality Code Generation SC + + +
+ In recent years, the use of automated source code generation utilizing +transformer-based generative models has expanded, and these models can generate +functional code according to the requirements of the developers. However, +recent research revealed that this automatically generated source code can +contain vulnerabilities and other quality issues. Despite researchers' and +practitioners' attempts to enhance code generation models, retraining and +fine-tuning large language models is time-consuming and resource-intensive. +Thus, we describe FRANC, a lightweight framework for recommending more secure +and high-quality source code derived from transformer-based code generation +models. FRANC includes a static filter to make the generated code compilable +with heuristics and a quality-aware ranker to sort the code snippets based on a +quality score. Moreover, the framework uses prompt engineering to fix +persistent quality issues. We evaluated the framework with five Python and Java +code generation models and six prompt datasets, including a newly created one +in this work (SOEval). The static filter improves the compilability of 9% to 46% of Java suggestions +and 10% to 43% of Python suggestions. The average +improvement over the NDCG@10 score for the ranking system is 0.0763, and the +repair techniques fix up to 80% of the prompts. FRANC takes, on +average, 1.98 seconds for Java; for Python, it takes 0.08 seconds. + 
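A toy sketch of the filter-then-rank pipeline (our illustration, not FRANC itself): a static compilability check via Python's ast module and a placeholder ranking score.

```python
import ast

def compilable(snippet: str) -> bool:
    # Static check only; the candidate code is never executed.
    try:
        ast.parse(snippet)
        return True
    except SyntaxError:
        return False

def rank(snippets):
    # Placeholder quality score: prefer compilable snippets, then shorter ones.
    return sorted(snippets, key=lambda s: (not compilable(s), len(s)))

candidates = [
    "def add(a, b):\n    return a + b\n",
    "def add(a, b)\n    return a + b\n",     # missing colon: pushed to the back of the ranking
]
print(rank(candidates)[0])
```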
+
+ comment: Accepted at the 24th IEEE International Conference on Source Code + Analysis and Manipulation (SCAM 2024) +
+
+
+
+
+ + ♻ ☆ The Fault in our Stars: Quality Assessment of Code Generation Benchmarks SC + + +
+ Large Language Models (LLMs) are gaining popularity among software engineers. +A crucial aspect of developing effective code generation LLMs is to evaluate +these models using a robust benchmark. Evaluation benchmarks with quality +issues can provide a false sense of performance. In this work, we conduct the +first-of-its-kind study of the quality of prompts within benchmarks used to +compare the performance of different code generation models. To conduct this +study, we analyzed 3,566 prompts from 9 code generation benchmarks to identify +quality issues in them. We also investigated whether fixing the identified +quality issues in the benchmarks' prompts affects a model's performance. We +also studied memorization issues of the evaluation dataset, which can put into +question a benchmark's trustworthiness. We found that code generation +evaluation benchmarks mainly focused on Python and coding exercises and had +very limited contextual dependencies to challenge the model. These datasets and +the developers' prompts suffer from quality issues like spelling and +grammatical errors, unclear sentences that fail to express developers' intent, and +improper documentation style. Fixing all these issues in the benchmarks can +lead to better performance for Python code generation, but no significant +improvement was observed for Java code generation. We also found evidence that +GPT-3.5-Turbo and CodeGen-2.5 models may have data contamination issues. + 
+
+ comment: Accepted at the 24th IEEE International Conference on Source Code + Analysis and Manipulation(SCAM 2024) +
+
+
+
+
+ + ♻ ☆ Unveiling the Statistical Foundations of Chain-of-Thought Prompting + Methods + + +
+ Chain-of-Thought (CoT) prompting and its variants have gained popularity as +effective methods for solving multi-step reasoning problems using pretrained +large language models (LLMs). In this work, we analyze CoT prompting from a +statistical estimation perspective, providing a comprehensive characterization +of its sample complexity. To this end, we introduce a multi-step latent +variable model that encapsulates the reasoning process, where the latent +variable encodes the task information. Under this framework, we demonstrate +that when the pretraining dataset is sufficiently large, the estimator formed +by CoT prompting is equivalent to a Bayesian estimator. This estimator +effectively solves the multi-step reasoning problem by aggregating a posterior +distribution inferred from the demonstration examples in the prompt. Moreover, +we prove that the statistical error of the CoT estimator can be decomposed into +two main components: (i) a prompting error, which arises from inferring the +true task using CoT prompts, and (ii) the statistical error of the pretrained +LLM. We establish that, under appropriate assumptions, the prompting error +decays exponentially to zero as the number of demonstrations increases. +Additionally, we explicitly characterize the approximation and generalization +errors of the pretrained LLM. Notably, we construct a transformer model that +approximates the target distribution of the multi-step reasoning problem with +an error that decreases exponentially in the number of transformer blocks. Our +analysis extends to other variants of CoT, including Self-Consistent CoT, +Tree-of-Thought, and Selection-Inference, offering a broad perspective on the +efficacy of these methods. We also provide numerical experiments to validate +the theoretical findings. + +
+
+ comment: 150 pages, 18 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ A Framework to Model ML Engineering Processes + + +
+ The development of Machine Learning (ML) based systems is complex and +requires multidisciplinary teams with diverse skill sets. This may lead to +communication issues or misapplication of best practices. Process models can +alleviate these challenges by standardizing task orchestration, providing a +common language to facilitate communication, and nurturing a collaborative +environment. Unfortunately, current process modeling languages are not suitable +for describing the development of such systems. In this paper, we introduce a +framework for modeling ML-based software development processes, built around a +domain-specific language and derived from an analysis of scientific and gray +literature. A supporting toolkit is also available. + +
+
+
+
+
+ + ♻ ☆ Stick to your Role! Stability of Personal Values Expressed in Large + Language Models + + +
+ The standard way to study Large Language Models (LLMs) with benchmarks or +psychology questionnaires is to provide many different queries from similar +minimal contexts (e.g. multiple choice questions). However, due to LLMs' highly +context-dependent nature, conclusions from such minimal-context evaluations may +be little informative about the model's behavior in deployment (where it will +be exposed to many new contexts). We argue that context-dependence +(specifically, value stability) should be studied as a specific property of +LLMs and used as another dimension of LLM comparison (alongside others such as +cognitive abilities, knowledge, or model size). We present a case-study on the +stability of value expression over different contexts (simulated conversations +on different topics) as measured using a standard psychology questionnaire +(PVQ) and on behavioral downstream tasks. Reusing methods from psychology, we +study Rank-order stability on the population (interpersonal) level, and +Ipsative stability on the individual (intrapersonal) level. We consider two +settings (with and without instructing LLMs to simulate particular personas), +two simulated populations, and three downstream tasks. We observe consistent +trends in the stability of models and model families - Mixtral, Mistral, +GPT-3.5 and Qwen families are more stable than LLaMa-2 and Phi. The consistency +of these trends implies that some models exhibit higher value stability than +others, and that stability can be estimated with the set of introduced +methodological tools. When instructed to simulate particular personas, LLMs +exhibit low Rank-order stability, which further diminishes with conversation +length. This highlights the need for future research on LLMs that coherently +simulate different personas. This paper provides a foundational step in that +direction, and, to our knowledge, it is the first study of value stability in +LLMs. + +
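A minimal sketch of one of the reused psychology measures, Rank-order stability, operationalized here (as an assumption) as the Spearman correlation of per-persona value scores across two contexts.

```python
import numpy as np
from scipy.stats import spearmanr

# Entries = a value score (e.g. a PVQ dimension) for five simulated personas in two contexts.
scores_context_a = np.array([4.2, 3.1, 5.0, 2.8, 4.7])
scores_context_b = np.array([4.0, 3.3, 4.9, 2.5, 4.6])

stability, _ = spearmanr(scores_context_a, scores_context_b)
print(f"Rank-order stability between contexts: {stability:.2f}")
```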
+
+ comment: The project website and code are available at + https://sites.google.com/view/llmvaluestability Published in PLOS ONE ( + https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0309114 ), + and a shorter version at CogSci 24 ( + https://escholarship.org/uc/item/7w4823c6 ) +
+
+
+
+
+ + ♻ ☆ A Metric-based Principal Curve Approach for Learning One-dimensional + Manifold + + +
+ The principal curve is a well-known statistical method rooted in manifold +learning that uses concepts from differential geometry. In this paper, we propose a +novel metric-based principal curve (MPC) method that learns the one-dimensional +manifold of spatial data. Experiments on synthetic datasets and real applications using the MNIST +dataset show that our method can learn the one-dimensional manifold well in +terms of shape. + 
+
+
+
+
+ + ♻ ☆ Marked Neural Spatio-Temporal Point Process Involving a Dynamic Graph + Neural Network + + +
+ Temporal Point Processes (TPPs) have recently become increasingly interesting +for learning dynamics in graph data. A reason for this is that learning on +dynamic graph data is becoming more relevant, since data from many scientific +fields, ranging from mathematics, biology, social sciences, and physics to +computer science, is naturally related and inherently dynamic. In addition, +TPPs provide a meaningful characterization of event streams and a prediction +mechanism for future events. Therefore, (semi-)parameterized Neural TPPs have +been introduced whose characterization can be (partially) learned and, thus, +enable the representation of more complex phenomena. However, the research on +modeling dynamic graphs with TPPs is relatively young, and only a few models +for node attribute changes or evolving edges have been proposed yet. To allow +for learning on fully dynamic graph streams, i.e., graphs that can change in +their structure (addition/deletion of nodes/edge) and in their node/edge +attributes, we propose a Marked Neural Spatio-Temporal Point Process (MNSTPP). +It leverages a Dynamic Graph Neural Network to learn a Marked TPP that handles +attributes and spatial data to model and predict any event in a graph stream. + +
+
+
+
+
+ + ♻ ☆ Analysis of Diagnostics (Part I): Prevalence, Uncertainty + Quantification, and Machine Learning + + +
+ Diagnostic testing provides a unique setting for studying and developing +tools in classification theory. In such contexts, the concept of prevalence, +i.e. the number of individuals with a given condition, is fundamental, both as +an inherent quantity of interest and as a parameter that controls +classification accuracy. This manuscript is the first in a two-part series that +studies deeper connections between classification theory and prevalence, +showing how the latter establishes a more complete theory of uncertainty +quantification (UQ) for certain types of machine learning (ML). We motivate +this analysis via a lemma demonstrating that general classifiers minimizing a +prevalence-weighted error contain the same probabilistic information as +Bayes-optimal classifiers, which depend on conditional probability densities. +This leads us to study relative probability level-sets $B^\star (q)$, which are +reinterpreted as both classification boundaries and useful tools for +quantifying uncertainty in class labels. To realize this in practice, we also +propose a numerical, homotopy algorithm that estimates the $B^\star (q)$ by +minimizing a prevalence-weighted empirical error. The successes and +shortcomings of this method motivate us to revisit properties of the level +sets, and we deduce the corresponding classifiers obey a useful monotonicity +property that stabilizes the numerics and points to important extensions to UQ +of ML. Throughout, we validate our methods in the context of synthetic data and +a research-use-only SARS-CoV-2 enzyme-linked immunosorbent (ELISA) assay. + +
+
+
+
+
+ + ♻ ☆ When Multi-Task Learning Meets Partial Supervision: A Computer Vision + Review + + +
+ Multi-Task Learning (MTL) aims to learn multiple tasks simultaneously while +exploiting their mutual relationships. By using shared resources to +simultaneously calculate multiple outputs, this learning paradigm has the +potential to have lower memory requirements and inference times compared to the +traditional approach of using separate methods for each task. Previous work in +MTL has mainly focused on fully-supervised methods, as task relationships can +not only be leveraged to lower the level of data-dependency of those methods +but they can also improve performance. However, MTL introduces a set of +challenges due to a complex optimisation scheme and a higher labeling +requirement. This review focuses on how MTL could be utilised under different +partial supervision settings to address these challenges. First, this review +analyses how MTL traditionally uses different parameter sharing techniques to +transfer knowledge in between tasks. Second, it presents the different +challenges arising from such a multi-objective optimisation scheme. Third, it +introduces how task groupings can be achieved by analysing task relationships. +Fourth, it focuses on how partially supervised methods applied to MTL can +tackle the aforementioned challenges. Lastly, this review presents the +available datasets, tools and benchmarking results of such methods. + +
+
+ comment: Accepted by Proceedings of the IEEE +
+
+
+
+
+ + ♻ ☆ QEDCartographer: Automating Formal Verification Using Reward-Free + Reinforcement Learning ICSE + + +
+ Formal verification is a promising method for producing reliable software, +but the difficulty of manually writing verification proofs severely limits its +utility in practice. Recent methods have automated some proof synthesis by +guiding a search through the proof space using a theorem prover. Unfortunately, +the theorem prover provides only the crudest estimate of progress, resulting in +effectively undirected search. To address this problem, we create +QEDCartographer, an automated proof-synthesis tool that combines supervised and +reinforcement learning to more effectively explore the proof space. +QEDCartographer incorporates the proofs' branching structure, enabling +reward-free search and overcoming the sparse reward problem inherent to formal +verification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K +theorems from 124 open-source Coq projects. QEDCartographer fully automatically +proves 21.4% of the test-set theorems. Previous search-based proof-synthesis +tools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on +supervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively. +Diva, which combines 62 tools, proves 19.2%. Comparing to the most effective +prior tool, Proverbot9001, QEDCartographer produces 26% shorter proofs 27% +faster, on average over the theorems both tools prove. Together, +QEDCartographer and non-learning-based CoqHammer prove 31.8% of the theorems, +while CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement +learning is a fruitful research direction for improving proof-synthesis tools' +search mechanisms. + +
+
+ comment: Published in the International Conference on Software Engineering + (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan + Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal + Verification Using Reward-Free Reinforcement Learning, in Proceedings of the + 47th International Conference on Software Engineering (ICSE), 2025 +
+
+
+
+
+ + ♻ ☆ Research on the Spatial Data Intelligent Foundation Model + + +
+ This report focuses on spatial data intelligent large models, delving into +the principles, methods, and cutting-edge applications of these models. It +provides an in-depth discussion on the definition, development history, current +status, and trends of spatial data intelligent large models, as well as the +challenges they face. The report systematically elucidates the key technologies +of spatial data intelligent large models and their applications in urban +environments, aerospace remote sensing, geography, transportation, and other +scenarios. Additionally, it summarizes the latest application cases of spatial +data intelligent large models in themes such as urban development, multimodal +systems, remote sensing, smart transportation, and resource environments. +Finally, the report concludes with an overview and outlook on the development +prospects of spatial data intelligent large models. + +
+
+ comment: V1 and V2 are in Chinese language, other versions are in English +
+
+
+
+
+ + ♻ ☆ FADE: Towards Fairness-aware Augmentation for Domain Generalization via + Classifier-Guided Score-based Diffusion Models + + +
+ Fairness-aware domain generalization (FairDG) has emerged as a critical +challenge for deploying trustworthy AI systems, particularly in scenarios +involving distribution shifts. Traditional methods for addressing fairness have +failed in domain generalization due to their lack of consideration for +distribution shifts. Although disentanglement has been used to tackle FairDG, +it is limited by its strong assumptions. To overcome these limitations, we +propose Fairness-aware Classifier-Guided Score-based Diffusion Models (FADE) as +a novel approach to effectively address the FairDG issue. Specifically, we +first pre-train a score-based diffusion model (SDM) and two classifiers to +equip the model with strong generalization capabilities across different +domains. Then, we guide the SDM using these pre-trained classifiers to +effectively eliminate sensitive information from the generated data. Finally, +the generated fair data is used to train downstream classifiers, ensuring +robust performance under new data distributions. Extensive experiments on three +real-world datasets demonstrate that FADE not only enhances fairness but also +improves accuracy in the presence of distribution shifts. Additionally, FADE +outperforms existing methods in achieving the best accuracy-fairness +trade-offs. + +
+
+
+
+
+ + ♻ ☆ Re-Nerfing: Improving Novel View Synthesis through Novel View Synthesis + + +
+ Recent neural rendering and reconstruction techniques, such as NeRFs or +Gaussian Splatting, have shown remarkable novel view synthesis capabilities but +require hundreds of images of the scene from diverse viewpoints to render +high-quality novel views. With fewer images available, these methods start to +fail since they can no longer correctly triangulate the underlying 3D geometry +and converge to a non-optimal solution. These failures can manifest as floaters +or blurry renderings in sparsely observed areas of the scene. In this paper, we +propose Re-Nerfing, a simple and general add-on approach that leverages novel +view synthesis itself to tackle this problem. Using an already trained NVS +method, we render novel views between existing ones and augment the training +data to optimize a second model. This introduces additional multi-view +constraints and allows the second model to converge to a better solution. With +Re-Nerfing we achieve significant improvements upon multiple pipelines based on +NeRF and Gaussian-Splatting in sparse view settings of the mip-NeRF 360 and +LLFF datasets. Notably, Re-Nerfing does not require prior knowledge or extra +supervision signals, making it a flexible and practical add-on. + +
+
+ comment: Code will be released upon acceptance +
+
+
+
+
+ + ♻ ☆ Contextual Bandit with Herding Effects: Algorithms and Recommendation + Applications PRICAI 2024 + + +
+ Contextual bandits serve as a fundamental algorithmic framework for +optimizing recommendation decisions online. Though extensive attention has been +paid to tailoring contextual bandits for recommendation applications, the +"herding effects" in user feedback have been ignored. These herding effects +bias user feedback toward historical ratings, breaking down the assumption of +unbiased feedback inherent in contextual bandits. This paper develops a novel +variant of the contextual bandit that is tailored to address the feedback bias +caused by the herding effects. A user feedback model is formulated to capture +this feedback bias. We design the TS-Conf (Thompson Sampling under Conformity) +algorithm, which employs posterior sampling to balance the exploration and +exploitation tradeoff. We prove an upper bound for the regret of the algorithm, +revealing the impact of herding effects on learning speed. Extensive +experiments on datasets demonstrate that TS-Conf outperforms four benchmark +algorithms. Analysis reveals that TS-Conf effectively mitigates the negative +impact of herding effects, resulting in faster learning and improved +recommendation accuracy. + +
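A toy sketch of posterior sampling under herding-biased feedback (illustrative only, not the TS-Conf algorithm or its feedback model): Beta-Bernoulli Thompson sampling where the observed reward probability is pulled toward a displayed historical rating.

```python
import numpy as np

rng = np.random.default_rng(0)
true_ctr = np.array([0.3, 0.5, 0.7])          # true click probabilities of three items
historical = np.array([0.9, 0.5, 0.1])        # displayed historical ratings
conformity = 0.4                              # assumed weight of herding toward the historical rating
alpha = np.ones(3)
beta = np.ones(3)                             # Beta(1, 1) posteriors for each arm

for t in range(5000):
    arm = np.argmax(rng.beta(alpha, beta))                         # posterior sampling step
    p = (1 - conformity) * true_ctr[arm] + conformity * historical[arm]
    reward = rng.random() < p                                      # herding-biased feedback
    alpha[arm] += reward
    beta[arm] += 1 - reward
print("posterior means:", alpha / (alpha + beta))
```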
+
+ comment: Published as a conference paper at PRICAI 2024 +
+
+
+
+
+ + ♻ ☆ Sensitivity-Aware Amortized Bayesian Inference + + +
+ Sensitivity analyses reveal the influence of various modeling choices on the +outcomes of statistical analyses. While theoretically appealing, they are +overwhelmingly inefficient for complex Bayesian models. In this work, we +propose sensitivity-aware amortized Bayesian inference (SA-ABI), a multifaceted +approach to efficiently integrate sensitivity analyses into simulation-based +inference with neural networks. First, we utilize weight sharing to encode the +structural similarities between alternative likelihood and prior specifications +in the training process with minimal computational overhead. Second, we +leverage the rapid inference of neural networks to assess sensitivity to data +perturbations and preprocessing steps. In contrast to most other Bayesian +approaches, both steps circumvent the costly bottleneck of refitting the model +for each choice of likelihood, prior, or data set. Finally, we propose to use +deep ensembles to detect sensitivity arising from unreliable approximation +(e.g., due to model misspecification). We demonstrate the effectiveness of our +method in applied modeling problems, ranging from disease outbreak dynamics and +global warming thresholds to human decision-making. Our results support +sensitivity-aware inference as a default choice for amortized Bayesian +workflows, automatically providing modelers with insights into otherwise hidden +dimensions. + +
+
+ comment: Published in TMLR (2024) +
+
+
+
+
+ + ♻ ☆ Articulation Work and Tinkering for Fairness in Machine Learning + + +
+ The field of fair AI aims to counter biased algorithms through computational +modelling. However, it faces increasing criticism for perpetuating the use of +overly technical and reductionist methods. As a result, novel approaches appear +in the field to address more socially-oriented and interdisciplinary (SOI) +perspectives on fair AI. In this paper, we take this dynamic as the starting +point to study the tension between computer science (CS) and SOI research. By +drawing on STS and CSCW theory, we position fair AI research as a matter of +'organizational alignment': what makes research 'doable' is the successful +alignment of three levels of work organization (the social world, the +laboratory, and the experiment). Based on qualitative interviews with CS +researchers, we analyze the tasks, resources, and actors required for doable +research in the case of fair AI. We find that CS researchers engage with SOI +research to some extent, but organizational conditions, articulation work, and +ambiguities of the social world constrain the doability of SOI research for +them. Based on our findings, we identify and discuss problems for aligning CS +and SOI as fair AI continues to evolve. + +
+
+
+
+
+ + ♻ ☆ Forecasting Intraday Power Output by a Set of PV Systems using Recurrent + Neural Networks and Physical Covariates + + +
+ Accurate intraday forecasts of the power output by PhotoVoltaic (PV) systems +are critical to improve the operation of energy distribution grids. We describe +a neural autoregressive model that aims to perform such intraday forecasts. We +build upon a physical, deterministic PV performance model, the output of which +is used as covariates in the context of the neural model. In addition, our +application data relates to a geographically distributed set of PV systems. We +address all PV sites with a single neural model, which embeds the information +about the PV site in specific covariates. We use a scale-free approach which +relies on the explicit modeling of seasonal effects. Our proposal repurposes a +model initially used in the retail sector and discloses a novel truncated +Gaussian output distribution. An ablation study and a comparison to alternative +architectures from the literature shows that the components in the best +performing proposed model variant work synergistically to reach a skill score +of 15.72% with respect to the physical model, used as a baseline. + +
+
+ comment: 25 pages, 7 figures, Accepted for publication in Neural Computing and + Applications on 12/07/2024 +
+
+
+
+
+ + ♻ ☆ Language-specific Calibration for Pruning Multilingual Language Models + + +
+ Recent advances in large language model (LLM) pruning have shown +state-of-the-art compression results in post-training and retraining-free +settings while maintaining high predictive performance. However, such research +mainly considers calibrating pruning using English text, despite the +multilingual nature of modern LLMs and their frequent uses in non-English +languages. In this paper, we set out to explore effective strategies for +calibrating the pruning of multilingual language models. We present the first +comprehensive empirical study, comparing different calibration languages for +pruning multilingual models across diverse tasks, models, and state-of-the-art +pruning techniques. Our results present practical suggestions, for example, +calibrating in the target language can efficiently yield lower perplexity, but +does not necessarily benefit downstream tasks. Our further analysis experiments +unveil that calibration in the target language mainly contributes to preserving +language-specific features related to fluency and coherence, but might not +contribute to capturing language-agnostic features such as language +understanding and reasoning. Last, we provide practical recommendations for +future practitioners. + +
+
+
+
+
+ + ♻ ☆ Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal + Time Series Imputation CIKM'2024 + + +
+ Spatiotemporal time series are usually collected via monitoring sensors +placed at different locations, which usually contain missing values due to +various failures, such as mechanical damages and Internet outages. Imputing the +missing values is crucial for analyzing time series. When recovering a specific +data point, most existing methods consider all the information relevant to that +point regardless of the cause-and-effect relationship. During data collection, +it is inevitable that some unknown confounders are included, e.g., background +noise in time series and non-causal shortcut edges in the constructed sensor +network. These confounders could open backdoor paths and establish non-causal +correlations between the input and output. Over-exploiting these non-causal +correlations could cause overfitting. In this paper, we first revisit +spatiotemporal time series imputation from a causal perspective and show how to +block the confounders via the frontdoor adjustment. Based on the results of +frontdoor adjustment, we introduce a novel Causality-Aware Spatiotemporal Graph +Neural Network (Casper), which contains a novel Prompt Based Decoder (PBD) and +a Spatiotemporal Causal Attention (SCA). PBD could reduce the impact of +confounders and SCA could discover the sparse causal relationships among +embeddings. Theoretical analysis reveals that SCA discovers causal +relationships based on the values of gradients. We evaluate Casper on three +real-world datasets, and the experimental results show that Casper could +outperform the baselines and could effectively discover causal relationships. + +
+
+ comment: Accepted by CIKM'2024 +
+
+
+
+
+ + ♻ ☆ Inferring Individual Direct Causal Effects Under Heterogeneous Peer + Influence + + +
+ Causal inference in networks should account for interference, which occurs +when a unit's outcome is influenced by treatments or outcomes of peers. +Heterogeneous peer influence (HPI) occurs when a unit's outcome is influenced +differently by different peers based on their attributes and relationships, or +when each unit has a different susceptibility to peer influence. Existing +solutions to estimating direct causal effects under interference consider +either homogeneous influence from peers or specific heterogeneous influence +mechanisms (e.g., based on local neighborhood structure). This paper presents a +methodology for estimating individual direct causal effects in the presence of +HPI where the mechanism of influence is not known a priori. We propose a +structural causal model for networks that can capture different possible +assumptions about network structure, interference conditions, and causal +dependence and enables reasoning about identifiability in the presence of HPI. +We find potential heterogeneous contexts using the causal model and propose a +novel graph neural network-based estimator to estimate individual direct causal +effects. We show that state-of-the-art methods for individual direct effect +estimation produce biased results in the presence of HPI, and that our proposed +estimator is robust. + +
+
+
+
+
+ + ♻ ☆ A Platform-Agnostic Deep Reinforcement Learning Framework for Effective + Sim2Real Transfer towards Autonomous Driving + + +
+ Deep Reinforcement Learning (DRL) has shown remarkable success in solving +complex tasks across various research fields. However, transferring DRL agents +to the real world is still challenging due to the significant discrepancies +between simulation and reality. To address this issue, we propose a robust DRL +framework that leverages platform-dependent perception modules to extract +task-relevant information and train a lane-following and overtaking agent in +simulation. This framework facilitates the seamless transfer of the DRL agent +to new simulated environments and the real world with minimal effort. We +evaluate the performance of the agent in various driving scenarios in both +simulation and the real world, and compare it to human players and the PID +baseline in simulation. Our proposed framework significantly reduces the gaps +between different platforms and the Sim2Real gap, enabling the trained agent to +achieve similar performance in both simulation and the real world, driving the +vehicle effectively. + +
+
+
+
+
+ + ♻ ☆ Improving the forecast accuracy of wind power by leveraging multiple + hierarchical structure + + +
+ Renewable energy generation is of utmost importance for global +decarbonization. Forecasting renewable energies, particularly wind energy, is +challenging due to the inherent uncertainty in wind energy generation, which +depends on weather conditions. Recent advances in hierarchical forecasting +through reconciliation have demonstrated a significant increase in the quality +of wind energy forecasts for short-term periods. We leverage the +cross-sectional and temporal hierarchical structure of turbines in wind farms +and build cross-temporal hierarchies to further investigate how integrated +cross-sectional and temporal dimensions can add value to forecast accuracy in +wind farms. We found that cross-temporal reconciliation was superior to +individual cross-sectional reconciliation at multiple temporal aggregations. +Additionally, machine learning based forecasts that were cross-temporally +reconciled demonstrated high accuracy at coarser temporal granularities, which +may encourage adoption for short-term wind forecasts. Empirically, we provide +insights for decision-makers on the best methods for forecasting high-frequency +wind data across different forecasting horizons and levels. + +
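A minimal sketch of forecast reconciliation in its simplest, cross-sectional form (the paper uses cross-temporal hierarchies; this only shows the underlying idea): base forecasts are projected onto the coherent subspace defined by the aggregation matrix.

```python
import numpy as np

# Aggregation matrix S: rows = [farm total, turbine 1, turbine 2], columns = bottom-level series.
S = np.array([[1, 1],
              [1, 0],
              [0, 1]], dtype=float)

base_forecasts = np.array([10.0, 4.0, 5.5])   # incoherent: 4.0 + 5.5 != 10.0

# OLS reconciliation: project base forecasts onto the coherent subspace spanned by S.
P = np.linalg.inv(S.T @ S) @ S.T
reconciled = S @ (P @ base_forecasts)
print(reconciled)                              # coherent: the total now equals the sum of turbines
```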
+
+ comment: 41 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ DeepMIF: Deep Monotonic Implicit Fields for Large-Scale LiDAR 3D Mapping + + +
+ Recently, significant progress has been achieved in sensing real large-scale +outdoor 3D environments, particularly by using modern acquisition equipment +such as LiDAR sensors. Unfortunately, such sensors are fundamentally limited in their +ability to produce dense, complete 3D scenes. To address this issue, recent +learning-based methods integrate neural implicit representations and +optimizable feature grids to approximate surfaces of 3D scenes. However, +naively fitting samples along raw LiDAR rays leads to noisy 3D mapping results +due to the nature of sparse, conflicting LiDAR measurements. In this +work, we depart from fitting LiDAR data exactly and instead let the network +optimize a non-metric monotonic implicit field defined in 3D space. To fit our +field, we design a learning system integrating a monotonicity loss that enables +optimizing neural monotonic fields and leverages recent progress in large-scale +3D mapping. Our algorithm achieves high-quality dense 3D mapping performance as +captured by multiple quantitative and perceptual measures and visual results +obtained for Mai City, Newer College, and KITTI benchmarks. The code of our +approach will be made publicly available. + 
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ FERGI: Automatic Annotation of User Preferences for Text-to-Image + Generation from Spontaneous Facial Expression Reaction + + +
+ Researchers have proposed to use data of human preference feedback to +fine-tune text-to-image generative models. However, the scalability of human +feedback collection has been limited by its reliance on manual annotation. +Therefore, we develop and test a method to automatically score user preferences +from their spontaneous facial expression reaction to the generated images. We +collect a dataset of Facial Expression Reaction to Generated Images (FERGI) and +show that the activations of multiple facial action units (AUs) are highly +correlated with user evaluations of the generated images. We develop an FAU-Net +(Facial Action Units Neural Network), which receives inputs from an AU +estimation model, to automatically score user preferences for text-to-image +generation based on their facial expression reactions, which is complementary +to the pre-trained scoring models based on the input text prompts and generated +images. Integrating our FAU-Net valence score with the pre-trained scoring +models improves their consistency with human preferences. This method of +automatic annotation with facial expression analysis can be potentially +generalized to other generation tasks. The code is available at +https://github.com/ShuangquanFeng/FERGI, and the dataset is also available at +the same link for research purposes. + +
+
+
+
+
+ + ♻ ☆ Trade-off between Gradient Measurement Efficiency and Expressivity in + Deep Quantum Neural Networks + + +
+ Quantum neural networks (QNNs) require an efficient training algorithm to +achieve practical quantum advantages. A promising approach is the use of +gradient-based optimization algorithms, where gradients are estimated through +quantum measurements. However, general QNNs lack an efficient gradient +measurement algorithm, which poses a fundamental and practical challenge to +realizing scalable QNNs. In this work, we rigorously prove a trade-off between +gradient measurement efficiency, defined as the mean number of simultaneously +measurable gradient components, and expressivity in a wide class of deep QNNs, +elucidating the theoretical limits and possibilities of efficient gradient +estimation. This trade-off implies that a more expressive QNN requires a higher +measurement cost in gradient estimation, whereas we can increase gradient +measurement efficiency by reducing the QNN expressivity to suit a given task. +We further propose a general QNN ansatz called the stabilizer-logical product +ansatz (SLPA), which can reach the upper limit of the trade-off inequality by +leveraging the symmetric structure of the quantum circuit. In learning an +unknown symmetric function, the SLPA drastically reduces the quantum resources +required for training while maintaining accuracy and trainability compared to a +well-designed symmetric circuit based on the parameter-shift method. Our +results not only reveal a theoretical understanding of efficient training in +QNNs but also provide a standard and broadly applicable efficient QNN design. + +
+
+ comment: 31 pages, 11 figures +
+
+
+
+
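The measurement cost at the heart of this trade-off can be illustrated with the standard parameter-shift rule, where every parameter generally needs its own pair of circuit evaluations. The toy sketch below uses an analytic expectation value in place of a quantum backend, so it only illustrates the counting argument, not the paper's ansatz.

# Parameter-shift gradient estimation for a toy "QNN" expectation value:
# two evaluations per parameter, which is the per-component measurement cost.
import numpy as np

def expectation(thetas):
    # Toy cost: <Z> after RY(theta0) RY(theta1) on |0> is cos(theta0 + theta1).
    return np.cos(np.sum(thetas))

def parameter_shift_grad(f, thetas, shift=np.pi / 2):
    grads = np.zeros_like(thetas)
    for i in range(len(thetas)):
        plus, minus = thetas.copy(), thetas.copy()
        plus[i] += shift
        minus[i] -= shift
        grads[i] = 0.5 * (f(plus) - f(minus))   # two circuit runs per parameter
    return grads

thetas = np.array([0.3, 1.1])
print(parameter_shift_grad(expectation, thetas))
print(-np.sin(thetas.sum()))                    # analytic check: both entries equal this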
+ + ♻ ☆ Domain-decoupled Physics-informed Neural Networks with Closed-form + Gradients for Fast Model Learning of Dynamical Systems + + +
+ Physics-informed neural networks (PINNs) are trained using physical equations +and can also incorporate unmodeled effects by learning from data. PINNs for +control (PINCs) of dynamical systems are gaining interest due to their +prediction speed compared to classical numerical integration methods for +nonlinear state-space models, making them suitable for real-time control +applications. We introduce the domain-decoupled physics-informed neural network +(DD-PINN) to address current limitations of PINC in handling large and complex +nonlinear dynamical systems. The time domain is decoupled from the feed-forward +neural network to construct an Ansatz function, allowing for calculation of +gradients in closed form. This approach significantly reduces training times, +especially for large dynamical systems, compared to PINC, which relies on +graph-based automatic differentiation. Additionally, the DD-PINN inherently +fulfills the initial condition and supports higher-order excitation inputs, +simplifying the training process and enabling improved prediction accuracy. +Validation on three systems - a nonlinear mass-spring-damper, a +five-mass-chain, and a two-link robot - demonstrates that the DD-PINN achieves +significantly shorter training times. In cases where the PINC's prediction +diverges, the DD-PINN's prediction remains stable and accurate due to higher +physics loss reduction or use of a higher-order excitation input. The DD-PINN +allows for fast and accurate learning of large dynamical systems previously out +of reach for the PINC. + +
+
+ comment: Accepted to International Conference on Informatics in Control, + Automation and Robotics (ICINCO) 2024 +
+
+
+
+
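A minimal sketch of the decoupling idea follows, assuming (purely for illustration) a sinusoidal basis in time whose coefficients would come from the feed-forward network. The analytic time derivative is what removes the need for automatic differentiation through t, and x(0) = x0 holds by construction, echoing the initial-condition property mentioned above; the authors' actual ansatz may differ.

# Sketch of a time-decoupled ansatz with a closed-form time derivative.
import numpy as np

def basis(t, K=4):
    # Fixed sinusoidal basis in time and its analytic derivative.
    k = np.arange(1, K + 1)
    return np.sin(k * t), k * np.cos(k * t)

def ansatz(t, x0, coeffs):
    """x(t) = x0 + sum_k coeffs_k * sin(k t); returns x(t) and dx/dt in closed
    form. In a DD-PINN-style setup, `coeffs` would be produced by the network."""
    phi, dphi = basis(t, K=len(coeffs))
    return x0 + coeffs @ phi, coeffs @ dphi

x0, coeffs = 1.0, np.array([0.5, -0.2, 0.1, 0.05])
x_t, dxdt = ansatz(0.3, x0, coeffs)
print(x_t, dxdt)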
+ + ♻ ☆ Solid Waste Detection, Monitoring and Mapping in Remote Sensing Images: + A Survey + + +
+ The detection and characterization of illegal solid waste disposal sites are +essential for environmental protection, particularly for mitigating pollution +and health hazards. Improperly managed landfills contaminate soil and +groundwater via rainwater infiltration, posing threats to both animals and +humans. Traditional landfill identification approaches, such as on-site +inspections, are time-consuming and expensive. Remote sensing is a +cost-effective solution for the identification and monitoring of solid waste +disposal sites that enables broad coverage and repeated acquisitions over time. +Earth Observation (EO) satellites, equipped with an array of sensors and +imaging capabilities, have been providing high-resolution data for several +decades. Researchers proposed specialized techniques that leverage remote +sensing imagery to perform a range of tasks such as waste site detection, +dumping site monitoring, and assessment of suitable locations for new +landfills. This review aims to provide a detailed illustration of the most +relevant proposals for the detection and monitoring of solid waste sites by +describing and comparing the approaches, the implemented techniques, and the +employed data. Furthermore, since the data sources are of the utmost importance +for developing an effective solid waste detection model, a comprehensive +overview of the satellites and publicly available data sets is presented. +Finally, this paper identifies the open issues in the state-of-the-art and +discusses the relevant research directions for reducing the costs and improving +the effectiveness of novel solid waste detection methods. + +
+
+
+
+
+ + ♻ ☆ Beyond Uniform Query Distribution: Key-Driven Grouped Query Attention + + +
+ The Transformer architecture has revolutionized deep learning through its +Self-Attention mechanism, which effectively captures contextual information. +However, the memory footprint of Self-Attention presents significant challenges +for long-sequence tasks. Grouped Query Attention (GQA) addresses this issue by +grouping queries and mean-pooling the corresponding key-value heads - reducing +the number of overall parameters and memory requirements in a flexible manner +without adversely compromising model accuracy. In this work, we introduce +enhancements to GQA, focusing on two novel approaches that deviate from the +static nature of grouping: Key-Distributed GQA (KDGQA) and Dynamic +Key-Distributed GQA (DGQA), which leverage information from the norms of the +key heads to inform query allocation. Specifically, KDGQA looks at the ratios +of the norms of the key heads during each forward pass, while DGQA examines the +ratios of the norms as they evolve through training. Additionally, we present +Perturbed GQA (PGQA) as a case-study, which introduces variability in (static) +group formation via subtracting noise from the attention maps. Our experiments +with up-trained Vision Transformers, for Image Classification on datasets such +as CIFAR-10, CIFAR-100, Food101, and Tiny ImageNet, demonstrate the promise of +these variants in improving upon the original GQA through more informed and +adaptive grouping mechanisms: specifically ViT-L experiences accuracy gains of +up to 8% when utilizing DGQA in comparison to GQA and other variants. We +further analyze the impact of the number of Key-Value Heads on performance, +underscoring the importance of utilizing query-key affinities. Code is +available on GitHub. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
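One plausible reading of the key-norm-driven grouping is sketched below; the proportional-allocation rule and the rounding fix-up are assumptions, not the authors' code. Query heads are distributed over key-value heads according to the mean norms of the key heads rather than uniformly.

# Allocate query heads to KV groups in proportion to key-head norms.
import torch

def norm_driven_allocation(key_heads, num_query_heads):
    """key_heads: (num_kv_heads, seq, dim). Returns how many query heads each
    KV head receives, proportional to its mean key norm (at least one each)."""
    norms = key_heads.norm(dim=-1).mean(dim=-1)          # (num_kv_heads,)
    ratios = norms / norms.sum()
    alloc = torch.clamp((ratios * num_query_heads).round().long(), min=1)
    # Fix rounding so the allocations sum exactly to num_query_heads.
    while alloc.sum() > num_query_heads:
        alloc[alloc.argmax()] -= 1
    while alloc.sum() < num_query_heads:
        alloc[alloc.argmin()] += 1
    return alloc

keys = torch.randn(4, 128, 64)                           # 4 KV heads
print(norm_driven_allocation(keys, num_query_heads=12))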
+ + ♻ ☆ Symplectic Bregman divergences + + +
+ We present a generalization of Bregman divergences in symplectic vector +spaces that we term symplectic Bregman divergences. Symplectic Bregman +divergences are derived from a symplectic generalization of the Fenchel-Young +inequality which relies on the notion of symplectic subdifferentials. The +symplectic Fenchel-Young inequality is obtained using the symplectic Fenchel +transform which is defined with respect to the symplectic form. Since +symplectic forms can be generically built from pairings of dual systems, we get +a generalization of Bregman divergences in dual systems obtained by equivalent +symplectic Bregman divergences. In particular, when the symplectic form is +derived from an inner product, we show that the corresponding symplectic +Bregman divergences amount to ordinary Bregman divergences with respect to +composite inner products. Some potential applications of symplectic divergences +in geometric mechanics, information geometry, and learning dynamics in machine +learning are touched upon. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
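For reference, the classical objects being generalized here are the Bregman divergence and the Fenchel-Young gap; the symplectic versions in the paper replace the inner-product pairing with a symplectic form, which is not reproduced in this note.

\[
  B_F(p \,\|\, q) \;=\; F(p) - F(q) - \langle \nabla F(q),\, p - q \rangle
  \;=\; F(p) + F^{*}\!\big(\nabla F(q)\big) - \langle p,\, \nabla F(q) \rangle \;\ge\; 0,
\]
where $F$ is strictly convex and differentiable, $F^{*}(y) = \sup_x \{\langle x, y\rangle - F(x)\}$ is its convex conjugate, and equality in the Fenchel-Young inequality $F(x) + F^{*}(y) \ge \langle x, y\rangle$ holds exactly when $y = \nabla F(x)$.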
+ + ♻ ☆ AnomalyLLM: Few-shot Anomaly Edge Detection for Dynamic Graphs using + Large Language Models + + +
+ Anomaly edge detection for dynamic graphs aims to identify edges that
+deviate significantly from the normal pattern, and can be applied in various
+domains such as cybersecurity, financial transactions, and AIOps. As time
+evolves, new types of anomaly edges keep emerging, and only a few labeled
+anomaly samples are available for each type. Current methods are either
+designed to detect randomly inserted edges or require sufficient labeled data
+for model training, which limits their applicability to real-world scenarios.
+In this paper, we study this problem by leveraging the rich knowledge encoded
+in large language models (LLMs) and propose a method named AnomalyLLM. To
+align the dynamic graph with LLMs, AnomalyLLM pre-trains a dynamic-aware
+encoder to generate edge representations and reprograms the edges using
+prototypes of word embeddings. Along with the encoder, we design an in-context
+learning framework that integrates the information of a few labeled samples to
+achieve few-shot anomaly detection. Experiments on four datasets reveal that
+AnomalyLLM not only significantly improves few-shot anomaly detection
+performance, but also achieves superior results on new anomalies without any
+update of the model parameters.
+
+
+
+ comment: 13pages +
+
+
+
+
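A sketch of one way the "reprogramming with word-embedding prototypes" step could look; this is an interpretation only, and the dimensions, the learned projection, and the attention-style pooling are assumptions rather than the released implementation.

# Express an edge embedding as an attention-weighted combination of a small
# bank of word-embedding prototypes, so it lives in the LLM's input space.
import torch
import torch.nn as nn
import torch.nn.functional as F

class EdgeReprogrammer(nn.Module):
    def __init__(self, d_edge, d_llm, num_prototypes):
        super().__init__()
        self.proj = nn.Linear(d_edge, d_llm)
        # In practice the prototypes would be selected or derived from the
        # LLM's own token embedding matrix; random here for illustration.
        self.prototypes = nn.Parameter(torch.randn(num_prototypes, d_llm))

    def forward(self, edge_emb):                      # (B, d_edge) -> (B, d_llm)
        scores = self.proj(edge_emb) @ self.prototypes.T
        weights = F.softmax(scores, dim=-1)           # attention over prototypes
        return weights @ self.prototypes

rep = EdgeReprogrammer(d_edge=128, d_llm=768, num_prototypes=32)
print(rep(torch.randn(4, 128)).shape)                 # torch.Size([4, 768])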
+ + ♻ ☆ ViIK: Flow-based Vision Inverse Kinematics Solver with Fusing Collision + Checking + + +
+ Inverse Kinematics (IK) aims to find robot configurations that satisfy a
+target pose of the end effector. In motion planning, diverse configurations
+are required in case a feasible trajectory is not found. Meanwhile, collision
+checking (CC), e.g. with Oriented Bounding Boxes (OBB), Discrete Oriented
+Polytopes (DOP), or Quickhull \cite{quickhull}, needs to be done for each
+configuration provided by the IK solver to ensure that every goal
+configuration for motion planning is available. This means the classical IK
+solver and the CC algorithm must be executed repeatedly for every
+configuration. Thus, the preparation time is long when the required number of
+goal configurations is large, e.g. for motion planning in cluttered
+environments. Moreover, classical collision-checking algorithms require
+structured maps, which might be difficult to obtain. To sidestep these two
+issues, we propose a flow-based vision method that outputs diverse available
+configurations by fusing inverse kinematics and collision checking, named the
+Vision Inverse Kinematics solver (ViIK). ViIK uses RGB images as its
+perception of the environment. ViIK can output 1000 configurations within 40
+ms, with an accuracy of about 3 millimeters and 1.5 degrees. Higher accuracy
+can be obtained by refining the output with the classical IK solver within a
+few iterations. The self-collision rate can be lower than 2%, and the
+collision-with-environment rate can be lower than 10% in most scenes. The code
+is available at: https://github.com/AdamQLMeng/ViIK.
+
+
+
+
+
+
+ + ♻ ☆ Physics-Informed Neural Network for Concrete Manufacturing Process + Optimization + + +
+ Concrete manufacturing projects are among the most common ones for
+consulting agencies. Because of the highly non-linear dependency between
+input materials like ash, water, cement, and superplastic and the resultant
+strength of the concrete, it is difficult for machine learning models to
+successfully capture this relation and perform cost optimization. This paper
+highlights how PINNs (Physics-Informed Neural Networks) can be useful in this
+situation. This state-of-the-art model is also compared with traditional
+models like Linear Regression, Random Forest, Gradient Boosting, and a Deep
+Neural Network. The results highlight how well PINNs perform even with a
+reduced dataset, thus addressing one of the biggest issues for ML models:
+limited data availability. On average, the PINN reduced the loss value by
+26.3% even with 40% less data compared to the Deep Neural Network. In addition
+to predicting the strength of the concrete given the quantities of raw
+materials, the paper also highlights the use of a heuristic optimization
+method, Particle Swarm Optimization (PSO), to predict the quantities of raw
+materials required to manufacture concrete of a given strength at the least
+cost.
+
+
+
+
+
+
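To illustrate the optimization step, here is a small, self-contained particle swarm optimization loop that searches for a cheap mix whose predicted strength meets a target. The strength predictor is a placeholder standing in for the trained model, and all prices, bounds, and PSO constants below are made up for the sketch.

# Toy PSO: minimize material cost subject to a predicted-strength target
# enforced as a penalty term.
import numpy as np

rng = np.random.default_rng(0)

def predicted_strength(mix):        # placeholder for the trained predictor
    return 10.0 * np.sqrt(mix[..., 0]) + 5.0 * mix[..., 1]     # e.g. cement, ash

def cost(mix):
    return 0.12 * mix[..., 0] + 0.03 * mix[..., 1]              # toy unit prices

def pso(target_strength=60.0, n_particles=30, iters=100, bounds=(0.0, 500.0)):
    dim = 2
    x = rng.uniform(*bounds, size=(n_particles, dim))
    v = np.zeros_like(x)
    def objective(m):                # cost plus penalty if predicted too weak
        return cost(m) + 100.0 * np.maximum(0.0, target_strength - predicted_strength(m))
    pbest, pbest_val = x.copy(), objective(x)
    gbest = pbest[pbest_val.argmin()]
    for _ in range(iters):
        r1, r2 = rng.random((2, n_particles, dim))
        v = 0.7 * v + 1.5 * r1 * (pbest - x) + 1.5 * r2 * (gbest - x)
        x = np.clip(x + v, *bounds)
        val = objective(x)
        improved = val < pbest_val
        pbest[improved], pbest_val[improved] = x[improved], val[improved]
        gbest = pbest[pbest_val.argmin()]
    return gbest

print(pso())                         # cheapest mix found that meets the target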
+ + ♻ ☆ Procedural Adherence and Interpretability Through Neuro-Symbolic + Generative Agents + + +
+ The surge in popularity of large language models (LLMs) has opened doors for +new approaches to the creation of interactive agents. However, managing and +interpreting the temporal behavior of such agents over the course of a +potentially infinite interaction remain challenging. The stateful, long-term +horizon reasoning required for coherent agent behavior does not fit well into +the LLM paradigm. We propose a combination of formal logic-based program +synthesis and LLM content generation to bring guarantees of procedural +adherence and interpretability to generative agent behavior. To illustrate the +benefit of procedural adherence and interpretability, we use Temporal Stream +Logic (TSL) to generate an automaton that enforces an interpretable, high-level +temporal structure on an agent. With the automaton tracking the context of the +interaction and making decisions to guide the conversation accordingly, we can +drive content generation in a way that allows the LLM to focus on a shorter +context window. We evaluated our approach on different tasks involved in +creating an interactive agent specialized for generating +choose-your-own-adventure games. We found that over all of the tasks, an +automaton-enhanced agent with procedural guarantees achieves at least 96% +adherence to its temporal constraints, whereas a purely LLM-based agent +demonstrates as low as 14.67% adherence. + +
+
+ comment: 11 pages +
+
+
+
+
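The "automaton guides, LLM fills in" pattern can be sketched with a hand-written state machine. The states, events, and prompts below are invented, and a real system would synthesize the automaton from a TSL specification rather than write it by hand.

# A tiny automaton that tracks interaction state and constrains what the
# content generator is asked to produce at each step.
TRANSITIONS = {
    ("intro", "player_ready"): "present_choice",
    ("present_choice", "choice_made"): "narrate_outcome",
    ("narrate_outcome", "story_continues"): "present_choice",
    ("narrate_outcome", "story_ends"): "epilogue",
}

PROMPTS = {
    "intro": "Write a short scene-setting introduction.",
    "present_choice": "Offer the player exactly two choices.",
    "narrate_outcome": "Narrate the consequence of the chosen option.",
    "epilogue": "Write a brief epilogue and end the story.",
}

def step(state, event):
    next_state = TRANSITIONS.get((state, event))
    if next_state is None:
        raise ValueError(f"event {event!r} not allowed in state {state!r}")
    return next_state

def generate(state):                 # stand-in for an LLM call scoped by the state
    return f"[LLM OUTPUT for: {PROMPTS[state]}]"

state = "intro"
print(generate(state))
for event in ["player_ready", "choice_made", "story_ends"]:
    state = step(state, event)
    print(state, "->", generate(state))

Because disallowed events raise immediately, the temporal structure is enforced by construction, while the language model only ever handles one narrowly scoped prompt at a time.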
+ + ♻ ☆ Wireless Channel Aware Data Augmentation Methods for Deep Learning-Based + Indoor Localization + + +
+ Indoor localization is a challenging problem that - unlike outdoor
+localization - lacks a universal and robust solution. Machine Learning (ML),
+particularly Deep Learning (DL), methods have been investigated as a promising
+approach. Although such methods bring remarkable localization accuracy, they
+heavily depend on the training data collected from the environment. The data
+collection is usually a laborious and time-consuming task, but Data
+Augmentation (DA) can be used to alleviate this issue. In this paper,
+different from previously used DA, we propose methods that utilize the domain
+knowledge about wireless propagation channels and devices. The methods exploit
+the typical hardware component drift in the transceivers and/or the
+statistical behavior of the channel, in combination with the measured Power
+Delay Profile (PDP). We comprehensively evaluate the proposed methods to
+demonstrate their effectiveness. This investigation mainly focuses on how
+factors such as the number of measurements, the augmentation proportion, and
+the environment of interest affect the effectiveness of the different DA
+methods. We show that in the low-data regime (few actual measurements
+available), localization accuracy increases by up to 50%, matching
+non-augmented results in the high-data regime. In addition, the proposed
+methods may outperform the measurement-only high-data performance by up to 33%
+using only 1/4 of the amount of measured data. We also examine the effect of
+the training data distribution and quality on the effectiveness of DA.
+Finally, we demonstrate the power of the proposed methods when employed along
+with Transfer Learning (TL) to address data scarcity in the target and/or
+source environments.
+
+
+
+ comment: 13 pages, 14 figures +
+
+
+
+
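In the spirit of the abstract above, here are two channel-aware augmentations of a measured power delay profile (PDP), sketched with invented parameters: a small per-tap gain drift mimicking transceiver hardware variation, and tap-wise Rayleigh fading that preserves each tap's mean power. The exact augmentations proposed in the paper may differ.

# Channel-aware PDP augmentation sketch.
import numpy as np

rng = np.random.default_rng(1)

def augment_gain_drift(pdp_db, drift_std_db=0.5):
    # Simulate slow hardware gain drift as a small dB offset per tap.
    return pdp_db + rng.normal(0.0, drift_std_db, size=pdp_db.shape)

def augment_rayleigh_fading(pdp_db):
    # Re-draw each tap's amplitude from a Rayleigh distribution whose mean
    # power matches the measured tap power (E[|a|^2] = 2*sigma^2).
    power = 10.0 ** (pdp_db / 10.0)
    sigma = np.sqrt(power / 2.0)
    amp = np.abs(sigma * (rng.normal(size=pdp_db.shape) + 1j * rng.normal(size=pdp_db.shape)))
    return 10.0 * np.log10(amp ** 2 + 1e-12)

measured_pdp_db = np.array([-60.0, -65.0, -72.0, -80.0])
print(augment_gain_drift(measured_pdp_db))
print(augment_rayleigh_fading(measured_pdp_db))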
+ + ♻ ☆ Lipschitz-regularized gradient flows and generative particle algorithms + for high-dimensional scarce data + + +
+ We build a new class of generative algorithms capable of efficiently
+learning an arbitrary target distribution from possibly scarce,
+high-dimensional data and subsequently generating new samples. These
+generative algorithms are particle-based and are constructed as gradient flows
+of Lipschitz-regularized Kullback-Leibler or other $f$-divergences, where data
+from a source distribution can be stably transported as particles toward the
+vicinity of the target distribution. As a highlighted result in data
+integration, we demonstrate that the proposed algorithms correctly transport
+gene expression data points with dimension exceeding 54K, while the sample
+size is typically only in the hundreds.
+
+
+
+
+
+
+ + ♻ ☆ Decentralized Online Learning for Random Inverse Problems Over Graphs + + +
+ We propose a decentralized online learning algorithm for distributed random
+inverse problems over network graphs with online measurements, which unifies
+distributed parameter estimation in Hilbert spaces and the least mean square
+problem in reproducing kernel Hilbert spaces (RKHS-LMS). We transform the
+convergence of the algorithm into the asymptotic stability of a class of
+inhomogeneous random difference equations in Hilbert spaces with
+$L_{2}$-bounded martingale difference terms and develop the $L_2$-asymptotic
+stability theory in Hilbert spaces. We show that if the network graph is
+connected and the sequence of forward operators satisfies the
+infinite-dimensional spatio-temporal persistence of excitation condition, then
+the estimates of all nodes are mean square and almost surely strongly
+consistent. Moreover, we propose a decentralized online learning algorithm in
+RKHS based on non-stationary online data streams, and prove that the algorithm
+is mean square and almost surely strongly consistent if the operators induced
+by the random input data satisfy the infinite-dimensional spatio-temporal
+persistence of excitation condition.
+
+
+
+
+
+
+ + ♻ ☆ Quantum-machine-assisted Drug Discovery: Survey and Perspective + + +
+ Drug discovery and development is a highly complex and costly endeavor, +typically requiring over a decade and substantial financial investment to bring +a new drug to market. Traditional computer-aided drug design (CADD) has made +significant progress in accelerating this process, but the development of +quantum computing offers potential due to its unique capabilities. This paper +discusses the integration of quantum computing into drug discovery and +development, focusing on how quantum technologies might accelerate and enhance +various stages of the drug development cycle. Specifically, we explore the +application of quantum computing in addressing challenges related to drug +discovery, such as molecular simulation and the prediction of drug-target +interactions, as well as the optimization of clinical trial outcomes. By +leveraging the inherent capabilities of quantum computing, we might be able to +reduce the time and cost associated with bringing new drugs to market, +ultimately benefiting public health. + +
+
+ comment: 27 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Heat Death of Generative Models in Closed-Loop Learning + + +
+ Improvement and adoption of generative machine learning models is rapidly +accelerating, as exemplified by the popularity of LLMs (Large Language Models) +for text, and diffusion models for image generation. As generative models +become widespread, data they generate is incorporated into shared content +through the public web. This opens the question of what happens when data +generated by a model is fed back to the model in subsequent training campaigns. +This is a question about the stability of the training process, whether the +distribution of publicly accessible content, which we refer to as "knowledge", +remains stable or collapses. + Small scale empirical experiments reported in the literature show that this +closed-loop training process is prone to degenerating. Models may start +producing gibberish data, or sample from only a small subset of the desired +data distribution (a phenomenon referred to as mode collapse). So far there has +been only limited theoretical understanding of this process, in part due to the +complexity of the deep networks underlying these generative models. + The aim of this paper is to provide insights into this process (that we refer +to as "generative closed-loop learning") by studying the learning dynamics of +generative models that are fed back their own produced content in addition to +their original training dataset. The sampling of many of these models can be +controlled via a "temperature" parameter. Using dynamical systems tools, we +show that, unless a sufficient amount of external data is introduced at each +iteration, any non-trivial temperature leads the model to asymptotically +degenerate. In fact, either the generative distribution collapses to a small +set of outputs or becomes uniform over a large set of outputs. + +
+
+
+
+
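The degeneration mechanism can be reproduced on a toy categorical "model": each generation refits the distribution to samples drawn from its own temperature-sharpened output, optionally mixed with fresh external data. The sketch below is illustrative only; the paper's dynamical-systems analysis is far more general, and all constants here are made up.

# Closed-loop retraining of a toy categorical model. A temperature below 1
# sharpens the sampling distribution (above 1 would flatten it); without
# enough external data the estimate drifts away from the true distribution.
import numpy as np

rng = np.random.default_rng(0)

def closed_loop(p_true, temperature=0.8, external_frac=0.0, n_samples=500, generations=50):
    p = p_true.copy()
    for _ in range(generations):
        logits = np.log(p + 1e-12) / temperature
        q = np.exp(logits) / np.exp(logits).sum()
        synthetic = rng.choice(len(p), size=n_samples, p=q)
        real = rng.choice(len(p), size=int(external_frac * n_samples), p=p_true)
        data = np.concatenate([synthetic, real])
        p = np.bincount(data, minlength=len(p)) / len(data)   # "retrain" on the mix
    return p

p_true = np.ones(10) / 10
print(np.round(closed_loop(p_true, external_frac=0.0), 3))    # typically collapses onto few categories
print(np.round(closed_loop(p_true, external_frac=0.5), 3))    # fresh data keeps it near uniform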
+ + ♻ ☆ Graphical vs. Deep Generative Models: Measuring the Impact of + Differentially Private Mechanisms and Budgets on Utility CCS 2024 + + +
+ Generative models trained with Differential Privacy (DP) can produce +synthetic data while reducing privacy risks. However, navigating their +privacy-utility tradeoffs makes finding the best models for specific +settings/tasks challenging. This paper bridges this gap by profiling how DP +generative models for tabular data distribute privacy budgets across rows and +columns, which is one of the primary sources of utility degradation. We compare +graphical and deep generative models, focusing on the key factors contributing +to how privacy budgets are spent, i.e., underlying modeling techniques, DP +mechanisms, and data dimensionality. + Through our measurement study, we shed light on the characteristics that make +different models suitable for various settings and tasks. For instance, we find +that graphical models distribute privacy budgets horizontally and thus cannot +handle relatively wide datasets for a fixed training time; also, the +performance on the task they were optimized for monotonically increases with +more data but could also overfit. Deep generative models spend their budgets +per iteration, so their behavior is less predictable with varying dataset +dimensions, but are more flexible as they could perform better if trained on +more features. Moreover, low levels of privacy ($\epsilon\geq100$) could help +some models generalize, achieving better results than without applying DP. We +believe our work will aid the deployment of DP synthetic data techniques by +navigating through the best candidate models vis-a-vis the dataset features, +desired privacy levels, and downstream tasks. + +
+
+ comment: A shorter version of this paper appears in the Proceedings of the + 31st ACM Conference on Computer and Communications Security (ACM CCS 2024). + This is the full version +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Kangaroo: A Powerful Video-Language Model Supporting Long-context Video + Input + + +
+ Rapid advancements have been made in extending Large Language Models (LLMs)
+to Large Multi-modal Models (LMMs). However, extending the input modality of
+LLMs to video data remains a challenging endeavor, especially for long videos.
+Due to insufficient access to large-scale, high-quality video data and the
+excessive compression of visual features, current methods exhibit limitations
+in effectively processing long videos. In this paper, we introduce Kangaroo, a
+powerful Video LMM aimed at addressing these challenges. Confronted with the
+issue of inadequate training data, we develop a data curation system to build
+a large-scale dataset with high-quality annotations for vision-language
+pre-training and instruction tuning. In addition, we design a curriculum
+training pipeline that gradually increases the resolution and number of input
+frames to accommodate long videos. Evaluation results demonstrate that, with
+8B parameters, Kangaroo achieves state-of-the-art performance on a variety of
+video understanding benchmarks while exhibiting competitive results on others.
+In particular, on benchmarks specialized for long videos, Kangaroo surpasses
+some larger models with over 10B parameters as well as proprietary models.
+
+
+
+
+
+
+ + ☆ A Simple Baseline with Single-encoder for Referring Image Segmentation + + +
+ Referring image segmentation (RIS) requires dense vision-language
+interactions between visual pixels and textual words to segment objects based
+on a given description. However, the dual encoders commonly adopted in RIS,
+e.g., Swin transformer and BERT (uni-modal encoders) or CLIP (a multi-modal
+dual-encoder), lack dense multi-modal interactions during pre-training,
+leading to a gap with the pixel-level RIS task. To bridge this gap, existing
+RIS methods often rely on multi-modal fusion modules that let the two encoders
+interact, but this approach leads to high computational costs. In this paper,
+we present a novel RIS method with a single encoder, i.e., BEiT-3, maximizing
+the potential of shared self-attention across all framework components. This
+enables seamless interactions of the two modalities from input to final
+prediction, producing granularly aligned multi-modal features. Furthermore, we
+propose lightweight yet effective decoder modules, a Shared FPN and a Shared
+Mask Decoder, which contribute to the high efficiency of our model. Our simple
+baseline with a single encoder achieves outstanding performance on the RIS
+benchmark datasets while maintaining computational efficiency, compared to the
+most recent SoTA methods based on dual encoders.
+
+
+
+ comment: ArXiv pre-print +
+
+
+
+
+ + ☆ Hand1000: Generating Realistic Hands from Text with Only 1,000 Images + + +
+ Text-to-image generation models have achieved remarkable advancements in +recent years, aiming to produce realistic images from textual descriptions. +However, these models often struggle with generating anatomically accurate +representations of human hands. The resulting images frequently exhibit issues +such as incorrect numbers of fingers, unnatural twisting or interlacing of +fingers, or blurred and indistinct hands. These issues stem from the inherent +complexity of hand structures and the difficulty in aligning textual +descriptions with precise visual depictions of hands. To address these +challenges, we propose a novel approach named Hand1000 that enables the +generation of realistic hand images with target gesture using only 1,000 +training samples. The training of Hand1000 is divided into three stages with +the first stage aiming to enhance the model's understanding of hand anatomy by +using a pre-trained hand gesture recognition model to extract gesture +representation. The second stage further optimizes text embedding by +incorporating the extracted hand gesture representation, to improve alignment +between the textual descriptions and the generated hand images. The third stage +utilizes the optimized embedding to fine-tune the Stable Diffusion model to +generate realistic hand images. In addition, we construct the first publicly +available dataset specifically designed for text-to-hand image generation. +Based on the existing hand gesture recognition dataset, we adopt advanced image +captioning models and LLaMA3 to generate high-quality textual descriptions +enriched with detailed gesture information. Extensive experiments demonstrate +that Hand1000 significantly outperforms existing models in producing +anatomically correct hand images while faithfully representing other details in +the text, such as faces, clothing, and colors. + +
+
+ comment: Project page https://haozhuo-zhang.github.io/Hand1000-project-page/ +
+
+
+
+
+ + ☆ SVDD 2024: The Inaugural Singing Voice Deepfake Detection Challenge + + +
+ With the advancements in singing voice generation and the growing presence of +AI singers on media platforms, the inaugural Singing Voice Deepfake Detection +(SVDD) Challenge aims to advance research in identifying AI-generated singing +voices from authentic singers. This challenge features two tracks: a controlled +setting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). The +CtrSVDD track utilizes publicly available singing vocal data to generate +deepfakes using state-of-the-art singing voice synthesis and conversion +systems. Meanwhile, the WildSVDD track expands upon the existing SingFake +dataset, which includes data sourced from popular user-generated content +websites. For the CtrSVDD track, we received submissions from 47 teams, with 37 +surpassing our baselines and the top team achieving a 1.65% equal error rate. +For the WildSVDD track, we benchmarked the baselines. This paper reviews these +results, discusses key findings, and outlines future directions for SVDD +research. + +
+
+
+
+
+ + ♻ ☆ Proceedings of The second international workshop on eXplainable AI for + the Arts (XAIxArts) + + +
+ This second international workshop on explainable AI for the Arts (XAIxArts) +brought together a community of researchers in HCI, Interaction Design, AI, +explainable AI (XAI), and digital arts to explore the role of XAI for the Arts. +Workshop held at the 16th ACM Conference on Creativity and Cognition (C&C +2024), Chicago, USA. + +
+
+ comment: Proceedings of The second international workshop on eXplainable AI + for the Arts (XAIxArts) +
+
+
+
+
+ + ♻ ☆ MambaGesture: Enhancing Co-Speech Gesture Generation with Mamba and + Disentangled Multi-Modality Fusion ACM MM 2024 + + +
+ Co-speech gesture generation is crucial for producing synchronized and +realistic human gestures that accompany speech, enhancing the animation of +lifelike avatars in virtual environments. While diffusion models have shown +impressive capabilities, current approaches often overlook a wide range of +modalities and their interactions, resulting in less dynamic and contextually +varied gestures. To address these challenges, we present MambaGesture, a novel +framework integrating a Mamba-based attention block, MambaAttn, with a +multi-modality feature fusion module, SEAD. The MambaAttn block combines the +sequential data processing strengths of the Mamba model with the contextual +richness of attention mechanisms, enhancing the temporal coherence of generated +gestures. SEAD adeptly fuses audio, text, style, and emotion modalities, +employing disentanglement to deepen the fusion process and yield gestures with +greater realism and diversity. Our approach, rigorously evaluated on the +multi-modal BEAT dataset, demonstrates significant improvements in Fr\'echet +Gesture Distance (FGD), diversity scores, and beat alignment, achieving +state-of-the-art performance in co-speech gesture generation. Project website: +$\href{https://fcchit.github.io/mambagesture/}{\textit{https://fcchit.github.io/mambagesture/}}$. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and + Results + + +
+ Video quality assessment (VQA) is a crucial task in the development of video
+compression standards, as it directly impacts the viewer experience. This
+paper presents the results of the Compressed Video Quality Assessment
+challenge, held in conjunction with the Advances in Image Manipulation (AIM)
+workshop at ECCV 2024. The challenge aimed to evaluate the performance of VQA
+methods on a diverse dataset of 459 videos, encoded with 14 codecs of various
+compression standards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and
+containing a comprehensive collection of compression artifacts. To measure the
+methods' performance, we employed traditional correlation coefficients between
+their predictions and subjective scores, which were collected via large-scale
+crowdsourced pairwise human comparisons. For training purposes, participants
+were provided with the Compressed Video Quality Assessment Dataset (CVQAD), a
+previously developed dataset of 1022 videos. Up to 30 participating teams
+registered for the challenge, and we report the results of the 6 teams that
+submitted valid final solutions and code for reproducing the results.
+Moreover, we calculated and present the performance of state-of-the-art VQA
+methods on the developed dataset, providing a comprehensive benchmark for
+future research. The dataset, results, and online leaderboard are publicly
+available at
+https://challenges.videoprocessing.ai/challenges/compressedvideo-quality-assessment.html.
+
+
+
+
+
+
+
+
+ + + +
+
+ +
+
+
+

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all paper entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch between light and dark themes */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across page loads
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across page loads
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`